并行执行多个sql,执行作业全部卡住不动

Viewed 106

部署在k8s上,采用helm chart方式部署,doris 版本 2.1.1

老师们好
我们在测试 Doris 的过程中发现:并行执行多个 SQL 时,所有作业全部卡住不动,同时客户端工具也无法连接上 Doris。猜测是资源不满足 Doris 的运行要求,或是某些配置不正确。

调度作业的执行方式是:

mysql -uroot -hdoris-db.bigdata.test.zrzkwlw.com -P9030 -e "insert into table select xxx from xxx "

测试过程中发现,并行度达到大约 6-10 个时,作业就会全部卡住。
查看Doris资源消耗,并未看到拉满的情况。我的环境资源分配如下:

  • be 3个实例,资源分别分配了4c 8g;
  • fe 3个实例,资源也分别分配了4c 8g

卡住期间be资源消耗情况
image.png
fe资源消耗情况
image.png

客户端连接也卡住了
image.png

请问老师们,看看是否我的配置有问题?

be yaml

# StatefulSet for the Doris BE (backend) pods of the
# doriscluster-helm-bigdata-test cluster: 3 replicas, each requesting and
# limited to 4 CPUs / 8Gi of memory, with per-pod storage and log volumes.
kind: StatefulSet
apiVersion: apps/v1
metadata:
  name: doriscluster-helm-bigdata-test-be
  namespace: bigdata-test
  labels:
    app.doris.ownerreference/name: doriscluster-helm-bigdata-test
    app.kubernetes.io/component: be
  annotations:
    app.doris.components/hash: '3505525361'
spec:
  replicas: 3
  selector:
    matchLabels:
      app.doris.ownerreference/name: doriscluster-helm-bigdata-test-be
      app.kubernetes.io/component: be
  template:
    metadata:
      name: doriscluster-helm-bigdata-test-be
      creationTimestamp: null
      labels:
        app.doris.cluster: doriscluster-helm-bigdata-test
        app.doris.ownerreference/name: doriscluster-helm-bigdata-test-be
        app.kubernetes.io/component: be
    spec:
      volumes:
        - name: be-storage
          persistentVolumeClaim:
            claimName: be-storage
        - name: be-log
          persistentVolumeClaim:
            claimName: be-log
        # Exposes the pod's own labels and annotations as files.
        - name: podinfo
          downwardAPI:
            items:
              - path: labels
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels
              - path: annotations
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.annotations
            defaultMode: 420
        - name: doriscluster-helm-be-configmap
          configMap:
            name: doriscluster-helm-be-configmap
            defaultMode: 420
      initContainers:
        # Runs before the BE container: sets vm.max_map_count and turns swap
        # off. Both commands require the privileged security context below.
        - name: default-init
          image: 'selectdb/alpine:latest'
          command:
            - /bin/sh
          args:
            - '-c'
            - sysctl -w vm.max_map_count=2000000 && swapoff -a
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
          securityContext:
            privileged: true
      containers:
        - name: be
          image: 'selectdb/doris.be-ubuntu:2.1.1'
          command:
            - /opt/apache-doris/be_entrypoint.sh
          # $(ENV_FE_ADDR) is expanded by Kubernetes from the env list below.
          args:
            - $(ENV_FE_ADDR)
          ports:
            - name: be-port
              containerPort: 9060
              protocol: TCP
            - name: webserver-port
              containerPort: 8040
              protocol: TCP
            - name: heartbeat-port
              containerPort: 9050
              protocol: TCP
            - name: brpc-port
              containerPort: 8060
              protocol: TCP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.name
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
            - name: HOST_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.hostIP
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.namespace
            - name: USER
              value: root
            - name: DORIS_ROOT
              value: /opt/apache-doris
            - name: CONFIGMAP_MOUNT_PATH
              value: /etc/doris
            - name: ENV_FE_ADDR
              value: doriscluster-helm-bigdata-test-fe-service
            - name: FE_QUERY_PORT
              value: '9030'
          # requests == limits for both CPU and memory.
          resources:
            limits:
              cpu: '4'
              memory: 8Gi
            requests:
              cpu: '4'
              memory: 8Gi
          volumeMounts:
            - name: podinfo
              mountPath: /etc/podinfo
            - name: be-storage
              mountPath: /opt/apache-doris/be/storage
            - name: be-log
              mountPath: /opt/apache-doris/be/log
            - name: doriscluster-helm-be-configmap
              mountPath: /etc/doris
          # Liveness and startup probes are TCP checks on the heartbeat port
          # (9050); readiness is an HTTP GET on the webserver port (8040).
          livenessProbe:
            tcpSocket:
              port: 9050
            initialDelaySeconds: 80
            timeoutSeconds: 180
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/health
              port: 8040
              scheme: HTTP
            timeoutSeconds: 1
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 3
          startupProbe:
            tcpSocket:
              port: 9050
            timeoutSeconds: 1
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 60
          lifecycle:
            preStop:
              exec:
                command:
                  - /opt/apache-doris/be_prestop.sh
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            # Soft preference: spread BE pods across different nodes.
            - weight: 20
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app.kubernetes.io/component
                      operator: In
                      values:
                        - be
                topologyKey: kubernetes.io/hostname
            # Soft preference: avoid nodes that already run an FE pod.
            # A podAffinityTerm labelSelector matches POD labels, so it must
            # use the component label that FE pods carry. The earlier selector
            # (kubernetes.io/hostname: fe) used a node label key that no pod
            # template in this file sets, so this term never matched.
            - weight: 80
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app.kubernetes.io/component: fe
                topologyKey: kubernetes.io/hostname
      schedulerName: default-scheduler
  volumeClaimTemplates:
    # One PVC per pod for BE data (mounted at /opt/apache-doris/be/storage).
    - kind: PersistentVolumeClaim
      apiVersion: v1
      metadata:
        name: be-storage
        creationTimestamp: null
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 30Gi
        storageClassName: be-volume-bigdata-test
        volumeMode: Filesystem
      status:
        phase: Pending
    # One PVC per pod for BE logs (mounted at /opt/apache-doris/be/log).
    - kind: PersistentVolumeClaim
      apiVersion: v1
      metadata:
        name: be-log
        creationTimestamp: null
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi
        storageClassName: be-log-volume-bigdata-test
        volumeMode: Filesystem
      status:
        phase: Pending
  serviceName: doriscluster-helm-bigdata-test-be-internal
  podManagementPolicy: Parallel
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      partition: 0
  revisionHistoryLimit: 5

fe yaml

# StatefulSet for the Doris FE (frontend) pods of the
# doriscluster-helm-bigdata-test cluster: 3 replicas, each requesting and
# limited to 4 CPUs / 8Gi of memory, with per-pod metadata and log volumes.
kind: StatefulSet
apiVersion: apps/v1
metadata:
  name: doriscluster-helm-bigdata-test-fe
  namespace: bigdata-test
  labels:
    app.doris.ownerreference/name: doriscluster-helm-bigdata-test
    app.kubernetes.io/component: fe
  annotations:
    app.doris.components/hash: '3362463793'
spec:
  replicas: 3
  selector:
    matchLabels:
      app.doris.ownerreference/name: doriscluster-helm-bigdata-test-fe
      app.kubernetes.io/component: fe
  template:
    metadata:
      name: doriscluster-helm-bigdata-test-fe
      creationTimestamp: null
      labels:
        app.doris.cluster: doriscluster-helm-bigdata-test
        app.doris.ownerreference/name: doriscluster-helm-bigdata-test-fe
        app.kubernetes.io/component: fe
    spec:
      volumes:
        - name: fe-meta
          persistentVolumeClaim:
            claimName: fe-meta
        - name: fe-log
          persistentVolumeClaim:
            claimName: fe-log
        # Exposes the pod's own labels and annotations as files.
        - name: podinfo
          downwardAPI:
            items:
              - path: labels
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels
              - path: annotations
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.annotations
            defaultMode: 420
        - name: doriscluster-helm-fe-configmap
          configMap:
            name: doriscluster-helm-fe-configmap
            defaultMode: 420
      containers:
        - name: fe
          image: 'selectdb/doris.fe-ubuntu:2.1.1'
          command:
            - /opt/apache-doris/fe_entrypoint.sh
          # $(ENV_FE_ADDR) is expanded by Kubernetes from the env list below.
          args:
            - $(ENV_FE_ADDR)
          ports:
            - name: http-port
              containerPort: 8030
              protocol: TCP
            - name: rpc-port
              containerPort: 9020
              protocol: TCP
            - name: query-port
              containerPort: 9030
              protocol: TCP
            - name: edit-log-port
              containerPort: 9010
              protocol: TCP
          env:
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.name
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
            - name: HOST_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.hostIP
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.namespace
            - name: USER
              value: root
            - name: DORIS_ROOT
              value: /opt/apache-doris
            - name: CONFIGMAP_MOUNT_PATH
              value: /etc/doris
            - name: ENV_FE_ADDR
              value: doriscluster-helm-bigdata-test-fe-service
            - name: FE_QUERY_PORT
              value: '9030'
            # NOTE(review): presumably the number of FE election members;
            # it equals spec.replicas here - confirm against the entrypoint.
            - name: ELECT_NUMBER
              value: '3'
          # requests == limits. The FE is a JVM process, so the heap size
          # (-Xmx in fe.conf) plus non-heap memory has to fit under this
          # memory limit.
          resources:
            limits:
              cpu: '4'
              memory: 8Gi
            requests:
              cpu: '4'
              memory: 8Gi
          volumeMounts:
            - name: podinfo
              mountPath: /etc/podinfo
            - name: fe-meta
              mountPath: /opt/apache-doris/fe/doris-meta
            - name: fe-log
              mountPath: /opt/apache-doris/fe/log
            - name: doriscluster-helm-fe-configmap
              mountPath: /etc/doris
          # Liveness and startup probes are TCP checks on the query port
          # (9030); readiness is an HTTP GET on the http port (8030).
          livenessProbe:
            tcpSocket:
              port: 9030
            initialDelaySeconds: 80
            timeoutSeconds: 180
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/health
              port: 8030
              scheme: HTTP
            timeoutSeconds: 1
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 3
          startupProbe:
            tcpSocket:
              port: 9030
            timeoutSeconds: 1
            periodSeconds: 5
            successThreshold: 1
            failureThreshold: 60
          lifecycle:
            preStop:
              exec:
                command:
                  - /opt/apache-doris/fe_prestop.sh
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            # Soft preference: spread FE pods across different nodes.
            - weight: 20
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app.kubernetes.io/component
                      operator: In
                      values:
                        - fe
                topologyKey: kubernetes.io/hostname
      schedulerName: default-scheduler
  volumeClaimTemplates:
    # One PVC per pod for FE metadata
    # (mounted at /opt/apache-doris/fe/doris-meta).
    - kind: PersistentVolumeClaim
      apiVersion: v1
      metadata:
        name: fe-meta
        creationTimestamp: null
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 30Gi
        storageClassName: fe-volume-bigdata-test
        volumeMode: Filesystem
      status:
        phase: Pending
    # One PVC per pod for FE logs (mounted at /opt/apache-doris/fe/log).
    - kind: PersistentVolumeClaim
      apiVersion: v1
      metadata:
        name: fe-log
        creationTimestamp: null
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 30Gi
        storageClassName: fe-log-volume-bigdata-test
        volumeMode: Filesystem
      status:
        phase: Pending
  serviceName: doriscluster-helm-bigdata-test-fe-internal
  podManagementPolicy: Parallel
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      partition: 0
  revisionHistoryLimit: 5

fe.conf

# Licensed to the Apache Software Foundation (ASF) under one
# Timestamp used as a suffix for the GC log file names below.
CUR_DATE=`date +%Y%m%d-%H%M%S`

# the output dir of stderr and stdout
LOG_DIR = ${DORIS_HOME}/log

# JVM options. The FE container in this deployment has an 8Gi memory limit,
# so the max heap (-Xmx) must stay well below 8Gi: thread stacks (-Xss4m per
# thread), metaspace, direct buffers and GC structures also count against the
# container limit. -Xmx8192m would let the heap alone use the whole limit, so
# it is set to 6144m here. Raise the container limit if a larger heap is needed.
JAVA_OPTS="-Djavax.security.auth.useSubjectCredsOnly=false -Xss4m -Xmx6144m -XX:+UseMembar -XX:SurvivorRatio=8 -XX:MaxTenuringThreshold=7 -XX:+PrintGCDateStamps -XX:+PrintGCDetails -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:+CMSClassUnloadingEnabled -XX:-CMSParallelRemarkEnabled -XX:CMSInitiatingOccupancyFraction=80 -XX:SoftRefLRUPolicyMSPerMB=0 -Xloggc:$DORIS_HOME/log/fe.gc.log.$CUR_DATE"

# For jdk 9+, this JAVA_OPTS will be used as default JVM options
# (same heap cap as above, for the same reason)
JAVA_OPTS_FOR_JDK_9="-Djavax.security.auth.useSubjectCredsOnly=false -Xss4m -Xmx6144m -XX:SurvivorRatio=8 -XX:MaxTenuringThreshold=7 -XX:+CMSClassUnloadingEnabled -XX:-CMSParallelRemarkEnabled -XX:CMSInitiatingOccupancyFraction=80 -XX:SoftRefLRUPolicyMSPerMB=0 -Xlog:gc*:$DORIS_HOME/log/fe.gc.log.$CUR_DATE:time"

##
## the lowercase properties are read by main program.
##

# INFO, WARN, ERROR, FATAL
sys_log_level = INFO

# NORMAL, BRIEF, ASYNC
sys_log_mode = NORMAL

# store metadata, must be created before start FE.
# Default value is ${DORIS_HOME}/doris-meta
meta_dir = ${DORIS_HOME}/doris-meta

# Default dirs to put jdbc drivers,default value is ${DORIS_HOME}/jdbc_drivers
# jdbc_drivers_dir = ${DORIS_HOME}/jdbc_drivers

# Ports; these match the containerPort entries of the FE StatefulSet.
http_port = 8030
rpc_port = 9020
query_port = 9030
edit_log_port = 9010

# Choose one if there are more than one ip except loopback address.
# Note that there should at most one ip match this list.
# If no ip match this rule, will choose one randomly.
# use CIDR format, e.g. 10.10.10.0/24 or IP format, e.g. 10.10.10.1
# Default value is empty.
# priority_networks = 10.10.10.0/24;192.168.0.0/16

# Advanced configurations
# log_roll_size_mb = 1024
# sys_log_dir = ${DORIS_HOME}/log
# sys_log_roll_num = 10
# sys_log_verbose_modules = org.apache.doris
# audit_log_dir = ${DORIS_HOME}/log
# audit_log_modules = slow_query, query
# audit_log_roll_num = 10
# meta_delay_toleration_second = 10
# qe_max_connection = 1024
# qe_query_timeout_second = 300
# qe_slow_log_ms = 5000
enable_fqdn_mode = true

be.conf

# Timestamp used as a suffix for the GC log file names in the JVM options below.
CUR_DATE=`date +%Y%m%d-%H%M%S`

# NOTE(review): presumably the directory where pprof writes temporary files - confirm.
PPROF_TMPDIR="$DORIS_HOME/log/"

# JVM options for the JVM used by the BE: 1024m max heap, JNI log at
# $DORIS_HOME/log/jni.log, GC log suffixed with CUR_DATE, and JDBC pool
# settings (min 1 / max 100 connections).
JAVA_OPTS="-Xmx1024m -DlogPath=$DORIS_HOME/log/jni.log -Xloggc:$DORIS_HOME/log/be.gc.log.$CUR_DATE -Djavax.security.auth.useSubjectCredsOnly=false -Dsun.java.command=DorisBE -XX:-CriticalJNINatives -DJDBC_MIN_POOL=1 -DJDBC_MAX_POOL=100 -DJDBC_MAX_IDLE_TIME=300000 -DJDBC_MAX_WAIT_TIME=5000"

# For jdk 9+, this JAVA_OPTS will be used as default JVM options
JAVA_OPTS_FOR_JDK_9="-Xmx1024m -DlogPath=$DORIS_HOME/log/jni.log -Xlog:gc:$DORIS_HOME/log/be.gc.log.$CUR_DATE -Djavax.security.auth.useSubjectCredsOnly=false -Dsun.java.command=DorisBE -XX:-CriticalJNINatives -DJDBC_MIN_POOL=1 -DJDBC_MAX_POOL=100 -DJDBC_MAX_IDLE_TIME=300000 -DJDBC_MAX_WAIT_TIME=5000"

# since 1.2, the JAVA_HOME need to be set to run BE process.
# JAVA_HOME=/path/to/jdk/

# https://github.com/apache/doris/blob/master/docs/zh-CN/community/developer-guide/debug-tool.md#jemalloc-heap-profile
# https://jemalloc.net/jemalloc.3.html
# jemalloc tuning string; heap profiling is switched off here (prof:false).
JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:15000,dirty_decay_ms:15000,oversize_threshold:0,lg_tcache_max:20,prof:false,lg_prof_interval:32,lg_prof_sample:19,prof_gdump:false,prof_accum:false,prof_leak:false,prof_final:false"
# NOTE(review): the name is spelled PRFIX (not PREFIX); keep it as-is unless
# the BE start script is confirmed to read a different name.
JEMALLOC_PROF_PRFIX=""

# INFO, WARNING, ERROR, FATAL
sys_log_level = INFO

# ports for admin, web, heartbeat service
# (these match the containerPort entries of the BE StatefulSet)
be_port = 9060
webserver_port = 8040
heartbeat_service_port = 9050
brpc_port = 8060

# HTTPS configures
enable_https = false
# path of certificate in PEM format.
ssl_certificate_path = "$DORIS_HOME/conf/cert.pem"
# path of private key in PEM format.
ssl_private_key_path = "$DORIS_HOME/conf/key.pem"

# enable auth check
enable_auth = false

# Choose one if there are more than one ip except loopback address.
# Note that there should at most one ip match this list.
# If no ip match this rule, will choose one randomly.
# use CIDR format, e.g. 10.10.10.0/24 or IP format, e.g. 10.10.10.1
# Default value is empty.
# priority_networks = 10.10.10.0/24;192.168.0.0/16

# data root path, separate by ';'
# you can specify the storage medium of each root path, HDD or SSD
# you can add capacity limit at the end of each root path, separate by ','
# eg:
# storage_root_path = /home/disk1/doris.HDD,50;/home/disk2/doris.SSD,1;/home/disk2/doris
# /home/disk1/doris.HDD, capacity limit is 50GB, HDD;
# /home/disk2/doris.SSD, capacity limit is 1GB, SSD;
# /home/disk2/doris, capacity limit is disk capacity, HDD(default)
#
# you also can specify the properties by setting '<property>:<value>', separate by ','
# property 'medium' has a higher priority than the extension of path
#
# Default value is ${DORIS_HOME}/storage, you should create it by hand.
storage_root_path = ${DORIS_HOME}/storage

# Default dirs to put jdbc drivers,default value is ${DORIS_HOME}/jdbc_drivers
# jdbc_drivers_dir = ${DORIS_HOME}/jdbc_drivers

# Advanced configurations
# sys_log_dir = ${DORIS_HOME}/log
# sys_log_roll_mode = SIZE-MB-1024
# sys_log_roll_num = 10
# sys_log_verbose_modules = *
# log_buffer_level = -1
# palo_cgroups
1 Answers

多个 SQL 具体是多少个?是不是连接数超限了?可以调大用户的最大连接数和 FE 的最大连接数试试:

用户最大连接数(默认 100),例如:SET PROPERTY FOR 'root' 'max_user_connections' = '1000';

FE 最大连接数:在 fe.conf 中设置 qe_max_connection(默认 1024),修改后需重启 FE 生效

如果还是不起作用,需要在 FE 卡住的时候打一个 jstack 贴上来,我们再分析一下。