doris-2.0.2升级doris-2.1.6 BE节点启动时crash

Viewed 43

从2.0.2升级到2.1.6,选择一台节点做灰度升级,启动后crash,系统日志打印"abrt-hook-cpp: Process killed by SIGSEGV - dumping core"。使用ulimit -c unlimited -n 65536 && sh start_be.sh --daemon方式启动,约两分钟后crash,输出的core文件有27GB。
be.out打印的query_id对应的是一个很普通的内表查询,多次启动都会crash,且每次query_id对应的查询语句不同。
be.out如下

StdoutLogger 2024-12-25 18:17:10,237 Start time: Wed Dec 25 18:17:10 CST 2024
INFO: java_cmd /usr/java/latest/bin/java
INFO: jdk_version 8
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/tiankx/doris-2.1.6/be/lib/java_extensions/preload-extensions/preload-extensions-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/tiankx/doris-2.1.6/be/lib/java_extensions/java-udf/java-udf-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/tiankx/doris-2.1.6/be/lib/hadoop_hdfs/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
*** Query id: bd77ff401989437a-8550d5bdd0d14507 ***
*** is nereids: 1 ***
*** tablet id: 0 ***
*** Aborted at 1735121953 (unix time) try "date -d @1735121953" if you are using GNU date ***
*** Current BE git commitID: 653e315ba5 ***
*** SIGSEGV address not mapped to object (@0x8) received by PID 23778 (TID 24195 OR 0x7f586cdf3700) from PID 8; stack trace: ***
 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /home/zcp/repo_center/doris_release/doris/be/src/common/signal_handler.h:421
 1# os::Linux::chained_handler(int, siginfo*, void*) in /usr/java/latest/jre/lib/amd64/server/libjvm.so
 2# JVM_handle_linux_signal in /usr/java/latest/jre/lib/amd64/server/libjvm.so
 3# signalHandler(int, siginfo*, void*) in /usr/java/latest/jre/lib/amd64/server/libjvm.so
 4# 0x00007F61839B4400 in /lib64/libc.so.6
 5# __memset_sse2 in /lib64/libc.so.6
 6# doris::vectorized::PODArray<unsigned char, 4096ul, Allocator<false, false, false, DefaultMemoryAllocator>, 15ul, 16ul>::resize_fill(unsigned long, unsigned char const&) at /home/zcp/repo_center/doris_release/doris/be/src/vec/common/pod_array.h:377
 7# doris::vectorized::ColumnFilterHelper::resize_fill(unsigned long, unsigned char) at /home/zcp/repo_center/doris_release/doris/be/src/vec/columns/column_filter_helper.cpp:28
 8# doris::vectorized::VNestedLoopJoinNode::_append_left_data_with_null(doris::vectorized::Block&) const in /home/tiankx/doris-2.1.6/be/lib/doris_be
 9# doris::Status doris::vectorized::VNestedLoopJoinNode::_generate_join_block_data<std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)5>, false, false>(doris::RuntimeState*, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)5>&) in /home/tiankx/doris-2.1.6/be/lib/doris_be
10# std::__detail::__variant::__gen_vtable_impl<std::__detail::__variant::_Multi_array<std::__detail::__variant::__deduce_visit_result<doris::Status> (*)(doris::vectorized::VNestedLoopJoinNode::push(doris::RuntimeState*, doris::vectorized::Block*, bool)::$_0&, std::variant<std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)0>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)2>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)8>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)1>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)4>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)3>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)5>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)7>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)9>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)10>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)11> >&, std::variant<std::integral_constant<bool, false>, std::integral_constant<bool, true> >&&, std::variant<std::integral_constant<bool, false>, std::integral_constant<bool, true> >&&)>, std::integer_sequence<unsigned long, 6ul, 0ul, 0ul> >::__visit_invoke(doris::vectorized::VNestedLoopJoinNode::push(doris::RuntimeState*, doris::vectorized::Block*, bool)::$_0&, std::variant<std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)0>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)2>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)8>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)1>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)4>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)3>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)5>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)7>, 
std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)9>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)10>, std::integral_constant<doris::TJoinOp::type, (doris::TJoinOp::type)11> >&, std::variant<std::integral_constant<bool, false>, std::integral_constant<bool, true> >&&, std::variant<std::integral_constant<bool, false>, std::integral_constant<bool, true> >&&) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/variant:1013
11# doris::vectorized::VNestedLoopJoinNode::push(doris::RuntimeState*, doris::vectorized::Block*, bool) at /home/zcp/repo_center/doris_release/doris/be/src/vec/exec/join/vnested_loop_join_node.cpp:245
12# doris::pipeline::StatefulOperator<doris::vectorized::VNestedLoopJoinNode>::get_block(doris::RuntimeState*, doris::vectorized::Block*, doris::pipeline::SourceState&) at /home/zcp/repo_center/doris_release/doris/be/src/pipeline/exec/operator.h:431
13# doris::pipeline::StatefulOperator<doris::vectorized::HashJoinNode>::get_block(doris::RuntimeState*, doris::vectorized::Block*, doris::pipeline::SourceState&) at /home/zcp/repo_center/doris_release/doris/be/src/pipeline/exec/operator.h:425
14# doris::pipeline::PipelineTask::execute(bool*) at /home/zcp/repo_center/doris_release/doris/be/src/pipeline/pipeline_task.cpp:300
15# doris::pipeline::TaskScheduler::_do_work(unsigned long) at /home/zcp/repo_center/doris_release/doris/be/src/pipeline/task_scheduler.cpp:347
16# doris::ThreadPool::dispatch_thread() in /home/tiankx/doris-2.1.6/be/lib/doris_be
17# doris::Thread::supervise_thread(void*) at /home/zcp/repo_center/doris_release/doris/be/src/util/thread.cpp:499
18# start_thread in /lib64/libpthread.so.0
19# clone in /lib64/libc.so.6

2024-12-30更新

又尝试灰度升级一个节点到其他版本(2.0.3, 2.0.15, 2.1.5, 2.1.6, 2.1.7),一样的报错,be.out堆栈一致。
继续排查发现,BE crash都是由某几张表的特定查询语句稳定触发的,简化后的语句如下:

select 1
from my_db.my_tbl -- 特定的某几张表做这个查询才触发crash
where dt = 20241226
and (exists (select 1) or um in (select 'hello')) -- 须同时满足or两边分别是exists子查询与in子查询才会触发crash;子查询改成对具体表的查询,同样会触发
limit 10
;

-- 建表语句如下
-- 
-- 
-- 
CREATE TABLE my_db.my_tbl (
  `cust_code` varchar(20) NULL COMMENT '客户号',
  -- ....
  `um` varchar(100) NULL COMMENT 'UM',
  `rn_ss` int(11) NULL COMMENT 'KPI排名',
  `dt` bigint(20) NOT NULL COMMENT '数据更新时间'
) ENGINE=OLAP
DUPLICATE KEY(`cust_code`)
COMMENT 'ai_detail'
PARTITION BY RANGE(`dt`)
(
PARTITION P_20241226 VALUES [("20241226"), ("20241227")),
PARTITION P_20241227 VALUES [("20241227"), ("20241228")),
PARTITION P_20241228 VALUES [("20241228"), ("20241229")),
PARTITION P_20241229 VALUES [("20241229"), ("20241230")),
PARTITION P_20241230 VALUES [("20241230"), ("20241231")))
DISTRIBUTED BY HASH(`cust_code`) BUCKETS 32
PROPERTIES (
"replication_allocation" = "tag.location.default: 3",
"is_being_synced" = "false",
"colocate_with" = "cust_code",
"dynamic_partition.enable" = "true",
"dynamic_partition.time_unit" = "DAY",
"dynamic_partition.time_zone" = "Asia/Shanghai",
"dynamic_partition.start" = "-100",
"dynamic_partition.end" = "2",
"dynamic_partition.prefix" = "P_",
"dynamic_partition.replication_allocation" = "tag.location.default: 3",
"dynamic_partition.buckets" = "32",
"dynamic_partition.create_history_partition" = "true",
"dynamic_partition.history_partition_num" = "-1",
"dynamic_partition.hot_partition_num" = "2",
"dynamic_partition.reserved_history_periods" = "NULL",
"dynamic_partition.storage_policy" = "",
"dynamic_partition.storage_medium" = "HDD",
"storage_format" = "V2",
"disable_auto_compaction" = "false",
"enable_single_replica_compaction" = "false"
);

反复尝试了其他情形:

  1. 参考有问题的表创建新表后导入数据,同样的查询无法复现出BE crash;
  2. 通过SNAPSHOT BACKUP-RESTORE到其他的doris-2.1.6环境,同样的查询也无法复现出BE crash;
  3. 对会导致BE crash的表执行其他查询,未触发异常;
  4. 有问题的表及查询语句,在升级前的版本doris-2.0.2上一直稳定执行。

导致BE crash的表没有其他特殊内容(无物化视图、索引等),仅有一个colocate group。FE与BE无其他相关日志,从问题表现来看,很像是数据存储导致异常,检查表与tablet副本未发现异常。

1 Answers

已私信联系,建议先从2.0.2直接升级至2.1.7尝试。