【已解决】Doris 2.1 查询 AGGREGATE 模型表导致 BE 宕机

Viewed 92

表中只有 2 条数据，但查询该表会导致 BE 宕机。
建表语句

-- Reproducer table for the BE crash described in this report.
-- Despite the `_unique` suffix in its name, the table uses the AGGREGATE KEY
-- model: `id` is the only key column and every other column declares an
-- aggregation type (REPLACE / MIN / MAX / SUM).
CREATE TABLE `alarm_unique` (
 -- Sole key column (see AGGREGATE KEY and DISTRIBUTED BY below).
 `id` BIGINT NOT NULL,
 `w_id` BIGINT REPLACE NOT NULL, 
 `origin` VARCHAR(100) REPLACE NOT NULL, 
 `data_id` BIGINT REPLACE NOT NULL, 
 `data_name` VARCHAR(100) REPLACE NOT NULL, 
 -- MIN / MAX keep the earliest and latest trigger time across rows with the same `id`.
 `first_trigger_time` DATETIME MIN NOT NULL, 
 `trigger_time` DATETIME MAX NOT NULL, 
 `level` TINYINT MAX NOT NULL,
 `title` TEXT REPLACE NULL, 
 `content` TEXT REPLACE NOT NULL, 
 `cate` TINYINT REPLACE NOT NULL,
 `source` TINYINT REPLACE NOT NULL,
 `block_rule_id` BIGINT REPLACE NULL,
 `block_rule_name` VARCHAR(100) REPLACE NULL,
 `assign_rule_id` TEXT REPLACE NULL,
 `assign_rule_name` TEXT REPLACE NULL, 
 `is_blocked` TINYINT REPLACE NULL DEFAULT "0",
 `is_assigned` TINYINT MAX NULL DEFAULT "0",
 `notify_id` TEXT REPLACE NULL,
 `notify_name` TEXT REPLACE NULL,
 `agg` TINYINT REPLACE NULL DEFAULT "0",
 -- SUM accumulates `num` across rows with the same `id`.
 `num` INT SUM NULL DEFAULT "0",
 `state` TINYINT MAX NULL DEFAULT "0",
 -- NOTE(review): the answer at the end of this report attributes the crash to
 -- this VARIANT column being used in the aggregate model. The be.out stack
 -- trace aborts in ColumnObject::replace_column_data_default(), called from
 -- BlockReader::_copy_agg_data().
 -- Presumably VARIANT is not supported as an aggregated value column in this
 -- Doris build (2.1.0) -- TODO confirm against the Doris VARIANT documentation.
 `labels` VARIANT REPLACE NULL,
 `create_time` DATETIME REPLACE NULL, 
 `update_time` DATETIME REPLACE NULL, 
 `updater` VARCHAR(60) REPLACE NULL, 
-- Inverted index declared on the key column `id`.
INDEX idx_alarm (`id`) USING INVERTED COMMENT '' 
) 
ENGINE=OLAP 
AGGREGATE KEY(`id`) COMMENT 'OLAP' 
DISTRIBUTED BY HASH(`id`) BUCKETS 32 
PROPERTIES ( 
-- One replica, placed on backends tagged `default`.
"replication_allocation" = "tag.location.default: 1",
 "min_load_replica_num" = "-1", 
"is_being_synced" = "false", 
"storage_medium" = "hdd", 
"storage_format" = "V2", 
"light_schema_change" = "true", 
"disable_auto_compaction" = "false", 
"enable_single_replica_compaction" = "false", 
"group_commit_interval_ms" = "10000", 
"group_commit_data_bytes" = "134217728" );

be.info日志

I20240401 11:42:30.133008 18263 storage_engine.cpp:1110] collected 0 unused rowsets to remove, skipped 0 rowsets due to use count > 1, skipped 0 rowsets due to don't need to delete file, skipped 0 rowsets due to delayed expired timestamp.
I20240401 11:42:30.133016 18263 storage_engine.cpp:1126] removed all collected unused rowsets
I20240401 11:42:30.200465 18159 wal_manager.cpp:473] Scheduled(every 10s) WAL info: [/data/storage/wal: limit 52712313241 Bytes, used 0 Bytes, estimated wal bytes 0 Bytes, available 52712313241 Bytes.];
I20240401 11:42:36.840699 18537 fragment_mgr.cpp:623] query_id: 438bc6d0e8a4449e-b9e0a6164f4409d7 coord_addr TNetworkAddress(hostname=172.30.35.68, port=9020) total fragment num on current host: 2 fe process uuid: 1711937685715
I20240401 11:42:36.840754 18537 fragment_mgr.cpp:648] Query/load id: 438bc6d0e8a4449e-b9e0a6164f4409d7, use task group: TG[id = 1, name = normal, cpu_share = 1024, memory_limit = 8.44 GB, enable_memory_overcommit = true, version = 0, cpu_hard_limit = -1, scan_thread_num = 48, max_remote_scan_thread_num = 48, min_remote_scan_thread_num = 48], is pipeline: 1, enable cgroup soft limit: 1
I20240401 11:42:36.840790 18537 fragment_mgr.cpp:661] Register query/load memory tracker, query/load id: 438bc6d0e8a4449e-b9e0a6164f4409d7 limit: 0
I20240401 11:42:36.840813 18537 pipeline_x_fragment_context.cpp:169] PipelineXFragmentContext::prepare|query_id=438bc6d0e8a4449e-b9e0a6164f4409d7|fragment_id=1|pthread_id=140602122127104
I20240401 11:42:36.841109 18537 pipeline_x_fragment_context.cpp:169] PipelineXFragmentContext::prepare|query_id=438bc6d0e8a4449e-b9e0a6164f4409d7|fragment_id=0|pthread_id=140602122127104
F20240401 11:42:36.850886 19282 column_object.h:469] should not call the method in column object

be.out日志

F20240401 11:42:36.850886 19282 column_object.h:469] should not call the method in column object
*** Check failure stack trace: ***
    @     0x55713b8faff6  google::LogMessage::SendToLog()
    @     0x55713b8f7a40  google::LogMessage::Flush()
    @     0x55713b8fb839  google::LogMessageFatal::~LogMessageFatal()
    @     0x557134f19331  doris::vectorized::ColumnObject::replace_column_data_default()
    @     0x55713b0cde5e  doris::vectorized::BlockReader::_copy_agg_data()
    @     0x55713b0cd7ea  doris::vectorized::BlockReader::_update_agg_data()
    @     0x55713b0cced4  doris::vectorized::BlockReader::_agg_key_next_block()
    @     0x55713b0ca34f  doris::vectorized::BlockReader::next_block_with_aggregation()
    @     0x55713635395d  doris::vectorized::NewOlapScanner::_get_block_impl()
    @     0x557136418b7d  doris::vectorized::VScanner::get_block()
    @     0x5571364186f3  doris::vectorized::VScanner::get_block_after_projects()
    @     0x55713635dc0e  doris::vectorized::ScannerScheduler::_scanner_scan()
    @     0x55713635e7b7  std::_Function_handler<>::_M_invoke()
    @     0x5571321b5aa8  doris::ThreadPool::dispatch_thread()
    @     0x5571321ab141  doris::Thread::supervise_thread()
    @     0x7fe351457ea5  start_thread
    @     0x7fe351e86b0d  __clone
    @              (nil)  (unknown)
*** Query id: 438bc6d0e8a4449e-b9e0a6164f4409d7 ***
*** tablet id: 0 ***
*** Aborted at 1711942957 (unix time) try "date -d @1711942957" if you are using GNU date ***
*** Current BE git commitID: 91efb6a43d ***
*** SIGABRT unknown detail explain (@0x42aa) received by PID 17066 (TID 19282 OR 0x7fdf59022700) from PID 17066; stack trace: ***
 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /home/zcp/repo_center/doris_release/doris/be/src/common/signal_handler.h:417
 1# 0x00007FE351DBE400 in /lib64/libc.so.6
 2# gsignal in /lib64/libc.so.6
 3# abort in /lib64/libc.so.6
 4# 0x000055713B90580D in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
 5# 0x000055713B8F7F0A in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
 6# google::LogMessage::SendToLog() in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
 7# google::LogMessage::Flush() in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
 8# google::LogMessageFatal::~LogMessageFatal() in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
 9# doris::vectorized::ColumnObject::replace_column_data_default(unsigned long) in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
10# doris::vectorized::BlockReader::_copy_agg_data() in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
11# doris::vectorized::BlockReader::_update_agg_data(std::vector<COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn>, std::allocator<COW<doris::vectorized::IColumn>::mutable_ptr<doris::vectorized::IColumn> > >&) at /home/zcp/repo_center/doris_release/doris/be/src/vec/olap/block_reader.cpp:461
12# doris::vectorized::BlockReader::_agg_key_next_block(doris::vectorized::Block*, bool*) in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
13# doris::vectorized::BlockReader::next_block_with_aggregation(doris::vectorized::Block*, bool*) at /home/zcp/repo_center/doris_release/doris/be/src/vec/olap/block_reader.cpp:66
14# doris::vectorized::NewOlapScanner::_get_block_impl(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /home/zcp/repo_center/doris_release/doris/be/src/vec/exec/scan/new_olap_scanner.cpp:503
15# doris::vectorized::VScanner::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
16# doris::vectorized::VScanner::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /home/zcp/repo_center/doris_release/doris/be/src/vec/exec/scan/vscanner.cpp:85
17# doris::vectorized::ScannerScheduler::_scanner_scan(std::shared_ptr<doris::vectorized::ScannerContext>, std::shared_ptr<doris::vectorized::ScanTask>) at /home/zcp/repo_center/doris_release/doris/be/src/vec/exec/scan/scanner_scheduler.cpp:267
18# std::_Function_handler<void (), doris::vectorized::ScannerScheduler::submit(std::shared_ptr<doris::vectorized::ScannerContext>, std::shared_ptr<doris::vectorized::ScanTask>)::$_1>::_M_invoke(std::_Any_data const&) at /var/local/ldb_toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_function.h:291
19# doris::ThreadPool::dispatch_thread() in /data/apache-doris-2.1.0-bin-x64/be/lib/doris_be
20# doris::Thread::supervise_thread(void*) at /home/zcp/repo_center/doris_release/doris/be/src/util/thread.cpp:499
21# start_thread in /lib64/libpthread.so.0
22# clone in /lib64/libc.so.6

3 Answers

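这是因为在聚合模型（AGGREGATE KEY）表中使用了 VARIANT 类型的字段（即建表语句中的 `labels` 列）导致的：从 be.out 的堆栈可以看到，查询在聚合读取阶段（BlockReader::_copy_agg_data）调用了 ColumnObject::replace_column_data_default()，触发了 "should not call the method in column object" 的致命错误，导致 BE 宕机。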