Doris版本2.1.5
库中有个表13亿条数据
select TABLE_SCHEMA,TABLE_NAME,TABLE_ROWS,AVG_ROW_LENGTH,DATA_LENGTH from information_schema.tables order by TABLE_ROWS desc
表结构如下
CREATE TABLE `rta_sys_monitor_caller` (
`timestamp` DATETIME(6) NOT NULL,
`key` VARCHAR(2000) NOT NULL,
`app_code` VARCHAR(100) NOT NULL,
`host_name` VARCHAR(100) NOT NULL,
`host_address` VARCHAR(100) NOT NULL,
`rta_sequence` BIGINT NOT NULL AUTO_INCREMENT(1),
`process_state` TINYINT NOT NULL,
`metrics_info` VARIANT NULL,
INDEX idx_Ir_ts2 (`app_code`) USING INVERTED PROPERTIES(
"char_filter_pattern" = "ts",
"parser" = "english",
"lower_case" = "true",
"char_filter_type" = "char_replace",
"support_phrase" = "true",
"ignore_above" = "256",
"char_filter_replacement" = "we"
)
) ENGINE = OLAP UNIQUE KEY(
`timestamp`,
`key`,
`app_code`,
`host_name`,
`host_address`,
`rta_sequence`
) AUTO PARTITION BY RANGE (date_trunc(`timestamp`, 'day')) (
PARTITION p20240630000000
VALUES
[('2024-06-30 00:00:00'), ('2024-07-01 00:00:00')), PARTITION p20240701000000 VALUES [('2024-07-01 00:00:00'), ('2024-07-02 00:00:00')), PARTITION p20240702000000 VALUES [('2024-07-02 00:00:00'), ('2024-07-03 00:00:00')), PARTITION p20240703000000 VALUES [('2024-07-03 00:00:00'), ('2024-07-04 00:00:00')), PARTITION p20240704000000 VALUES [('2024-07-04 00:00:00'), ('2024-07-05 00:00:00')), PARTITION p20240705000000 VALUES [('2024-07-05 00:00:00'), ('2024-07-06 00:00:00')), PARTITION p20240706000000 VALUES [('2024-07-06 00:00:00'), ('2024-07-07 00:00:00')), PARTITION p20240707000000 VALUES [('2024-07-07 00:00:00'), ('2024-07-08 00:00:00')), PARTITION p20240708000000 VALUES [('2024-07-08 00:00:00'), ('2024-07-09 00:00:00')), PARTITION p20240709000000 VALUES [('2024-07-09 00:00:00'), ('2024-07-10 00:00:00')), PARTITION p20240710000000 VALUES [('2024-07-10 00:00:00'), ('2024-07-11 00:00:00')), PARTITION p20240711000000 VALUES [('2024-07-11 00:00:00'), ('2024-07-12 00:00:00')), PARTITION p20240712000000 VALUES [('2024-07-12 00:00:00'), ('2024-07-13 00:00:00')), PARTITION p20240713000000 VALUES [('2024-07-13 00:00:00'), ('2024-07-14 00:00:00')), PARTITION p20240714000000 VALUES [('2024-07-14 00:00:00'), ('2024-07-15 00:00:00')), PARTITION p20240715000000 VALUES [('2024-07-15 00:00:00'), ('2024-07-16 00:00:00')), PARTITION p20240716000000 VALUES [('2024-07-16 00:00:00'), ('2024-07-17 00:00:00')), PARTITION p20240717000000 VALUES [('2024-07-17 00:00:00'), ('2024-07-18 00:00:00')), PARTITION p20240718000000 VALUES [('2024-07-18 00:00:00'), ('2024-07-19 00:00:00')), PARTITION p20240719000000 VALUES [('2024-07-19 00:00:00'), ('2024-07-20 00:00:00')), PARTITION p20240720000000 VALUES [('2024-07-20 00:00:00'), ('2024-07-21 00:00:00')), PARTITION p20240721000000 VALUES [('2024-07-21 00:00:00'), ('2024-07-22 00:00:00')), PARTITION p20240722000000 VALUES [('2024-07-22 00:00:00'), ('2024-07-23 00:00:00')), PARTITION p20240723000000 VALUES [('2024-07-23 00:00:00'), ('2024-07-24 00:00:00')), PARTITION p20240724000000 VALUES [('2024-07-24 00:00:00'), ('2024-07-25 00:00:00')), PARTITION p20240725000000 VALUES [('2024-07-25 00:00:00'), ('2024-07-26 00:00:00')), PARTITION p20240726000000 VALUES [('2024-07-26 00:00:00'), ('2024-07-27 00:00:00')), PARTITION p20240727000000 VALUES [('2024-07-27 00:00:00'), ('2024-07-28 00:00:00')), PARTITION p20240728000000 VALUES [('2024-07-28 00:00:00'), ('2024-07-29 00:00:00')), PARTITION p20240729000000 VALUES [('2024-07-29 00:00:00'), ('2024-07-30 00:00:00')), PARTITION p20240730000000 VALUES [('2024-07-30 00:00:00'), ('2024-07-31 00:00:00'))) DISTRIBUTED BY HASH(`timestamp`) BUCKETS AUTO PROPERTIES ( "replication_allocation" = "tag.location.default: 3", "min_load_replica_num" = "1", "is_being_synced" = "false", "storage_medium" = "hdd", "storage_format" = "V2", "inverted_index_storage_format" = "V1", "enable_unique_key_merge_on_write" = "true", "light_schema_change" = "true", "skip_write_index_on_load" = "true", "compaction_policy" = "time_series", "time_series_compaction_goal_size_mbytes" = "5120", "time_series_compaction_file_count_threshold" = "1000", "time_series_compaction_time_threshold_seconds" = "3600", "time_series_compaction_empty_rowsets_threshold" = "5", "time_series_compaction_level_threshold" = "2", "disable_auto_compaction" = "false", "enable_single_replica_compaction" = "false", "group_commit_interval_ms" = "10000", "group_commit_data_bytes" = "134217728", "enable_mow_light_delete" = "false" );
现在需要增加BloomFilter 索引和N-Gram 索引
ALTER TABLE rta_sys_monitor_caller SET ("bloom_filter_columns" = "app_code,host_name")
瞬间显示执行成功,但是show create table 等了半个小时还是没有增加上,看文档上没有相关执行进度查询的介绍,最后到底是执行成功还是失败不得而知。
同理还有N-Gram索引
ALTER TABLE rta_sys_monitor_caller ADD INDEX idx_NG_ts3(host_address) USING NGRAM_BF PROPERTIES("gram_size"="3","bf_size"="1024") COMMENT ' a'
倒排索引创建成功
按照文档show build也都Finishd了,show create table 也可以显示,只是创建过程大约1分钟,不确定对于13亿数据,1分钟是否对存量数据都创建成功,当然1分钟如果全部创建成功,那效率还是很高的!
最终的问题
BloomFilter 索引和N-Gram 索引到底是同步创建还是异步创建,创建进度和结果如何获取?