--参考文档
https://doris.apache.org/zh-CN/docs/table-design/index/ngram-bloomfilter-index
- 查询全表数据,SQL 运行 count() 确认导入数据成功
mysql> SELECT COUNT() FROM amazon_reviews;
+-----------+
| count(*) |
+-----------+
| 135589433 |
+-----------+
2.查看表的信息
mysql> show create table amazon_reviews \G
*************************** 1. row ***************************
Table: amazon_reviews
Create Table: CREATE TABLE amazon_reviews
(
review_date
INT NULL,
marketplace
VARCHAR(20) NULL,
customer_id
BIGINT NULL,
review_id
VARCHAR(40) NULL,
product_id
VARCHAR(10) NULL,
product_parent
BIGINT NULL,
product_title
VARCHAR(500) NULL,
product_category
VARCHAR(50) NULL,
star_rating
SMALLINT NULL,
helpful_votes
INT NULL,
total_votes
INT NULL,
vine
BOOLEAN NULL,
verified_purchase
BOOLEAN NULL,
review_headline
VARCHAR(500) NULL,
review_body
TEXT NULL
) ENGINE=OLAP
DUPLICATE KEY(review_date
)
COMMENT 'OLAP'
DISTRIBUTED BY HASH(review_date
) BUCKETS 16
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"min_load_replica_num" = "-1",
"is_being_synced" = "false",
"storage_medium" = "hdd",
"storage_format" = "V2",
"inverted_index_storage_format" = "V1",
"compression" = "ZSTD",
"light_schema_change" = "true",
"disable_auto_compaction" = "false",
"enable_single_replica_compaction" = "false",
"group_commit_interval_ms" = "10000",
"group_commit_data_bytes" = "134217728"
);
1 row in set (0.00 sec)
3.没有索引的时候运行查询,WHERE 条件中有 LIKE 查询时间 8.27秒
mysql> SELECT
-> product_id,
-> any(product_title),
-> AVG(star_rating) AS rating,
-> COUNT() AS count
-> FROM
-> amazon_reviews
-> WHERE
-> review_body LIKE '%is super awesome%'
-> GROUP BY
-> product_id
-> ORDER BY
-> count DESC,
-> rating DESC,
-> product_id
-> LIMIT 5;
+------------+------------------------------------------+-------------------+-------+
| product_id | any_value(product_title) | rating | count |
+------------+------------------------------------------+-------------------+-------+
| B00992CF6W | Minecraft | 4.823529411764706 | 17 |
| B009UX2YAC | Subway Surfers | 4.777777777777778 | 9 |
| B00DJFIMW6 | Minion Rush: Despicable Me Official Game | 4.875 | 8 |
| B0086700CM | Temple Run | 5 | 6 |
| B00KWVZ750 | Angry Birds Epic RPG | 5 | 6 |
+------------+------------------------------------------+-------------------+-------+
5 rows in set (8.27 sec)
4.添加 NGram BloomFilter 索引
mysql> ALTER TABLE amazon_reviews ADD INDEX review_body_ngram_idx(review_body) USING NGRAM_BF PROPERTIES("gram_size"="10", "bf_size"="10240");
Query OK, 0 rows affected (0.01 sec)
--查看表结构 索引创建成功
Create Table: CREATE TABLE amazon_reviews
(
review_date
INT NULL,
marketplace
VARCHAR(20) NULL,
customer_id
BIGINT NULL,
review_id
VARCHAR(40) NULL,
product_id
VARCHAR(10) NULL,
product_parent
BIGINT NULL,
product_title
VARCHAR(500) NULL,
product_category
VARCHAR(50) NULL,
star_rating
SMALLINT NULL,
helpful_votes
INT NULL,
total_votes
INT NULL,
vine
BOOLEAN NULL,
verified_purchase
BOOLEAN NULL,
review_headline
VARCHAR(500) NULL,
review_body
TEXT NULL,
INDEX review_body_ngram_idx (review_body
) USING NGRAM_BF PROPERTIES("gram_size" = "10", "bf_size" = "10240") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(review_date
)
COMMENT 'OLAP'
DISTRIBUTED BY HASH(review_date
) BUCKETS 16
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"min_load_replica_num" = "-1",
"is_being_synced" = "false",
"storage_medium" = "hdd",
"storage_format" = "V2",
"inverted_index_storage_format" = "V1",
"compression" = "ZSTD",
"light_schema_change" = "true",
"disable_auto_compaction" = "false",
"enable_single_replica_compaction" = "false",
"group_commit_interval_ms" = "10000",
"group_commit_data_bytes" = "134217728"
);
5.再次查询SQL语句 查询时间 7.65 秒,并没有提的8倍的查询性能
mysql> SELECT
-> product_id,
-> any(product_title),
-> AVG(star_rating) AS rating,
-> COUNT() AS count
-> FROM
-> amazon_reviews
-> WHERE
-> review_body LIKE '%is super awesome%'
-> GROUP BY
-> product_id
-> ORDER BY
-> count DESC,
-> rating DESC,
-> product_id
-> LIMIT 5;
+------------+------------------------------------------+-------------------+-------+
| product_id | any_value(product_title) | rating | count |
+------------+------------------------------------------+-------------------+-------+
| B00992CF6W | Minecraft | 4.823529411764706 | 17 |
| B009UX2YAC | Subway Surfers | 4.777777777777778 | 9 |
| B00DJFIMW6 | Minion Rush: Despicable Me Official Game | 4.875 | 8 |
| B0086700CM | Temple Run | 5 | 6 |
| B00KWVZ750 | Angry Birds Epic RPG | 5 | 6 |
+------------+------------------------------------------+-------------------+-------+
5 rows in set (7.65 sec)
--下面是官方文档的执行结果,创建所有后性能提升了 8倍