[Commits] d1d0c156c62: Apply patch: Add partial index iterator
revision-id: d1d0c156c629689b013de067b6fa01e4009484d5 (percona-202102-54-gd1d0c156c62) parent(s): aaebd623e98e59db3efe1b231307e4142240c485 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-05-17 18:00:42 +0300 message: Apply patch: Add partial index iterator Summary: This adds the partial index iterator. It is a special iterator that sorts groups from the primary key on the fly as needed, and if it exceeds a certain threshold, it will materialize the rows on the secondary key as well. For point queries, if the secondary key entry is not found, we simply extract the primary key parts and read from the primary key. Note that this means point queries don't trigger materialization. Test Plan: mtr Reviewers: luqun, herman, yzha, #mysql_eng Subscribers: phabricatorlinter, pgl Differential Revision: https://phabricator.intern.facebook.com/D25933178 --- mysql-test/include/diff_queries.inc | 14 + mysql-test/suite/rocksdb/r/partial_index.result | 146 +++++ .../suite/rocksdb/r/partial_index_assoc.result | 171 ++++++ .../suite/rocksdb/r/partial_index_stress.result | 74 +++ mysql-test/suite/rocksdb/r/rocksdb.result | 12 + mysql-test/suite/rocksdb/t/partial_index.inc | 156 ++++++ mysql-test/suite/rocksdb/t/partial_index.test | 19 + .../suite/rocksdb/t/partial_index_assoc-master.opt | 3 + mysql-test/suite/rocksdb/t/partial_index_assoc.inc | 95 ++++ .../suite/rocksdb/t/partial_index_assoc.test | 58 ++ .../rocksdb/t/partial_index_stress-master.opt | 3 + mysql-test/suite/rocksdb/t/partial_index_stress.py | 114 ++++ .../suite/rocksdb/t/partial_index_stress.test | 64 +++ storage/rocksdb/ha_rocksdb.cc | 77 ++- storage/rocksdb/ha_rocksdb.h | 8 + storage/rocksdb/rdb_datadic.h | 6 + storage/rocksdb/rdb_iterator.cc | 591 +++++++++++++++++++++ storage/rocksdb/rdb_iterator.h | 69 ++- 18 files changed, 1672 insertions(+), 8 deletions(-) diff --git a/mysql-test/include/diff_queries.inc b/mysql-test/include/diff_queries.inc new file mode 100644 index 00000000000..beb75093759 --- 
/dev/null +++ b/mysql-test/include/diff_queries.inc @@ -0,0 +1,14 @@ +--disable_query_log + +--output $MYSQL_TMP_DIR/A +--eval $query1 + +--output $MYSQL_TMP_DIR/B +--eval $query2 + +--enable_query_log + +--diff_files $MYSQL_TMP_DIR/A $MYSQL_TMP_DIR/B + +--remove_file $MYSQL_TMP_DIR/A +--remove_file $MYSQL_TMP_DIR/B diff --git a/mysql-test/suite/rocksdb/r/partial_index.result b/mysql-test/suite/rocksdb/r/partial_index.result new file mode 100644 index 00000000000..a752d37180c --- /dev/null +++ b/mysql-test/suite/rocksdb/r/partial_index.result @@ -0,0 +1,146 @@ +set optimizer_force_index_for_range = on; +CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64), +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. +INSERT INTO t values ("1", "1", "2", "1"); +INSERT INTO t values ("1", "2", "1", "1"); +INSERT INTO t values ("11111111", "1", "9", "1"); +INSERT INTO t values ("11111111", "2", "8", "1"); +INSERT INTO t values ("11111111", "3", "7", "1"); +INSERT INTO t values ("11111111", "4", "5", "1"); +INSERT INTO t values ("11111111", "5", "4", "1"); +INSERT INTO t values ("11111111", "6", "2", "1"); +INSERT INTO t values ("111111111", "1", "9", "1"); +INSERT INTO t values ("111111111", "2", "2", "1"); +INSERT INTO t values ("11111112", "1", "1", "1"); +DROP TABLE t; +CREATE TABLE t (i int, j int, k int, l int, +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. 
+INSERT INTO t values (1, 1, 2, 1); +INSERT INTO t values (1, 2, 1, 1); +INSERT INTO t values (2, 1, 9, 1); +INSERT INTO t values (2, 2, 8, 1); +INSERT INTO t values (2, 3, 7, 1); +INSERT INTO t values (2, 4, 5, 1); +INSERT INTO t values (2, 5, 4, 1); +INSERT INTO t values (2, 6, 2, 1); +INSERT INTO t values (4, 1, 1, 1); +DROP TABLE t; +CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64), +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=rev:cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. +INSERT INTO t values ("1", "1", "2", "1"); +INSERT INTO t values ("1", "2", "1", "1"); +INSERT INTO t values ("11111111", "1", "9", "1"); +INSERT INTO t values ("11111111", "2", "8", "1"); +INSERT INTO t values ("11111111", "3", "7", "1"); +INSERT INTO t values ("11111111", "4", "5", "1"); +INSERT INTO t values ("11111111", "5", "4", "1"); +INSERT INTO t values ("11111111", "6", "2", "1"); +INSERT INTO t values ("111111111", "1", "9", "1"); +INSERT INTO t values ("111111111", "2", "2", "1"); +INSERT INTO t values ("11111112", "1", "1", "1"); +DROP TABLE t; +CREATE TABLE t (i int, j int, k int, l int, +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=rev:cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. 
+INSERT INTO t values (1, 1, 2, 1); +INSERT INTO t values (1, 2, 1, 1); +INSERT INTO t values (2, 1, 9, 1); +INSERT INTO t values (2, 2, 8, 1); +INSERT INTO t values (2, 3, 7, 1); +INSERT INTO t values (2, 4, 5, 1); +INSERT INTO t values (2, 5, 4, 1); +INSERT INTO t values (2, 6, 2, 1); +INSERT INTO t values (4, 1, 1, 1); +DROP TABLE t; +CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64), +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. +INSERT INTO t values ("1", "1", "2", "1"); +INSERT INTO t values ("1", "2", "1", "1"); +INSERT INTO t values ("11111111", "1", "9", "1"); +INSERT INTO t values ("11111111", "2", "8", "1"); +INSERT INTO t values ("11111111", "3", "7", "1"); +INSERT INTO t values ("11111111", "4", "5", "1"); +INSERT INTO t values ("11111111", "5", "4", "1"); +INSERT INTO t values ("11111111", "6", "2", "1"); +INSERT INTO t values ("111111111", "1", "9", "1"); +INSERT INTO t values ("111111111", "2", "2", "1"); +INSERT INTO t values ("11111112", "1", "1", "1"); +DROP TABLE t; +CREATE TABLE t (i int, j int, k int, l int, +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. 
+INSERT INTO t values (1, 1, 2, 1); +INSERT INTO t values (1, 2, 1, 1); +INSERT INTO t values (2, 1, 9, 1); +INSERT INTO t values (2, 2, 8, 1); +INSERT INTO t values (2, 3, 7, 1); +INSERT INTO t values (2, 4, 5, 1); +INSERT INTO t values (2, 5, 4, 1); +INSERT INTO t values (2, 6, 2, 1); +INSERT INTO t values (4, 1, 1, 1); +DROP TABLE t; +CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64), +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=rev:cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. +INSERT INTO t values ("1", "1", "2", "1"); +INSERT INTO t values ("1", "2", "1", "1"); +INSERT INTO t values ("11111111", "1", "9", "1"); +INSERT INTO t values ("11111111", "2", "8", "1"); +INSERT INTO t values ("11111111", "3", "7", "1"); +INSERT INTO t values ("11111111", "4", "5", "1"); +INSERT INTO t values ("11111111", "5", "4", "1"); +INSERT INTO t values ("11111111", "6", "2", "1"); +INSERT INTO t values ("111111111", "1", "9", "1"); +INSERT INTO t values ("111111111", "2", "2", "1"); +INSERT INTO t values ("11111112", "1", "1", "1"); +DROP TABLE t; +CREATE TABLE t (i int, j int, k int, l int, +PRIMARY KEY (i, j), +KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5', +KEY ik2 (i, k) COMMENT 'cfname=rev:cf' +) ENGINE=ROCKSDB; +Warnings: +Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release. 
+INSERT INTO t values (1, 1, 2, 1); +INSERT INTO t values (1, 2, 1, 1); +INSERT INTO t values (2, 1, 9, 1); +INSERT INTO t values (2, 2, 8, 1); +INSERT INTO t values (2, 3, 7, 1); +INSERT INTO t values (2, 4, 5, 1); +INSERT INTO t values (2, 5, 4, 1); +INSERT INTO t values (2, 6, 2, 1); +INSERT INTO t values (4, 1, 1, 1); +DROP TABLE t; +set optimizer_force_index_for_range = off; diff --git a/mysql-test/suite/rocksdb/r/partial_index_assoc.result b/mysql-test/suite/rocksdb/r/partial_index_assoc.result new file mode 100644 index 00000000000..fbb89d16b35 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/partial_index_assoc.result @@ -0,0 +1,171 @@ +set optimizer_force_index_for_range = on; +CREATE TABLE `assoc_table` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', +KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type', +KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +Warnings: +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. 
+Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release. +ALTER TABLE assoc_table ENGINE=ROCKSDB; +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +variable_name diff +rocksdb_partial_index_groups_materialized 40 +rocksdb_partial_index_groups_sorted 47 +rocksdb_partial_index_rows_materialized 955 +rocksdb_partial_index_rows_sorted 1000 +include/assert.inc [Check that materialized groups are non-zero.] +include/assert.inc [Check that materialized rows are non-zero.] 
+DROP TABLE t1, t2; +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2); +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +variable_name diff +rocksdb_partial_index_groups_materialized 0 +rocksdb_partial_index_groups_sorted 7 +rocksdb_partial_index_rows_materialized 0 +rocksdb_partial_index_rows_sorted 45 +include/assert.inc [Check that materialized groups are zero.] +include/assert.inc [Check that materialized rows are zero.] +DROP TABLE t1, t2; +DROP TABLE assoc_table; +CREATE TABLE `assoc_table` ( +`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', +`raw_key` text COLLATE latin1_bin, +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', +KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type', +KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +Warnings: +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. 
+Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release. +ALTER TABLE assoc_table ENGINE=ROCKSDB; +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +variable_name diff +rocksdb_partial_index_groups_materialized 40 +rocksdb_partial_index_groups_sorted 47 +rocksdb_partial_index_rows_materialized 955 +rocksdb_partial_index_rows_sorted 1000 +include/assert.inc [Check that materialized groups are non-zero.] +include/assert.inc [Check that materialized rows are non-zero.] +DROP TABLE t1, t2; +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2); +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +variable_name diff +rocksdb_partial_index_groups_materialized 0 +rocksdb_partial_index_groups_sorted 7 +rocksdb_partial_index_rows_materialized 0 +rocksdb_partial_index_rows_sorted 45 +include/assert.inc [Check that materialized groups are zero.] 
+include/assert.inc [Check that materialized rows are zero.] +DROP TABLE t1, t2; +DROP TABLE assoc_table; +CREATE TABLE `assoc_table` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(4) NOT NULL DEFAULT '0', +`data` text COLLATE latin1_bin NOT NULL, +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', +KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type', +KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; +Warnings: +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release. 
+ALTER TABLE assoc_table ENGINE=ROCKSDB; +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +variable_name diff +rocksdb_partial_index_groups_materialized 40 +rocksdb_partial_index_groups_sorted 47 +rocksdb_partial_index_rows_materialized 955 +rocksdb_partial_index_rows_sorted 1000 +include/assert.inc [Check that materialized groups are non-zero.] +include/assert.inc [Check that materialized rows are non-zero.] +DROP TABLE t1, t2; +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2); +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +variable_name diff +rocksdb_partial_index_groups_materialized 0 +rocksdb_partial_index_groups_sorted 7 +rocksdb_partial_index_rows_materialized 0 +rocksdb_partial_index_rows_sorted 45 +include/assert.inc [Check that materialized groups are zero.] +include/assert.inc [Check that materialized rows are zero.] 
+DROP TABLE t1, t2; +DROP TABLE assoc_table; +set optimizer_force_index_for_range = off; diff --git a/mysql-test/suite/rocksdb/r/partial_index_stress.result b/mysql-test/suite/rocksdb/r/partial_index_stress.result new file mode 100644 index 00000000000..88f77bcc63f --- /dev/null +++ b/mysql-test/suite/rocksdb/r/partial_index_stress.result @@ -0,0 +1,74 @@ +set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set global rocksdb_lock_wait_timeout = 100000; +CREATE TABLE `assoc_table` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', +KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +Warnings: +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. 
+Warning 1681 Integer display width is deprecated and will be removed in a future release. +DROP TABLE assoc_table; +CREATE TABLE `assoc_table` ( +`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', +`raw_key` text COLLATE latin1_bin, +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(3) NOT NULL DEFAULT '0', +`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', +KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type', +KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; +Warnings: +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release. 
+DROP TABLE assoc_table; +CREATE TABLE `assoc_table` ( +`id1` bigint(20) unsigned NOT NULL DEFAULT '0', +`id1_type` int(10) unsigned NOT NULL DEFAULT '0', +`id2` bigint(20) unsigned NOT NULL DEFAULT '0', +`id2_type` int(10) unsigned NOT NULL DEFAULT '0', +`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', +`visibility` tinyint(4) NOT NULL DEFAULT '0', +`data` text COLLATE latin1_bin NOT NULL, +`time` int(10) unsigned NOT NULL DEFAULT '0', +`version` bigint(20) unsigned NOT NULL DEFAULT '0', +PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', +KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type', +KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; +Warnings: +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1681 Integer display width is deprecated and will be removed in a future release. +Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release. 
+DROP TABLE assoc_table; +set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout; diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result index 61dd8184ddf..5c440c88317 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb.result +++ b/mysql-test/suite/rocksdb/r/rocksdb.result @@ -1656,6 +1656,10 @@ rocksdb_number_sst_entry_singledelete # rocksdb_number_superversion_acquires # rocksdb_number_superversion_cleanups # rocksdb_number_superversion_releases # +rocksdb_partial_index_groups_materialized # +rocksdb_partial_index_groups_sorted # +rocksdb_partial_index_rows_materialized # +rocksdb_partial_index_rows_sorted # rocksdb_row_lock_deadlocks # rocksdb_row_lock_wait_timeouts # rocksdb_select_bypass_executed # @@ -1760,6 +1764,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS ROCKSDB_NUMBER_SUPERVERSION_RELEASES ROCKSDB_NUM_ITERATORS +ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED +ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED +ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED +ROCKSDB_PARTIAL_INDEX_ROWS_SORTED ROCKSDB_QUERIES_POINT ROCKSDB_QUERIES_RANGE ROCKSDB_ROWS_DELETED @@ -1883,6 +1891,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS ROCKSDB_NUMBER_SUPERVERSION_RELEASES ROCKSDB_NUM_ITERATORS +ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED +ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED +ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED +ROCKSDB_PARTIAL_INDEX_ROWS_SORTED ROCKSDB_QUERIES_POINT ROCKSDB_QUERIES_RANGE ROCKSDB_ROWS_DELETED diff --git a/mysql-test/suite/rocksdb/t/partial_index.inc b/mysql-test/suite/rocksdb/t/partial_index.inc new file mode 100644 index 00000000000..7137a696d6a --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index.inc @@ -0,0 +1,156 @@ +eval +CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64), + PRIMARY KEY (i, j), + KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5', + KEY ik2 (i, k) COMMENT 
'cfname=$cfname' +) ENGINE=ROCKSDB; + +INSERT INTO t values ("1", "1", "2", "1"); +INSERT INTO t values ("1", "2", "1", "1"); + +INSERT INTO t values ("11111111", "1", "9", "1"); +INSERT INTO t values ("11111111", "2", "8", "1"); +INSERT INTO t values ("11111111", "3", "7", "1"); +INSERT INTO t values ("11111111", "4", "5", "1"); +INSERT INTO t values ("11111111", "5", "4", "1"); +INSERT INTO t values ("11111111", "6", "2", "1"); + +INSERT INTO t values ("111111111", "1", "9", "1"); +INSERT INTO t values ("111111111", "2", "2", "1"); + +INSERT INTO t values ("11111112", "1", "1", "1"); + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "1" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "1" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "111111110" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "111111110" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111112" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111112" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc; +--source 
include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < "111111110" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < "111111110" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > "111111110" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > "111111110" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= "111111110" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= "111111110" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= "111111110" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= "111111110" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "1%" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t 
FORCE INDEX (ik2) WHERE i LIKE "1%" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +DROP TABLE t; + +eval +CREATE TABLE t (i int, j int, k int, l int, + PRIMARY KEY (i, j), + KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5', + KEY ik2 (i, k) COMMENT 'cfname=$cfname' +) ENGINE=ROCKSDB; + +INSERT INTO t values (1, 1, 2, 1); +INSERT INTO t values (1, 2, 1, 1); + +INSERT INTO t values (2, 1, 9, 1); +INSERT INTO t values (2, 2, 8, 1); +INSERT INTO t values (2, 3, 7, 1); +INSERT INTO t values (2, 4, 5, 1); +INSERT INTO t values (2, 5, 4, 1); +INSERT INTO t values (2, 6, 2, 1); + +INSERT INTO t values (4, 1, 1, 1); + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 1 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 1 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 3 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 3 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 4 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 4 ORDER BY i $asc , k $asc; 
+--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < 3 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < 3 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > 3 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > 3 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= 3 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= 3 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= 3 ORDER BY i $asc , k $asc; +--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= 3 ORDER BY i $asc , k $asc; +--source include/diff_queries.inc + +DROP TABLE t; diff --git 
a/mysql-test/suite/rocksdb/t/partial_index.test b/mysql-test/suite/rocksdb/t/partial_index.test new file mode 100644 index 00000000000..410c772765e --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index.test @@ -0,0 +1,19 @@ +set optimizer_force_index_for_range = on; + +--let $asc=ASC +--let $cfname=cf +--source partial_index.inc + +--let $asc=ASC +--let $cfname=rev:cf +--source partial_index.inc + +--let $asc=DESC +--let $cfname=cf +--source partial_index.inc + +--let $asc=DESC +--let $cfname=rev:cf +--source partial_index.inc + +set optimizer_force_index_for_range = off; diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt new file mode 100644 index 00000000000..81bc90b0531 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt @@ -0,0 +1,3 @@ +--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000 +--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m}; + diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.inc b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc new 
file mode 100644 index 00000000000..d0508a3f40b --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc @@ -0,0 +1,95 @@ +--let $binary_id1=1 +if (`select DATA_TYPE = 'binary' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'id1'`) { + --let $binary_id1="1" +} +--let $text=`select DATA_TYPE = 'text' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'data'` + +# This creates 10 distinct types, with up to 9 distinct id1s per type, to give up to 90 groups. +--disable_query_log +let $i=0; +while ($i < 1000) +{ + if ($text) { + eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 50)), FLOOR(RAND($i) * 100000), 789); + } + if (!$text) { + eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 20)), FLOOR(RAND($i) * 100000), 789); + } + inc $i; +} +--enable_query_log + +let $i=0; +while ($i < 10) { + # This gives a range plan + --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000 + --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000 + --source include/diff_queries.inc + + # This gives a ref plan + --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000 + --let 
$query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000 + --source include/diff_queries.inc + + inc $i; +} + +let $i=0; +while ($i < 10) { + --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = 1 + --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = 1 + --source include/diff_queries.inc + + --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type <= 2 + --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type <= 2 + --source include/diff_queries.inc + + inc $i; +} + +# Rebuild the table so that nothing is materialized anymore. +ALTER TABLE assoc_table ENGINE=ROCKSDB; + +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; + +--let $query1= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type) +--let $query2= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2) +--source include/diff_queries.inc + +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; + +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +--let $assert_text = Check that materialized groups are non-zero. 
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] > 0 +--source include/assert.inc +--let $assert_text = Check that materialized rows are non-zero. +--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] > 0 +--source include/assert.inc +DROP TABLE t1, t2; + +# Rerun full index scan a second time, and check that no materialization occurs +CREATE TEMPORARY TABLE t1 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; + +--disable_result_log +SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2); +--enable_result_log + +CREATE TEMPORARY TABLE t2 AS +SELECT * FROM performance_schema.global_status +WHERE variable_name LIKE 'rocksdb_partial_index%'; + +SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name); +--let $assert_text = Check that materialized groups are zero. +--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] = 0 +--source include/assert.inc +--let $assert_text = Check that materialized rows are zero. 
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] = 0 +--source include/assert.inc + +DROP TABLE t1, t2; diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.test b/mysql-test/suite/rocksdb/t/partial_index_assoc.test new file mode 100644 index 00000000000..a559c67f673 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.test @@ -0,0 +1,58 @@ +set optimizer_force_index_for_range = on; +CREATE TABLE `assoc_table` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(3) NOT NULL DEFAULT '0', + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', + KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type', + KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +--source partial_index_assoc.inc + +DROP TABLE assoc_table; + +CREATE TABLE `assoc_table` ( + `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', + `raw_key` text COLLATE latin1_bin, + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(3) NOT NULL DEFAULT '0', + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', + `time` int(10) unsigned 
NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', + KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type', + KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +--source partial_index_assoc.inc + +DROP TABLE assoc_table; + +CREATE TABLE `assoc_table` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(4) NOT NULL DEFAULT '0', + `data` text COLLATE latin1_bin NOT NULL, + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', + KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type', + KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; + +--source partial_index_assoc.inc + +DROP TABLE assoc_table; +set optimizer_force_index_for_range = off; diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt new file mode 100644 index 00000000000..a105847c183 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt @@ -0,0 +1,3 @@ +--initialize --default_authentication_plugin=mysql_native_password 
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000 +--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m}; diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.py b/mysql-test/suite/rocksdb/t/partial_index_stress.py new file mode 100644 index 00000000000..07220d88705 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index_stress.py @@ -0,0 +1,114 @@ +""" +This script stress tests partial indexes by performing writes while concurrently checking PK/SK consistency. 
+ +Usage: partial_index_stress.py user host port db_name table_name + num_iters num_threads +""" +import MySQLdb +import random +import sys +import threading +import traceback + +def get_query(table_name, binary_id1): + assoc_type = random.randint(1, 2) + id1 = random.randint(1, 5) + id2 = random.randint(1, 20) + + r = random.randint(1, 3) + + if r == 1: + if binary_id1: + return """DELETE FROM %s WHERE id1 = "%d" and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type) + else: + return """DELETE FROM %s WHERE id1 = %d and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type) + else: + return """INSERT INTO %s VALUES (%d, 0, %d, 0, %d, 1, 'abc', 100, 1) ON DUPLICATE KEY UPDATE time=time+10, version=version+1""" % (table_name, id1, id2, assoc_type) + +class Worker(threading.Thread): + def __init__(self, con, table_name, num_iters, check, event): + threading.Thread.__init__(self) + self.con = con + self.table_name = table_name + self.num_iters = num_iters + self.check = check + self.event = event + self.exception = None + self.start() + + def run(self): + try: + if self.check: + self.run_check() + else: + self.run_write() + except Exception as e: + self.exception = traceback.format_exc() + + def run_write(self): + cur = self.con.cursor() + cur.execute("select data_type from information_schema.columns where table_schema = database() and table_name = '%s' and column_name = 'id1'" % self.table_name); + binary_id1 = cur.fetchone()[0] == "binary" + cur.execute("SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED") + for x in range(self.num_iters): + try: + cur.execute(get_query(self.table_name, binary_id1)) + self.con.commit() + except MySQLdb.OperationalError as e: + self.con.rollback() + cur = self.con.cursor() + raise e + + def run_check(self): + cur = self.con.cursor() + while not self.event.is_set(): + try: + cur.execute("SELECT COUNT(*) FROM %s FORCE INDEX(PRIMARY) UNION ALL SELECT COUNT(*) FROM %s FORCE INDEX(id1_type)" % 
(self.table_name, self.table_name)) + pk_count = cur.fetchone()[0] + sk_count = cur.fetchone()[0] + assert pk_count == sk_count, "Count mismatch %d != %d" % (pk_count, sk_count) + self.con.commit() + except MySQLdb.OperationalError as e: + self.con.rollback() + cur = self.con.cursor() + raise e + +if __name__ == '__main__': + if len(sys.argv) != 8: + print("Usage: partial_index_stress.py user host port db_name " \ + "table_name num_iters num_threads") + sys.exit(1) + + user = sys.argv[1] + host = sys.argv[2] + port = int(sys.argv[3]) + db = sys.argv[4] + table_name = sys.argv[5] + num_iters = int(sys.argv[6]) + num_workers = int(sys.argv[7]) + + done_event = threading.Event(); + + worker_failed = False + workers = [] + for i in range(num_workers): + w = Worker( + MySQLdb.connect(user=user, host=host, port=port, db=db), table_name, + num_iters, False, None) + workers.append(w) + + checker = Worker( + MySQLdb.connect(user=user, host=host, port=port, db=db), table_name, + num_iters, True, done_event) + + for w in workers: + w.join() + if w.exception: + print("Worker hit an exception:\n%s\n" % w.exception) + worker_failed = True + + done_event.set() + checker.join() + + if worker_failed: + sys.exit(1) diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.test b/mysql-test/suite/rocksdb/t/partial_index_stress.test new file mode 100644 index 00000000000..c78e8cb980e --- /dev/null +++ b/mysql-test/suite/rocksdb/t/partial_index_stress.test @@ -0,0 +1,64 @@ +# +# Stress partial indexes by performing writes, and checking that PK/SK are still consistent. 
+# + +set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set global rocksdb_lock_wait_timeout = 100000; + +CREATE TABLE `assoc_table` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(3) NOT NULL DEFAULT '0', + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', + KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10; + +DROP TABLE assoc_table; + +CREATE TABLE `assoc_table` ( + `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0', + `raw_key` text COLLATE latin1_bin, + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(3) NOT NULL DEFAULT '0', + `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '', + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', + KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type', + KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT 
CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8; + +exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10; + +DROP TABLE assoc_table; + +CREATE TABLE `assoc_table` ( + `id1` bigint(20) unsigned NOT NULL DEFAULT '0', + `id1_type` int(10) unsigned NOT NULL DEFAULT '0', + `id2` bigint(20) unsigned NOT NULL DEFAULT '0', + `id2_type` int(10) unsigned NOT NULL DEFAULT '0', + `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0', + `visibility` tinyint(4) NOT NULL DEFAULT '0', + `data` text COLLATE latin1_bin NOT NULL, + `time` int(10) unsigned NOT NULL DEFAULT '0', + `version` bigint(20) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc', + KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type', + KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10' +) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4; + +exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10; + +DROP TABLE assoc_table; + +set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout; diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 15c8b289a5e..f59b456675d 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -802,6 +802,11 @@ std::atomic<uint64_t> rocksdb_select_bypass_executed(0); std::atomic<uint64_t> rocksdb_select_bypass_rejected(0); std::atomic<uint64_t> rocksdb_select_bypass_failed(0); +std::atomic<uint64_t> rocksdb_partial_index_groups_sorted(0); +std::atomic<uint64_t> rocksdb_partial_index_groups_materialized(0); +std::atomic<uint64_t> rocksdb_partial_index_rows_sorted(0); +std::atomic<uint64_t> 
rocksdb_partial_index_rows_materialized(0); + static int rocksdb_trace_block_cache_access( THD *const thd MY_ATTRIBUTE((__unused__)), struct SYS_VAR *const var MY_ATTRIBUTE((__unused__)), void *const save, @@ -3111,7 +3116,7 @@ class Rdb_transaction { virtual void set_sync(bool sync) = 0; virtual void release_lock(const Rdb_key_def &key_descr, - const std::string &rowkey) = 0; + const std::string &rowkey, bool force = false) = 0; virtual bool prepare() = 0; @@ -3714,9 +3719,9 @@ class Rdb_transaction_impl : public Rdb_transaction { m_rocksdb_tx->GetWriteOptions()->sync = sync; } - void release_lock(const Rdb_key_def &key_descr, - const std::string &rowkey) override { - if (!THDVAR(m_thd, lock_scanned_rows)) { + void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey, + bool force) override { + if (!THDVAR(m_thd, lock_scanned_rows) || force) { m_rocksdb_tx->UndoGetForUpdate(key_descr.get_cf(), rocksdb::Slice(rowkey)); // row_lock_count track row(pk) @@ -4167,7 +4172,8 @@ class Rdb_writebatch_impl : public Rdb_transaction { void set_sync(bool sync) override { write_opts.sync = sync; } void release_lock(const Rdb_key_def &key_descr MY_ATTRIBUTE((unused)), - const std::string &rowkey MY_ATTRIBUTE((unused))) override { + const std::string &rowkey MY_ATTRIBUTE((unused)), + bool force MY_ATTRIBUTE((unused))) override { // Nothing to do here since we don't hold any row locks. } @@ -6922,6 +6928,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() { active_index = table->s->next_number_index; const uint8 save_table_status = table->m_status; + DBUG_ASSERT(!m_key_descr_arr[active_index_pos()]->is_partial_index()); std::unique_ptr<Rdb_iterator> save_iterator(new Rdb_iterator_base( ha_thd(), m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def)); std::swap(m_iterator, save_iterator); @@ -10203,6 +10210,7 @@ int ha_rocksdb::check_and_lock_sk( The bloom filter may need to be disabled for this lookup. 
*/ + DBUG_ASSERT(!m_key_descr_arr[key_id]->is_partial_index()); Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[key_id], m_pk_descr, m_tbl_def); int rc = HA_EXIT_SUCCESS; @@ -10608,6 +10616,34 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg, bytes_written = old_key_slice.size(); } + if (kd.is_partial_index()) { + // Obtain shared lock on prefix. + int size = kd.pack_record(table_arg, m_pack_buffer, row_info.new_data, + m_sk_packed_tuple, nullptr, false, 0, + kd.partial_index_keyparts()); + const rocksdb::Slice prefix_slice = + rocksdb::Slice((const char *)m_sk_packed_tuple, size); + + const rocksdb::Status s = row_info.tx->get_for_update( + kd, prefix_slice, nullptr, false /* exclusive */, + false /* do validate */); + if (!s.ok()) { + return row_info.tx->set_status_error(table_arg->in_use, s, kd, m_tbl_def, + m_table_handler); + } + + // Check if this prefix has been materialized. + Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[kd.get_keyno()], + m_pk_descr, m_tbl_def); + rc = iter.seek(HA_READ_KEY_EXACT, prefix_slice, false, prefix_slice, + true /* read current */); + + // We can skip updating the index, if the prefix is not materialized. 
+ if (rc == HA_ERR_END_OF_FILE || rc == HA_ERR_KEY_NOT_FOUND) { + return 0; + } + } + new_key_slice = rocksdb::Slice( reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size); new_value_slice = @@ -10897,8 +10933,16 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) { DBUG_ASSERT(tx != nullptr); active_index = idx; - m_iterator.reset(new Rdb_iterator_base( - thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def)); + if (idx != table->s->primary_key && + m_key_descr_arr[idx]->is_partial_index()) { + m_iterator.reset( + new Rdb_iterator_partial(thd, active_index, + m_key_descr_arr[active_index_pos()], + m_pk_descr, m_tbl_def, table)); + } else { + m_iterator.reset(new Rdb_iterator_base( + thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def)); + } // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update // when accessing the index, so don't acquire the snapshot right away. @@ -13379,6 +13423,9 @@ int ha_rocksdb::inplace_populate_sk( THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms); for (const auto &index : indexes) { + // Skip populating partial indexes for now. 
+ if (index->is_partial_index()) continue; + bool is_unique_index = new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME; @@ -14210,6 +14257,17 @@ static SHOW_VAR rocksdb_status_vars[] = { &rocksdb_select_bypass_rejected, SHOW_LONGLONG), DEF_STATUS_VAR_PTR("select_bypass_failed", &rocksdb_select_bypass_failed, SHOW_LONGLONG), + + DEF_STATUS_VAR_PTR("partial_index_groups_sorted", + &rocksdb_partial_index_groups_sorted, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("partial_index_groups_materialized", + &rocksdb_partial_index_groups_materialized, + SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("partial_index_rows_sorted", + &rocksdb_partial_index_rows_sorted, SHOW_LONGLONG), + DEF_STATUS_VAR_PTR("partial_index_rows_materialized", + &rocksdb_partial_index_rows_materialized, SHOW_LONGLONG), + // the variables generated by SHOW_FUNC are sorted only by prefix (first // arg in the tuple below), so make sure it is unique to make sorting // deterministic as quick sort is not stable @@ -15844,6 +15902,11 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx, return s; } +void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd, + const rocksdb::Slice &key) { + tx->release_lock(kd, std::string(key.data(), key.size())); +} + void rdb_tx_multi_get(Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 369af0bc9c4..d0baeefe942 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -1166,6 +1166,9 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx, rocksdb::PinnableSlice *const value, bool exclusive); +void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd, + const rocksdb::Slice &key); + void rdb_tx_multi_get(Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, @@ -1218,4 +1221,9 @@ extern std::atomic<uint64_t> 
rocksdb_select_bypass_executed;
 extern std::atomic<uint64_t> rocksdb_select_bypass_rejected;
 extern std::atomic<uint64_t> rocksdb_select_bypass_failed;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
+
 }  // namespace myrocks
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 2c5828a6b8a..1b12c33d8d7 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -586,6 +586,14 @@ class Rdb_key_def {
   uint extract_partial_index_info(const TABLE *const table_arg,
                                   const Rdb_tbl_def *const tbl_def_arg);
   inline bool is_partial_index() const { return m_partial_index_threshold > 0; }
+  // Group size above which a group of this partial index gets materialized.
+  inline uint partial_index_threshold() const {
+    return m_partial_index_threshold;
+  }
+  // Number of leading key parts that define a group of this partial index.
+  inline uint partial_index_keyparts() const {
+    return m_partial_index_keyparts;
+  }

   static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
   static uint32 calculate_index_flag_offset(uint32 index_flags,
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 529cd6dacae..60dd0c4c6ab 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -16,7 +16,10 @@

 #include "./rdb_iterator.h"

+/* MySQL includes */
 #include "scope_guard.h"
+#include "sql/sql_class.h"
+#include "sql/thr_malloc.h"

 namespace myrocks {

@@ -356,4 +359,672 @@ int Rdb_iterator_base::get(const rocksdb::Slice *key,
   return rc;
 }

+/*
+ * Builds an iterator over a partial secondary index.
+ *
+ * Besides the base-class state this sets up a second iterator over the
+ * primary key, the converter used to decode primary key rows, and the
+ * scratch buffers used to re-pack those rows in secondary key format.
+ */
+Rdb_iterator_partial::Rdb_iterator_partial(
+    THD *thd, uint active_index,
+    const std::shared_ptr<Rdb_key_def> kd,
+    const std::shared_ptr<Rdb_key_def> pkd, const Rdb_tbl_def *tbl_def,
+    TABLE *table)
+    : Rdb_iterator_base(thd, kd, pkd, tbl_def),
+      m_table(table),
+      m_iterator_pk(thd, pkd, pkd, tbl_def),
+      m_converter(thd, tbl_def, table),
+      m_valid(false),
+      m_materialized(false),
+      m_threshold(kd->partial_index_threshold()),
+      m_prefix_keyparts(kd->partial_index_keyparts()),
+      m_cur_prefix_key_len(0),
+      m_records(slice_comparator(m_kd->get_cf()->GetComparator())),
+      m_records_it(m_records.end()) {
+  init_sql_alloc(PSI_NOT_INSTRUMENTED, &m_mem_root, 4024, 0);
+  m_converter.setup_field_decoders(table->read_set, active_index,
+                                   true /* keyread_only */,
+                                   true /* decode all */);
+  // Large enough for a packed key of either index.
+  const uint packed_len =
+      std::max(m_kd->max_storage_fmt_length(), m_pkd->max_storage_fmt_length());
+  m_cur_prefix_key = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+  m_record_buf = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, table->s->reclength, MYF(0)));
+  m_pack_buffer = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+  m_sk_packed_tuple = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+}
+
+// Drops the cached rows and releases the scratch buffers.
+Rdb_iterator_partial::~Rdb_iterator_partial() {
+  reset();
+  my_free(m_cur_prefix_key);
+  m_cur_prefix_key = nullptr;
+  my_free(m_record_buf);
+  m_record_buf = nullptr;
+  my_free(m_pack_buffer);
+  m_pack_buffer = nullptr;
+  my_free(m_sk_packed_tuple);
+  m_sk_packed_tuple = nullptr;
+}
+
+/*
+ * Measures how much of start_key belongs to the partial index prefix.
+ *
+ * On success *prefix_cnt is the number of prefix key parts found in start_key
+ * (at most m_prefix_keyparts) and *prefix_len is the number of bytes they
+ * occupy, index id included. Returns HA_ERR_INTERNAL_ERROR if start_key
+ * cannot be parsed.
+ */
+int Rdb_iterator_partial::get_prefix_len(const rocksdb::Slice &start_key,
+                                         uint *prefix_cnt, uint *prefix_len) {
+  Rdb_string_reader reader(&start_key);
+  if (!reader.read(Rdb_key_def::INDEX_ID_SIZE)) {
+    return HA_ERR_INTERNAL_ERROR;
+  }
+
+  for (uint i = 0; i < m_prefix_keyparts; i++) {
+    if (reader.remaining_bytes() == 0) {
+      *prefix_cnt = i;
+      *prefix_len = reader.get_current_ptr() - start_key.data();
+      return HA_EXIT_SUCCESS;
+    }
+
+    if (m_kd->read_memcmp_key_part(m_table, &reader, i) > 0) {
+      return HA_ERR_INTERNAL_ERROR;
+    }
+  }
+
+  *prefix_cnt = m_prefix_keyparts;
+  *prefix_len = reader.get_current_ptr() - start_key.data();
+
+  return HA_EXIT_SUCCESS;
+}
+
+/*
+ * Determines the correct prefix from start_key by reading from primary key if
+ * needed.
+ *
+ * Populates m_cur_prefix_key/m_cur_prefix_key_len.
+ */
+int Rdb_iterator_partial::get_prefix_from_start(
+    enum ha_rkey_function find_flag, const rocksdb::Slice &start_key) {
+  int rc = 0;
+  uint prefix_cnt = 0;
+  uint prefix_len = 0;
+
+  rc = get_prefix_len(start_key, &prefix_cnt, &prefix_len);
+  if (rc) {
+    return rc;
+  }
+  DBUG_ASSERT_IMP(prefix_cnt == 0, prefix_len == Rdb_key_def::INDEX_ID_SIZE);
+
+  // There are 2 scenarios where a read is required to determine the prefix:
+  // 1. There are not enough keyparts in the start_key.
+  // 2. An exclusive seek key is provided, meaning that we need to read the next
+  // prefix.
+  if (prefix_cnt < m_prefix_keyparts ||
+      (prefix_len == start_key.size() &&
+       (find_flag == HA_READ_AFTER_KEY || find_flag == HA_READ_BEFORE_KEY))) {
+    uint tmp;
+
+    rocksdb::Slice empty_end_key;
+
+    // Since the PK/SK share the same prefix, the primary key can be constructed
+    // using the secondary key, with the index_id overwritten.
+    // memmove, not memcpy: get_next_prefix() passes a start_key that points at
+    // m_cur_prefix_key itself, and memcpy is undefined for overlapping ranges.
+    memmove(m_cur_prefix_key, start_key.data(), prefix_len);
+    rocksdb::Slice seek_key((const char *)m_cur_prefix_key, prefix_len);
+    m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+    rc = m_iterator_pk.seek(find_flag, seek_key, false, empty_end_key);
+    if (rc) {
+      return rc;
+    }
+
+    rc = get_prefix_len(m_iterator_pk.key(), &prefix_cnt, &prefix_len);
+    if (rc) {
+      return rc;
+    }
+    memcpy(m_cur_prefix_key, m_iterator_pk.key().data(), prefix_len);
+  } else {
+    memmove(m_cur_prefix_key, start_key.data(), prefix_len);
+  }
+
+  m_cur_prefix_key_len = prefix_len;
+  return HA_EXIT_SUCCESS;
+}
+
+/*
+ * Replaces the current prefix with the one that follows it (direction == true)
+ * or precedes it (direction == false), as found on the primary key.
+ *
+ * Returns HA_ERR_END_OF_FILE once the new prefix no longer matches
+ * m_prefix_tuple.
+ */
+int Rdb_iterator_partial::get_next_prefix(bool direction) {
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  uint tmp;
+
+  int rc = get_prefix_from_start(
+      direction ? HA_READ_AFTER_KEY : HA_READ_BEFORE_KEY, cur_prefix_key);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  cur_prefix_key =
+      rocksdb::Slice((const char *)m_cur_prefix_key, m_cur_prefix_key_len);
+  if (!rc && !m_kd->value_matches_prefix(cur_prefix_key, m_prefix_tuple)) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+  return rc;
+}
+
+/*
+ * Moves to the neighbouring prefix in the given direction and positions the
+ * iterator on its first row in scan order: on the secondary key if the prefix
+ * has rows there, otherwise on the in-memory copy read from the primary key.
+ */
+int Rdb_iterator_partial::seek_next_prefix(bool direction) {
+  rocksdb::Slice empty_end_key;
+  uint tmp;
+
+  // Fetch next prefix using PK.
+  int rc = get_next_prefix(direction);
+  if (rc) return rc;
+
+  // Rdb_iterator_base::seek below will overwrite m_prefix_tuple, so we save a
+  // copy here.
+  size_t prefix_buf_len = m_prefix_tuple.size();
+  uchar *prefix_buf_copy = (uchar *)my_alloca(prefix_buf_len);
+  memcpy(prefix_buf_copy, m_prefix_buf, prefix_buf_len);
+
+  // First try reading from SK in the current prefix.
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  rc = Rdb_iterator_base::seek(
+      direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key,
+      false, empty_end_key);
+
+  // Restore m_prefix_tuple
+  memcpy(m_prefix_buf, prefix_buf_copy, prefix_buf_len);
+  m_prefix_tuple = rocksdb::Slice((char *)m_prefix_buf, prefix_buf_len);
+
+  if (rc == HA_ERR_END_OF_FILE) {
+    // Nothing in SK, so check PK.
+    rc = read_prefix_from_pk();
+
+    if (rc == 0) {
+      // Not materialized on disk, seek to beginning/end of map.
+      m_materialized = false;
+      if (direction ^ m_kd->m_is_reverse_cf) {
+        m_records_it = m_records.begin();
+      } else {
+        m_records_it = m_records.end();
+        --m_records_it;
+      }
+    } else {
+      // The current prefix was determined by reading from PK in
+      // get_next_prefix, so rows must exist within this prefix on the PK.
+      DBUG_ASSERT(rc != HA_ERR_END_OF_FILE);
+    }
+  } else if (rc == 0) {
+    // Found rows in SK, so use them
+    m_materialized = true;
+  }
+
+  return rc;
+}
+
+/*
+ * Writes the rows of the current prefix to the secondary key.
+ *
+ * Takes a write lock on the prefix key, re-checks under the lock that the
+ * secondary key still has nothing for this prefix, then copies every primary
+ * key row of the prefix into one write batch in secondary key format. Once
+ * taken, the lock is released on every return path.
+ */
+int Rdb_iterator_partial::materialize_prefix() {
+  uint tmp;
+  Rdb_transaction *const tx = get_tx_from_thd(m_thd);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+
+  auto s =
+      rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, RDB_LOCK_WRITE);
+  if (!s.ok()) {
+    return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+  }
+
+  // It is possible that someone else has already materialized this group
+  // before we locked. Double check if the prefix is still empty.
+  Rdb_iterator_base iter(m_thd, m_kd, m_pkd, m_tbl_def);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+  int rc = iter.seek(HA_READ_KEY_EXACT, cur_prefix_key, false, cur_prefix_key,
+                     true /* read current */);
+  if (rc != HA_ERR_END_OF_FILE) {
+    rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+    return rc;
+  }
+
+  rocksdb::WriteOptions options;
+  options.sync = false;
+  rocksdb::TransactionDBWriteOptimizations optimize;
+  optimize.skip_concurrency_control = true;
+
+  auto wb = std::unique_ptr<rocksdb::WriteBatch>(new rocksdb::WriteBatch);
+  m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+  rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+                          cur_prefix_key, true /* read current */);
+  size_t num_rows = 0;
+
+  while (!rc) {
+    if (thd_killed(m_thd)) {
+      rc = HA_ERR_QUERY_INTERRUPTED;
+      goto exit;
+    }
+
+    const rocksdb::Slice &rkey = m_iterator_pk.key();
+    const rocksdb::Slice &rval = m_iterator_pk.value();
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+    if (rc) {
+      goto exit;
+    }
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // WriteBatch::Put copies both slices into the batch, so the shared pack
+    // buffers can be passed directly and reused for the next row.
+    s = wb->Put(
+        m_kd->get_cf(),
+        rocksdb::Slice((const char *)m_sk_packed_tuple, sk_packed_size),
+        rocksdb::Slice((const char *)m_sk_tails.ptr(),
+                       m_sk_tails.get_current_pos()));
+    if (!s.ok()) {
+      rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+      goto exit;
+    }
+
+    num_rows++;
+    rc = m_iterator_pk.next();
+  }
+
+  if (rc != HA_ERR_END_OF_FILE) goto exit;
+  rc = HA_EXIT_SUCCESS;
+
+  s = rdb_get_rocksdb_db()->Write(options, optimize, wb.get());
+  if (!s.ok()) {
+    rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+    goto exit;
+  }
+
+  rocksdb_partial_index_groups_materialized++;
+  rocksdb_partial_index_rows_materialized += num_rows;
+
+exit:
+  rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+  return rc;
+}
+
+/*
+ * Loads every primary key row of the current prefix into m_records, re-packed
+ * in secondary key format and ordered by the secondary key's comparator.
+ *
+ * A group with more than m_threshold rows is also materialized on the
+ * secondary key. Returns HA_ERR_END_OF_FILE if the prefix has no rows.
+ */
+int Rdb_iterator_partial::read_prefix_from_pk() {
+  uint tmp;
+  int rc = 0;
+  size_t num_rows = 0;
+
+  free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+  m_records.clear();
+
+  const char *old_proc_info = m_thd->get_proc_info();
+  thd_proc_info(m_thd, "Materializing group in partial index");
+
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  // Since rocksdb does not support reverse prefix seeks, we always seek in the
+  // forwards direction (even if the PK is in a reverse cf).
+  rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+                          cur_prefix_key);
+
+  while (!rc) {
+    if (thd_killed(m_thd)) {
+      rc = HA_ERR_QUERY_INTERRUPTED;
+      goto exit;
+    }
+
+    const rocksdb::Slice &rkey = m_iterator_pk.key();
+    const rocksdb::Slice &rval = m_iterator_pk.value();
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+    if (rc) goto exit;
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    const char *key = (const char *)memdup_root(&m_mem_root, m_sk_packed_tuple,
+                                                sk_packed_size);
+    const char *val = (const char *)memdup_root(&m_mem_root, m_sk_tails.ptr(),
+                                                m_sk_tails.get_current_pos());
+
+    m_records.emplace(rocksdb::Slice(key, sk_packed_size),
+                      rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+
+    num_rows++;
+    rc = m_iterator_pk.next();
+  }
+
+  if (rc != HA_ERR_END_OF_FILE) goto exit;
+  rc = HA_EXIT_SUCCESS;
+
+  rocksdb_partial_index_groups_sorted++;
+  rocksdb_partial_index_rows_sorted += num_rows;
+
+  if (num_rows > m_threshold) {
+    rc = materialize_prefix();
+  } else if (num_rows == 0) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+exit:
+  thd_proc_info(m_thd, old_proc_info);
+  return rc;
+}
+
+/*
+ * Positions the iterator according to find_flag and start_key.
+ *
+ * The secondary key is tried first. If it has nothing inside the prefix of
+ * start_key, the prefix is read from the primary key and the seek is emulated
+ * on the in-memory copy; if no row there qualifies, the search continues in
+ * the neighbouring prefix. read_current is not supported.
+ */
+int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
+                               const rocksdb::Slice start_key,
+                               bool full_key_match,
+                               const rocksdb::Slice end_key,
+                               bool read_current) {
+  int rc = 0;
+  uint tmp;
+
+  DBUG_ASSERT(!read_current);
+  reset();
+
+  bool direction = (find_flag == HA_READ_KEY_EXACT) ||
+                   (find_flag == HA_READ_AFTER_KEY) ||
+                   (find_flag == HA_READ_KEY_OR_NEXT);
+
+  // Get current prefix.
+  if ((rc = get_prefix_from_start(find_flag, start_key)) != 0) {
+    return rc;
+  }
+
+  // First try reading from SK in the current prefix.
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  rc = Rdb_iterator_base::seek(find_flag, start_key, full_key_match, end_key,
+                               read_current);
+
+  // Check if we're still in our current prefix. If not, we may have missed
+  // some unmaterialized keys, so we have to check PK.
+  if (rc == 0 &&
+      !m_kd->value_matches_prefix(Rdb_iterator_base::key(), cur_prefix_key)) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+  bool next_prefix = false;
+
+  if (rc == HA_ERR_END_OF_FILE) {
+    // Nothing in SK, so check PK.
+    rc = read_prefix_from_pk();
+
+    if (rc == HA_ERR_END_OF_FILE) {
+      // Nothing in PK, so move to next prefix.
+      next_prefix = true;
+    } else if (rc == 0) {
+      // Not materialized on disk.
+      m_materialized = false;
+
+      // Seek to correct spot.
+      uchar *start_key_buf = (uchar *)start_key.data();
+
+      // Similar to Rdb_iterator_base::seek, convert start_key into a rocksdb
+      // key that we will actually seek to.
+      auto start_key_guard =
+          create_scope_guard([this, start_key_buf, start_key] {
+            this->m_kd->predecessor(start_key_buf, start_key.size());
+          });
+      if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
+          find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
+        m_kd->successor(start_key_buf, start_key.size());
+      } else {
+        start_key_guard.commit();
+      }
+
+      if (direction) {
+        if (m_kd->m_is_reverse_cf) {
+          // Emulate "SeekForPrev" behaviour.
+          m_records_it = m_records.upper_bound(start_key);
+          if (m_records_it == m_records.begin()) {
+            next_prefix = true;
+          } else {
+            --m_records_it;
+          }
+        } else {
+          m_records_it = m_records.lower_bound(start_key);
+          if (m_records_it == m_records.end()) {
+            next_prefix = true;
+          }
+        }
+      } else {
+        if (m_kd->m_is_reverse_cf) {
+          m_records_it = m_records.upper_bound(start_key);
+          if (m_records_it == m_records.end()) {
+            next_prefix = true;
+          }
+        } else {
+          // Emulate "SeekForPrev" behaviour.
+          m_records_it = m_records.lower_bound(start_key);
+          if (m_records_it == m_records.begin()) {
+            next_prefix = true;
+          } else {
+            --m_records_it;
+          }
+        }
+      }
+    }
+  } else if (rc == 0) {
+    // Found rows in SK, so use them.
+    m_materialized = true;
+  }
+
+  if (next_prefix) {
+    rc = seek_next_prefix(direction);
+  }
+
+  if (!rc) {
+    if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+      rc = HA_ERR_END_OF_FILE;
+    } else {
+      m_valid = true;
+    }
+  }
+
+  return rc;
+}
+
+/*
+ * Point lookup of a secondary key entry.
+ *
+ * If the entry is not found on the secondary key, the primary key is derived
+ * from key, the row is read from the primary key, and the secondary key value
+ * is rebuilt from it and returned in value. A lookup that reaches the end of
+ * this function leaves the iterator unpositioned (m_valid == false).
+ */
+int Rdb_iterator_partial::get(const rocksdb::Slice *key,
+                              rocksdb::PinnableSlice *value, Rdb_lock_type type,
+                              bool skip_ttl_check) {
+  int rc = Rdb_iterator_base::get(key, value, type, skip_ttl_check);
+
+  if (rc == HA_ERR_KEY_NOT_FOUND) {
+    const uint size =
+        m_kd->get_primary_key_tuple(m_table, *m_pkd, key, m_sk_packed_tuple);
+    if (size == RDB_INVALID_KEY_LEN) {
+      return HA_ERR_ROCKSDB_CORRUPT_DATA;
+    }
+
+    rocksdb::Slice pk_key((const char *)m_sk_packed_tuple, size);
+
+    rc = m_iterator_pk.get(&pk_key, value, type, skip_ttl_check);
+    if (rc) return rc;
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &pk_key, value);
+    if (rc) return rc;
+
+    // Repack into SK format; only the value part (m_sk_tails) is needed since
+    // the caller already holds the key.
+    m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // Return what the value of the SK entry would be if it were materialized,
+    // i.e. the same bytes materialize_prefix() stores as the value.
+    value->PinSelf(rocksdb::Slice((const char *)m_sk_tails.ptr(),
+                                  m_sk_tails.get_current_pos()));
+    rc = 0;
+  }
+
+  m_valid = false;
+  return rc;
+}
+
+/*
+ * Steps one row forwards (direction == true) or backwards inside the current
+ * prefix group. Returns HA_ERR_END_OF_FILE when the group has no further row
+ * in that direction.
+ */
+int Rdb_iterator_partial::next_with_direction_in_group(bool direction) {
+  uint tmp;
+  int rc = HA_EXIT_SUCCESS;
+  if (m_materialized) {
+    rc = direction ? Rdb_iterator_base::next() : Rdb_iterator_base::prev();
+
+    if (rc == HA_EXIT_SUCCESS) {
+      rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                    m_cur_prefix_key_len);
+      m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+      if (!m_kd->value_matches_prefix(Rdb_iterator_base::key(),
+                                      cur_prefix_key)) {
+        return HA_ERR_END_OF_FILE;
+      }
+    }
+  } else {
+    if (direction ^ m_kd->m_is_reverse_cf) {
+      ++m_records_it;
+      if (m_records_it == m_records.end()) return HA_ERR_END_OF_FILE;
+    } else {
+      if (m_records_it == m_records.begin()) return HA_ERR_END_OF_FILE;
+      --m_records_it;
+    }
+  }
+
+  return rc;
+}
+
+/*
+ * Steps one row in the given direction. When the current group is exhausted
+ * and the search prefix (m_prefix_tuple) is shorter than the group prefix, the
+ * scan continues in the neighbouring group. Requires a positioned iterator.
+ */
+int Rdb_iterator_partial::next_with_direction(bool direction) {
+  if (!m_valid) return HA_ERR_INTERNAL_ERROR;
+
+  int rc = next_with_direction_in_group(direction);
+
+  if (!rc) {
+    // On success, check if key is still within prefix.
+    if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+      rc = HA_ERR_END_OF_FILE;
+    }
+  } else if (rc == HA_ERR_END_OF_FILE) {
+    uint tmp;
+    rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                  m_cur_prefix_key_len);
+    m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+    if (m_prefix_tuple.size() >= cur_prefix_key.size()) {
+      DBUG_ASSERT(memcmp(m_prefix_tuple.data(), cur_prefix_key.data(),
+                         cur_prefix_key.size()) == 0);
+      return HA_ERR_END_OF_FILE;
+    }
+
+    rc = seek_next_prefix(direction);
+  }
+
+  return rc;
+}
+
+// Moves to the following row; the iterator becomes invalid at end of range.
+int Rdb_iterator_partial::next() {
+  int rc = next_with_direction(true);
+  if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+  return rc;
+}
+
+// Moves to the preceding row; the iterator becomes invalid at end of range.
+int Rdb_iterator_partial::prev() {
+  int rc = next_with_direction(false);
+  if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+  return rc;
+}
+
+// Returns the iterator to its unpositioned state and frees the cached rows.
+void Rdb_iterator_partial::reset() {
+  m_valid = false;
+  m_materialized = false;
+  free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+  m_records.clear();
+  m_iterator_pk.reset();
+  Rdb_iterator_base::reset();
+}
+
+// Current key: from the SK iterator if materialized, else from m_records.
+rocksdb::Slice Rdb_iterator_partial::key() {
+  return m_materialized ? Rdb_iterator_base::key() : m_records_it->first;
+}
+
+// Current value: from the SK iterator if materialized, else from m_records.
+rocksdb::Slice Rdb_iterator_partial::value() {
+  return m_materialized ? Rdb_iterator_base::value() : m_records_it->second;
+}
+
 }  // namespace myrocks
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 2a0f5bd5760..164d32c4099 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -75,7 +75,7 @@ class Rdb_iterator_base : public Rdb_iterator {

   int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
            bool full_key_match, const rocksdb::Slice end_key,
-           bool read_current) override;
+           bool read_current = false) override;

   int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
           Rdb_lock_type type, bool skip_ttl_check = false) override;
@@ -118,4 +118,96 @@ class Rdb_iterator_base : public Rdb_iterator {
   rocksdb::Slice m_prefix_tuple;
 };

+/*
+ * Iterator over a partial secondary index.
+ *
+ * Rows are grouped by their first m_prefix_keyparts key parts. A group is
+ * either present on the secondary key (materialized) or absent from it. For
+ * an absent group the rows are read from the primary key, re-packed in
+ * secondary key format and sorted in memory (m_records). Reading a group with
+ * more than m_threshold rows also materializes it on the secondary key.
+ */
+class Rdb_iterator_partial : public Rdb_iterator_base {
+ private:
+  TABLE *m_table;
+  // Owns the key/value bytes that the slices in m_records point to.
+  MEM_ROOT m_mem_root;
+
+  // Reads the primary key for groups that are not materialized.
+  Rdb_iterator_base m_iterator_pk;
+  Rdb_converter m_converter;
+
+  // True while the iterator is positioned on a row.
+  bool m_valid;
+  // True if the current group is read from the secondary key, false if it is
+  // read from m_records.
+  bool m_materialized;
+
+  const uint m_threshold;
+  const uint m_prefix_keyparts;
+
+  // Packed prefix identifying the current group, and its length in bytes.
+  uchar *m_cur_prefix_key;
+  uint m_cur_prefix_key_len;
+
+  // Scratch buffers for decoding a primary key row and re-packing it.
+  uchar *m_record_buf;
+  uchar *m_pack_buffer;
+  uchar *m_sk_packed_tuple;
+
+  Rdb_string_writer m_sk_tails;
+
+  int get_prefix_len(const rocksdb::Slice &start_key, uint *prefix_cnt,
+                     uint *prefix_len);
+  int get_prefix_from_start(enum ha_rkey_function find_flag,
+                            const rocksdb::Slice &start_key);
+  int get_next_prefix(bool direction);
+  int seek_next_prefix(bool direction);
+  int materialize_prefix();
+  int read_prefix_from_pk();
+  int next_with_direction_in_group(bool direction);
+  int next_with_direction(bool direction);
+
+  // Adapts a rocksdb::Comparator to the less-than predicate std::map expects.
+  struct slice_comparator {
+    explicit slice_comparator(const rocksdb::Comparator *c) : m_comparator(c) {}
+    const rocksdb::Comparator *const m_comparator;
+
+    // const: std::map calls its comparator through a const reference.
+    bool operator()(const rocksdb::Slice &lhs,
+                    const rocksdb::Slice &rhs) const {
+      return m_comparator->Compare(lhs, rhs) < 0;
+    }
+  };
+
+  // Rows of the current non-materialized group in secondary key format, and
+  // the iterator's position among them.
+  std::map<const rocksdb::Slice, const rocksdb::Slice, slice_comparator>
+      m_records;
+  std::map<const rocksdb::Slice, const rocksdb::Slice,
+           slice_comparator>::iterator m_records_it;
+
+ public:
+  Rdb_iterator_partial(THD *thd, uint active_index,
+                       const std::shared_ptr<Rdb_key_def> kd,
+                       const std::shared_ptr<Rdb_key_def> pkd,
+                       const Rdb_tbl_def *tbl_def, TABLE *table);
+  ~Rdb_iterator_partial() override;
+
+  // The raw buffers are owned by this object, so copying is not allowed.
+  Rdb_iterator_partial(const Rdb_iterator_partial &) = delete;
+  Rdb_iterator_partial &operator=(const Rdb_iterator_partial &) = delete;
+
+  int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
+           bool full_key_match, const rocksdb::Slice end_key,
+           bool read_current = false) override;
+  int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
+          Rdb_lock_type type, bool skip_ttl_check = false) override;
+  int next() override;
+  int prev() override;
+  rocksdb::Slice key() override;
+  rocksdb::Slice value() override;
+  void reset() override;
+};
+
 }  // namespace myrocks
participants (1)
-
psergey