revision-id: d1d0c156c629689b013de067b6fa01e4009484d5 (percona-202102-54-gd1d0c156c62)
parent(s): aaebd623e98e59db3efe1b231307e4142240c485
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 18:00:42 +0300
message:
Apply patch: Add partial index iterator
Summary:
This adds the partial index iterator. It is a special iterator that sorts groups from the primary key on the fly as needed, and if it exceeds a certain threshold, it will materialize the rows on the secondary key as well.
For point queries, if the row is not found on the secondary key read, we simply extract the primary key parts and read from the primary key. This means that point queries do not trigger materialization.
Test Plan: mtr
Reviewers: luqun, herman, yzha, #mysql_eng
Subscribers: phabricatorlinter, pgl
Differential Revision: https://phabricator.intern.facebook.com/D25933178
---
mysql-test/include/diff_queries.inc | 14 +
mysql-test/suite/rocksdb/r/partial_index.result | 146 +++++
.../suite/rocksdb/r/partial_index_assoc.result | 171 ++++++
.../suite/rocksdb/r/partial_index_stress.result | 74 +++
mysql-test/suite/rocksdb/r/rocksdb.result | 12 +
mysql-test/suite/rocksdb/t/partial_index.inc | 156 ++++++
mysql-test/suite/rocksdb/t/partial_index.test | 19 +
.../suite/rocksdb/t/partial_index_assoc-master.opt | 3 +
mysql-test/suite/rocksdb/t/partial_index_assoc.inc | 95 ++++
.../suite/rocksdb/t/partial_index_assoc.test | 58 ++
.../rocksdb/t/partial_index_stress-master.opt | 3 +
mysql-test/suite/rocksdb/t/partial_index_stress.py | 114 ++++
.../suite/rocksdb/t/partial_index_stress.test | 64 +++
storage/rocksdb/ha_rocksdb.cc | 77 ++-
storage/rocksdb/ha_rocksdb.h | 8 +
storage/rocksdb/rdb_datadic.h | 6 +
storage/rocksdb/rdb_iterator.cc | 591 +++++++++++++++++++++
storage/rocksdb/rdb_iterator.h | 69 ++-
18 files changed, 1672 insertions(+), 8 deletions(-)
diff --git a/mysql-test/include/diff_queries.inc b/mysql-test/include/diff_queries.inc
new file mode 100644
index 00000000000..beb75093759
--- /dev/null
+++ b/mysql-test/include/diff_queries.inc
@@ -0,0 +1,14 @@
+--disable_query_log
+
+--output $MYSQL_TMP_DIR/A
+--eval $query1
+
+--output $MYSQL_TMP_DIR/B
+--eval $query2
+
+--enable_query_log
+
+--diff_files $MYSQL_TMP_DIR/A $MYSQL_TMP_DIR/B
+
+--remove_file $MYSQL_TMP_DIR/A
+--remove_file $MYSQL_TMP_DIR/B
diff --git a/mysql-test/suite/rocksdb/r/partial_index.result b/mysql-test/suite/rocksdb/r/partial_index.result
new file mode 100644
index 00000000000..a752d37180c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index.result
@@ -0,0 +1,146 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_assoc.result b/mysql-test/suite/rocksdb/r/partial_index_assoc.result
new file mode 100644
index 00000000000..fbb89d16b35
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_assoc.result
@@ -0,0 +1,171 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(4) NOT NULL DEFAULT '0',
+`data` text COLLATE latin1_bin NOT NULL,
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_stress.result b/mysql-test/suite/rocksdb/r/partial_index_stress.result
new file mode 100644
index 00000000000..88f77bcc63f
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_stress.result
@@ -0,0 +1,74 @@
+set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set global rocksdb_lock_wait_timeout = 100000;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(4) NOT NULL DEFAULT '0',
+`data` text COLLATE latin1_bin NOT NULL,
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table;
+set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index 61dd8184ddf..5c440c88317 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -1656,6 +1656,10 @@ rocksdb_number_sst_entry_singledelete #
rocksdb_number_superversion_acquires #
rocksdb_number_superversion_cleanups #
rocksdb_number_superversion_releases #
+rocksdb_partial_index_groups_materialized #
+rocksdb_partial_index_groups_sorted #
+rocksdb_partial_index_rows_materialized #
+rocksdb_partial_index_rows_sorted #
rocksdb_row_lock_deadlocks #
rocksdb_row_lock_wait_timeouts #
rocksdb_select_bypass_executed #
@@ -1760,6 +1764,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
ROCKSDB_NUMBER_SUPERVERSION_RELEASES
ROCKSDB_NUM_ITERATORS
+ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED
+ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_ROWS_SORTED
ROCKSDB_QUERIES_POINT
ROCKSDB_QUERIES_RANGE
ROCKSDB_ROWS_DELETED
@@ -1883,6 +1891,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
ROCKSDB_NUMBER_SUPERVERSION_RELEASES
ROCKSDB_NUM_ITERATORS
+ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED
+ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_ROWS_SORTED
ROCKSDB_QUERIES_POINT
ROCKSDB_QUERIES_RANGE
ROCKSDB_ROWS_DELETED
diff --git a/mysql-test/suite/rocksdb/t/partial_index.inc b/mysql-test/suite/rocksdb/t/partial_index.inc
new file mode 100644
index 00000000000..7137a696d6a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index.inc
@@ -0,0 +1,156 @@
+eval
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+ PRIMARY KEY (i, j),
+ KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5',
+ KEY ik2 (i, k) COMMENT 'cfname=$cfname'
+) ENGINE=ROCKSDB;
+
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+
+INSERT INTO t values ("11111112", "1", "1", "1");
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "1" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "1" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111112" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111112" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "1%" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "1%" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+DROP TABLE t;
+
+eval
+CREATE TABLE t (i int, j int, k int, l int,
+ PRIMARY KEY (i, j),
+ KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5',
+ KEY ik2 (i, k) COMMENT 'cfname=$cfname'
+) ENGINE=ROCKSDB;
+
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+
+INSERT INTO t values (4, 1, 1, 1);
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 1 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 1 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 4 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 4 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+DROP TABLE t;
diff --git a/mysql-test/suite/rocksdb/t/partial_index.test b/mysql-test/suite/rocksdb/t/partial_index.test
new file mode 100644
index 00000000000..410c772765e
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index.test
@@ -0,0 +1,19 @@
+set optimizer_force_index_for_range = on;
+
+--let $asc=ASC
+--let $cfname=cf
+--source partial_index.inc
+
+--let $asc=ASC
+--let $cfname=rev:cf
+--source partial_index.inc
+
+--let $asc=DESC
+--let $cfname=cf
+--source partial_index.inc
+
+--let $asc=DESC
+--let $cfname=rev:cf
+--source partial_index.inc
+
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt
new file mode 100644
index 00000000000..81bc90b0531
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt
@@ -0,0 +1,3 @@
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000
+--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m};
+
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.inc b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc
new file mode 100644
index 00000000000..d0508a3f40b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc
@@ -0,0 +1,95 @@
+--let $binary_id1=1
+if (`select DATA_TYPE = 'binary' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'id1'`) {
+ --let $binary_id1="1"
+}
+--let $text=`select DATA_TYPE = 'text' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'data'`
+
+# This creates 10 distinct types, with up to 9 distinct id1s per type, to give up to 90 groups.
+--disable_query_log
+let $i=0;
+while ($i < 1000)
+{
+ if ($text) {
+ eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 50)), FLOOR(RAND($i) * 100000), 789);
+ }
+ if (!$text) {
+ eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 20)), FLOOR(RAND($i) * 100000), 789);
+ }
+ inc $i;
+}
+--enable_query_log
+
+let $i=0;
+while ($i < 10) {
+ # This gives a range plan
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --source include/diff_queries.inc
+
+ # This gives a ref plan
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --source include/diff_queries.inc
+
+ inc $i;
+}
+
+let $i=0;
+while ($i < 10) {
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = 1
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = 1
+ --source include/diff_queries.inc
+
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type <= 2
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type <= 2
+ --source include/diff_queries.inc
+
+ inc $i;
+}
+
+# Rebuild the table so that nothing is materialized anymore.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+--let $query1= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type)
+--let $query2= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2)
+--source include/diff_queries.inc
+
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+--let $assert_text = Check that materialized groups are non-zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] > 0
+--source include/assert.inc
+--let $assert_text = Check that materialized rows are non-zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] > 0
+--source include/assert.inc
+DROP TABLE t1, t2;
+
+# Rerun full index scan a second time, and check that no materialization occurs
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+--disable_result_log
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+--enable_result_log
+
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+--let $assert_text = Check that materialized groups are zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] = 0
+--source include/assert.inc
+--let $assert_text = Check that materialized rows are zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] = 0
+--source include/assert.inc
+
+DROP TABLE t1, t2;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.test b/mysql-test/suite/rocksdb/t/partial_index_assoc.test
new file mode 100644
index 00000000000..a559c67f673
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.test
@@ -0,0 +1,58 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+ `raw_key` text COLLATE latin1_bin,
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(4) NOT NULL DEFAULT '0',
+ `data` text COLLATE latin1_bin NOT NULL,
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt
new file mode 100644
index 00000000000..a105847c183
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt
@@ -0,0 +1,3 @@
+--initialize --default_authentication_plugin=mysql_native_password
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000
+--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m};
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.py b/mysql-test/suite/rocksdb/t/partial_index_stress.py
new file mode 100644
index 00000000000..07220d88705
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.py
@@ -0,0 +1,114 @@
+"""
+This script stress tests partial indexes by performing writes while concurrently checking PK/SK consistency.
+
+Usage: partial_index_stress.py user host port db_name table_name
+ num_iters num_threads
+"""
+import MySQLdb
+import random
+import sys
+import threading
+import traceback
+
+def get_query(table_name, binary_id1):
+ assoc_type = random.randint(1, 2)
+ id1 = random.randint(1, 5)
+ id2 = random.randint(1, 20)
+
+ r = random.randint(1, 3)
+
+ if r == 1:
+ if binary_id1:
+ return """DELETE FROM %s WHERE id1 = "%d" and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type)
+ else:
+ return """DELETE FROM %s WHERE id1 = %d and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type)
+ else:
+ return """INSERT INTO %s VALUES (%d, 0, %d, 0, %d, 1, 'abc', 100, 1) ON DUPLICATE KEY UPDATE time=time+10, version=version+1""" % (table_name, id1, id2, assoc_type)
+
+class Worker(threading.Thread):
+ def __init__(self, con, table_name, num_iters, check, event):
+ threading.Thread.__init__(self)
+ self.con = con
+ self.table_name = table_name
+ self.num_iters = num_iters
+ self.check = check
+ self.event = event
+ self.exception = None
+ self.start()
+
+ def run(self):
+ try:
+ if self.check:
+ self.run_check()
+ else:
+ self.run_write()
+ except Exception as e:
+ self.exception = traceback.format_exc()
+
+ def run_write(self):
+ cur = self.con.cursor()
+ cur.execute("select data_type from information_schema.columns where table_schema = database() and table_name = '%s' and column_name = 'id1'" % self.table_name);
+ binary_id1 = cur.fetchone()[0] == "binary"
+ cur.execute("SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED")
+ for x in range(self.num_iters):
+ try:
+ cur.execute(get_query(self.table_name, binary_id1))
+ self.con.commit()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ raise e
+
+ def run_check(self):
+ cur = self.con.cursor()
+ while not self.event.is_set():
+ try:
+ cur.execute("SELECT COUNT(*) FROM %s FORCE INDEX(PRIMARY) UNION ALL SELECT COUNT(*) FROM %s FORCE INDEX(id1_type)" % (self.table_name, self.table_name))
+ pk_count = cur.fetchone()[0]
+ sk_count = cur.fetchone()[0]
+ assert pk_count == sk_count, "Count mismatch %d != %d" % (pk_count, sk_count)
+ self.con.commit()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ raise e
+
+if __name__ == '__main__':
+ if len(sys.argv) != 8:
+ print("Usage: partial_index_stress.py user host port db_name " \
+ "table_name num_iters num_threads")
+ sys.exit(1)
+
+ user = sys.argv[1]
+ host = sys.argv[2]
+ port = int(sys.argv[3])
+ db = sys.argv[4]
+ table_name = sys.argv[5]
+ num_iters = int(sys.argv[6])
+ num_workers = int(sys.argv[7])
+
+ done_event = threading.Event();
+
+ worker_failed = False
+ workers = []
+ for i in range(num_workers):
+ w = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ num_iters, False, None)
+ workers.append(w)
+
+ checker = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ num_iters, True, done_event)
+
+ for w in workers:
+ w.join()
+ if w.exception:
+ print("Worker hit an exception:\n%s\n" % w.exception)
+ worker_failed = True
+
+ done_event.set()
+ checker.join()
+
+ if worker_failed:
+ sys.exit(1)
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.test b/mysql-test/suite/rocksdb/t/partial_index_stress.test
new file mode 100644
index 00000000000..c78e8cb980e
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.test
@@ -0,0 +1,64 @@
+#
+# Stress partial indexes by performing writes, and checking that PK/SK are still consistent.
+#
+
+set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set global rocksdb_lock_wait_timeout = 100000;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+ `raw_key` text COLLATE latin1_bin,
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(4) NOT NULL DEFAULT '0',
+ `data` text COLLATE latin1_bin NOT NULL,
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 15c8b289a5e..f59b456675d 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -802,6 +802,11 @@ std::atomic<uint64_t> rocksdb_select_bypass_executed(0);
std::atomic<uint64_t> rocksdb_select_bypass_rejected(0);
std::atomic<uint64_t> rocksdb_select_bypass_failed(0);
+std::atomic<uint64_t> rocksdb_partial_index_groups_sorted(0);
+std::atomic<uint64_t> rocksdb_partial_index_groups_materialized(0);
+std::atomic<uint64_t> rocksdb_partial_index_rows_sorted(0);
+std::atomic<uint64_t> rocksdb_partial_index_rows_materialized(0);
+
static int rocksdb_trace_block_cache_access(
THD *const thd MY_ATTRIBUTE((__unused__)),
struct SYS_VAR *const var MY_ATTRIBUTE((__unused__)), void *const save,
@@ -3111,7 +3116,7 @@ class Rdb_transaction {
virtual void set_sync(bool sync) = 0;
virtual void release_lock(const Rdb_key_def &key_descr,
- const std::string &rowkey) = 0;
+ const std::string &rowkey, bool force = false) = 0;
virtual bool prepare() = 0;
@@ -3714,9 +3719,9 @@ class Rdb_transaction_impl : public Rdb_transaction {
m_rocksdb_tx->GetWriteOptions()->sync = sync;
}
- void release_lock(const Rdb_key_def &key_descr,
- const std::string &rowkey) override {
- if (!THDVAR(m_thd, lock_scanned_rows)) {
+ void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey,
+ bool force) override {
+ if (!THDVAR(m_thd, lock_scanned_rows) || force) {
m_rocksdb_tx->UndoGetForUpdate(key_descr.get_cf(),
rocksdb::Slice(rowkey));
// row_lock_count track row(pk)
@@ -4167,7 +4172,8 @@ class Rdb_writebatch_impl : public Rdb_transaction {
void set_sync(bool sync) override { write_opts.sync = sync; }
void release_lock(const Rdb_key_def &key_descr MY_ATTRIBUTE((unused)),
- const std::string &rowkey MY_ATTRIBUTE((unused))) override {
+ const std::string &rowkey MY_ATTRIBUTE((unused)),
+ bool force MY_ATTRIBUTE((unused))) override {
// Nothing to do here since we don't hold any row locks.
}
@@ -6922,6 +6928,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
active_index = table->s->next_number_index;
const uint8 save_table_status = table->m_status;
+ DBUG_ASSERT(!m_key_descr_arr[active_index_pos()]->is_partial_index());
std::unique_ptr<Rdb_iterator> save_iterator(new Rdb_iterator_base(
ha_thd(), m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
std::swap(m_iterator, save_iterator);
@@ -10203,6 +10210,7 @@ int ha_rocksdb::check_and_lock_sk(
The bloom filter may need to be disabled for this lookup.
*/
+ DBUG_ASSERT(!m_key_descr_arr[key_id]->is_partial_index());
Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[key_id], m_pk_descr,
m_tbl_def);
int rc = HA_EXIT_SUCCESS;
@@ -10608,6 +10616,34 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
bytes_written = old_key_slice.size();
}
+ if (kd.is_partial_index()) {
+ // Obtain shared lock on prefix.
+ int size = kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
+ m_sk_packed_tuple, nullptr, false, 0,
+ kd.partial_index_keyparts());
+ const rocksdb::Slice prefix_slice =
+ rocksdb::Slice((const char *)m_sk_packed_tuple, size);
+
+ const rocksdb::Status s = row_info.tx->get_for_update(
+ kd, prefix_slice, nullptr, false /* exclusive */,
+ false /* do validate */);
+ if (!s.ok()) {
+ return row_info.tx->set_status_error(table_arg->in_use, s, kd, m_tbl_def,
+ m_table_handler);
+ }
+
+ // Check if this prefix has been materialized.
+ Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[kd.get_keyno()],
+ m_pk_descr, m_tbl_def);
+ rc = iter.seek(HA_READ_KEY_EXACT, prefix_slice, false, prefix_slice,
+ true /* read current */);
+
+ // We can skip updating the index, if the prefix is not materialized.
+ if (rc == HA_ERR_END_OF_FILE || rc == HA_ERR_KEY_NOT_FOUND) {
+ return 0;
+ }
+ }
+
new_key_slice = rocksdb::Slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
new_value_slice =
@@ -10897,8 +10933,16 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
DBUG_ASSERT(tx != nullptr);
active_index = idx;
- m_iterator.reset(new Rdb_iterator_base(
- thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ if (idx != table->s->primary_key &&
+ m_key_descr_arr[idx]->is_partial_index()) {
+ m_iterator.reset(
+ new Rdb_iterator_partial(thd, active_index,
+ m_key_descr_arr[active_index_pos()],
+ m_pk_descr, m_tbl_def, table));
+ } else {
+ m_iterator.reset(new Rdb_iterator_base(
+ thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ }
// If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
// when accessing the index, so don't acquire the snapshot right away.
@@ -13379,6 +13423,9 @@ int ha_rocksdb::inplace_populate_sk(
THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
for (const auto &index : indexes) {
+ // Skip populating partial indexes for now.
+ if (index->is_partial_index()) continue;
+
bool is_unique_index =
new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
@@ -14210,6 +14257,17 @@ static SHOW_VAR rocksdb_status_vars[] = {
&rocksdb_select_bypass_rejected, SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("select_bypass_failed", &rocksdb_select_bypass_failed,
SHOW_LONGLONG),
+
+ DEF_STATUS_VAR_PTR("partial_index_groups_sorted",
+ &rocksdb_partial_index_groups_sorted, SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_groups_materialized",
+ &rocksdb_partial_index_groups_materialized,
+ SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_rows_sorted",
+ &rocksdb_partial_index_rows_sorted, SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_rows_materialized",
+ &rocksdb_partial_index_rows_materialized, SHOW_LONGLONG),
+
// the variables generated by SHOW_FUNC are sorted only by prefix (first
// arg in the tuple below), so make sure it is unique to make sorting
// deterministic as quick sort is not stable
@@ -15844,6 +15902,11 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
return s;
}
+void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
+ const rocksdb::Slice &key) {
+ tx->release_lock(kd, std::string(key.data(), key.size()));
+}
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 369af0bc9c4..d0baeefe942 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -1166,6 +1166,9 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
rocksdb::PinnableSlice *const value,
bool exclusive);
+void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
+ const rocksdb::Slice &key);
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -1218,4 +1221,9 @@ extern std::atomic<uint64_t> rocksdb_select_bypass_executed;
extern std::atomic<uint64_t> rocksdb_select_bypass_rejected;
extern std::atomic<uint64_t> rocksdb_select_bypass_failed;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
+
} // namespace myrocks
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 2c5828a6b8a..1b12c33d8d7 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -586,6 +586,12 @@ class Rdb_key_def {
uint extract_partial_index_info(const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg);
inline bool is_partial_index() const { return m_partial_index_threshold > 0; }
+ inline uint partial_index_threshold() const {
+ return m_partial_index_threshold;
+ }
+ inline uint partial_index_keyparts() const {
+ return m_partial_index_keyparts;
+ }
static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
static uint32 calculate_index_flag_offset(uint32 index_flags,
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 529cd6dacae..60dd0c4c6ab 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -16,7 +16,10 @@
#include "./rdb_iterator.h"
+/* MySQL includes */
#include "scope_guard.h"
+#include "sql/sql_class.h"
+#include "sql/thr_malloc.h"
namespace myrocks {
@@ -356,4 +359,592 @@ int Rdb_iterator_base::get(const rocksdb::Slice *key,
return rc;
}
+Rdb_iterator_partial::Rdb_iterator_partial(
+ THD *thd, uint active_index,
+ const std::shared_ptr<Rdb_key_def> kd,
+ const std::shared_ptr<Rdb_key_def> pkd, const Rdb_tbl_def *tbl_def,
+ TABLE *table)
+ : Rdb_iterator_base(thd, kd, pkd, tbl_def),
+ m_table(table),
+ m_iterator_pk(thd, pkd, pkd, tbl_def),
+ m_converter(thd, tbl_def, table),
+ m_valid(false),
+ m_materialized(false),
+ m_threshold(kd->partial_index_threshold()),
+ m_prefix_keyparts(kd->partial_index_keyparts()),
+ m_cur_prefix_key_len(0),
+ m_records(slice_comparator(m_kd->get_cf()->GetComparator())),
+ m_records_it(m_records.end()) {
+ init_sql_alloc(PSI_NOT_INSTRUMENTED, &m_mem_root, 4024, 0);
+ m_converter.setup_field_decoders(table->read_set, active_index,
+ true /* keyread_only */, true /* decode all */);
+ const uint packed_len =
+ std::max(m_kd->max_storage_fmt_length(), m_pkd->max_storage_fmt_length());
+ m_cur_prefix_key = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+ m_record_buf = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, table->s->reclength, MYF(0)));
+ m_pack_buffer = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+ m_sk_packed_tuple = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+}
+
+Rdb_iterator_partial::~Rdb_iterator_partial() {
+ reset();
+ my_free(m_cur_prefix_key);
+ m_cur_prefix_key = nullptr;
+ my_free(m_record_buf);
+ m_record_buf = nullptr;
+ my_free(m_pack_buffer);
+ m_pack_buffer = nullptr;
+ my_free(m_sk_packed_tuple);
+ m_sk_packed_tuple = nullptr;
+}
+
+int Rdb_iterator_partial::get_prefix_len(const rocksdb::Slice &start_key,
+ uint *prefix_cnt, uint *prefix_len) {
+ Rdb_string_reader reader(&start_key);
+ if ((!reader.read(Rdb_key_def::INDEX_ID_SIZE))) {
+ return HA_ERR_INTERNAL_ERROR;
+ }
+
+ for (uint i = 0; i < m_prefix_keyparts; i++) {
+ if (reader.remaining_bytes() == 0) {
+ *prefix_cnt = i;
+ *prefix_len = reader.get_current_ptr() - start_key.data();
+ return HA_EXIT_SUCCESS;
+ }
+
+ if (m_kd->read_memcmp_key_part(m_table, &reader, i) > 0) {
+ return HA_ERR_INTERNAL_ERROR;
+ }
+ }
+
+ *prefix_cnt = m_prefix_keyparts;
+ *prefix_len = reader.get_current_ptr() - start_key.data();
+
+ return HA_EXIT_SUCCESS;
+}
+
+/*
+ * Determines the correct prefix from start_key by reading from primary key if
+ * needed.
+ *
+ * Populates m_cur_prefix_key/m_cur_prefix_key_len.
+ */
+int Rdb_iterator_partial::get_prefix_from_start(
+ enum ha_rkey_function find_flag, const rocksdb::Slice &start_key) {
+ int rc = 0;
+ uint prefix_cnt = 0;
+ uint prefix_len = 0;
+
+ rc = get_prefix_len(start_key, &prefix_cnt, &prefix_len);
+ if (rc) {
+ return rc;
+ }
+ DBUG_ASSERT_IMP(prefix_cnt == 0, prefix_len == Rdb_key_def::INDEX_ID_SIZE);
+
+ // There are 2 scenarios where a read is required to determine the prefix:
+ // 1. There are not enough keyparts in the start_key.
+ // 2. An exclusive seek key is provided, meaning that we need to read the next
+ // prefix.
+ if (prefix_cnt < m_prefix_keyparts ||
+ (prefix_len == start_key.size() &&
+ (find_flag == HA_READ_AFTER_KEY || find_flag == HA_READ_BEFORE_KEY))) {
+ uint tmp;
+
+ rocksdb::Slice empty_end_key;
+
+ // Since the PK/SK share the same prefix, the primary key can be constructed
+ // using the secondary key, with the index_id overwritten.
+ memcpy(m_cur_prefix_key, start_key.data(), prefix_len);
+ rocksdb::Slice seek_key((const char *)m_cur_prefix_key, prefix_len);
+ m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ rc = m_iterator_pk.seek(find_flag, seek_key, false, empty_end_key);
+ if (rc) {
+ return rc;
+ }
+
+ rc = get_prefix_len(m_iterator_pk.key(), &prefix_cnt, &prefix_len);
+ if (rc) {
+ return rc;
+ }
+ memcpy(m_cur_prefix_key, m_iterator_pk.key().data(), prefix_len);
+ } else {
+ memcpy(m_cur_prefix_key, start_key.data(), prefix_len);
+ }
+
+ m_cur_prefix_key_len = prefix_len;
+ return HA_EXIT_SUCCESS;
+}
+
+int Rdb_iterator_partial::get_next_prefix(bool direction) {
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ uint tmp;
+
+ int rc = get_prefix_from_start(
+ direction ? HA_READ_AFTER_KEY : HA_READ_BEFORE_KEY, cur_prefix_key);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ cur_prefix_key =
+ rocksdb::Slice((const char *)m_cur_prefix_key, m_cur_prefix_key_len);
+ if (!rc && !m_kd->value_matches_prefix(cur_prefix_key, m_prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ }
+
+ return rc;
+}
+int Rdb_iterator_partial::seek_next_prefix(bool direction) {
+ rocksdb::Slice empty_end_key;
+ uint tmp;
+
+ // Fetch next prefix using PK.
+ int rc = get_next_prefix(direction);
+ if (rc) return rc;
+
+ // Rdb_iterator_base::seek below will overwrite m_prefix_tuple, so we save a
+ // copy here.
+ size_t prefix_buf_len = m_prefix_tuple.size();
+ uchar *prefix_buf_copy = (uchar *)my_alloca(prefix_buf_len);
+ memcpy(prefix_buf_copy, m_prefix_buf, prefix_buf_len);
+
+ // First try reading from SK in the current prefix.
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ rc = Rdb_iterator_base::seek(
+ direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key,
+ false, empty_end_key);
+
+ // Restore m_prefix_tuple
+ memcpy(m_prefix_buf, prefix_buf_copy, prefix_buf_len);
+ m_prefix_tuple = rocksdb::Slice((char *)m_prefix_buf, prefix_buf_len);
+
+ if (rc == HA_ERR_END_OF_FILE) {
+ // Nothing in SK, so check PK.
+ rc = read_prefix_from_pk();
+
+ if (rc == 0) {
+ // Not materialized on disk, seek to beginning/end of map.
+ m_materialized = false;
+ if (direction ^ m_kd->m_is_reverse_cf) {
+ m_records_it = m_records.begin();
+ } else {
+ m_records_it = m_records.end();
+ m_records_it--;
+ }
+ } else {
+ // The current prefix was determined by reading from PK in
+ // get_next_prefix, so rows must exist within this prefix on the PK.
+ DBUG_ASSERT(rc != HA_ERR_END_OF_FILE);
+ }
+ } else if (rc == 0) {
+ // Found rows in SK, so use them
+ m_materialized = true;
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::materialize_prefix() {
+ MEM_ROOT mem_root;
+ init_sql_alloc(PSI_NOT_INSTRUMENTED, &mem_root, 4024, 0);
+ uint tmp;
+ Rdb_transaction *const tx = get_tx_from_thd(m_thd);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+
+ auto s =
+ rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, RDB_LOCK_WRITE);
+ if (!s.ok()) {
+ return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+ }
+
+ // It is possible that someone else has already materialized this group
+ // before we locked. Double check if the prefix is still empty.
+ Rdb_iterator_base iter(m_thd, m_kd, m_pkd, m_tbl_def);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+ int rc = iter.seek(HA_READ_KEY_EXACT, cur_prefix_key, false, cur_prefix_key,
+ true /* read current */);
+ if (rc == 0 || rc != HA_ERR_END_OF_FILE) {
+ rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+ return rc;
+ }
+
+ rocksdb::WriteOptions options;
+ options.sync = false;
+ rocksdb::TransactionDBWriteOptimizations optimize;
+ optimize.skip_concurrency_control = true;
+
+ auto wb = std::unique_ptr<rocksdb::WriteBatch>(new rocksdb::WriteBatch);
+ m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+ rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+ cur_prefix_key, true /* read current */);
+ size_t num_rows = 0;
+
+ while (!rc) {
+ if (thd_killed(m_thd)) {
+ rc = HA_ERR_QUERY_INTERRUPTED;
+ goto exit;
+ }
+
+ const rocksdb::Slice &rkey = m_iterator_pk.key();
+ const rocksdb::Slice &rval = m_iterator_pk.value();
+
+ // Unpack from PK format
+ rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+ if (rc) {
+ goto exit;
+ }
+
+ // Repack into SK format
+ uint sk_packed_size = m_kd->pack_record(
+ m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+ false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+ m_converter.get_ttl_bytes_buffer());
+
+ const char *key =
+ (const char *)memdup_root(&mem_root, m_sk_packed_tuple, sk_packed_size);
+ const char *val = (const char *)memdup_root(&mem_root, m_sk_tails.ptr(),
+ m_sk_tails.get_current_pos());
+
+ s = wb->Put(m_kd->get_cf(), rocksdb::Slice(key, sk_packed_size),
+ rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+ if (!s.ok()) {
+ rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+ goto exit;
+ }
+
+ num_rows++;
+ rc = m_iterator_pk.next();
+ }
+
+ if (rc != HA_ERR_END_OF_FILE) goto exit;
+ rc = HA_EXIT_SUCCESS;
+
+ s = rdb_get_rocksdb_db()->Write(options, optimize, wb.get());
+ if (!s.ok()) {
+ rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+ goto exit;
+ }
+
+ rocksdb_partial_index_groups_materialized++;
+ rocksdb_partial_index_rows_materialized += num_rows;
+
+exit:
+ rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+ return rc;
+}
+
+int Rdb_iterator_partial::read_prefix_from_pk() {
+ uint tmp;
+ int rc = 0;
+ size_t num_rows = 0;
+
+ free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+ m_records.clear();
+
+ const char *old_proc_info = m_thd->get_proc_info();
+ thd_proc_info(m_thd, "Materializing group in partial index");
+
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ // Since rocksdb does not support reverse prefix seeks, we always seek in the
+ // forwards direction (even PK is a reverse cf).
+ rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+ cur_prefix_key);
+
+ while (!rc) {
+ if (thd_killed(m_thd)) {
+ rc = HA_ERR_QUERY_INTERRUPTED;
+ goto exit;
+ }
+
+ const rocksdb::Slice &rkey = m_iterator_pk.key();
+ const rocksdb::Slice &rval = m_iterator_pk.value();
+
+ // Unpack from PK format
+ rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+ if (rc) goto exit;
+
+ // Repack into SK format
+ uint sk_packed_size = m_kd->pack_record(
+ m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+ false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+ m_converter.get_ttl_bytes_buffer());
+
+ const char *key = (const char *)memdup_root(&m_mem_root, m_sk_packed_tuple,
+ sk_packed_size);
+ const char *val = (const char *)memdup_root(&m_mem_root, m_sk_tails.ptr(),
+ m_sk_tails.get_current_pos());
+
+ m_records.emplace(rocksdb::Slice(key, sk_packed_size),
+ rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+
+ num_rows++;
+ rc = m_iterator_pk.next();
+ }
+
+ if (rc != HA_ERR_END_OF_FILE) goto exit;
+ rc = HA_EXIT_SUCCESS;
+
+ rocksdb_partial_index_groups_sorted++;
+ rocksdb_partial_index_rows_sorted += num_rows;
+
+ if (num_rows > m_threshold) {
+ rc = materialize_prefix();
+ } else if (num_rows == 0) {
+ rc = HA_ERR_END_OF_FILE;
+ }
+
+exit:
+ thd_proc_info(m_thd, old_proc_info);
+ return rc;
+}
+
+int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
+ const rocksdb::Slice start_key,
+ bool full_key_match,
+ const rocksdb::Slice end_key,
+ bool read_current) {
+ int rc = 0;
+ uint tmp;
+
+ DBUG_ASSERT(!read_current);
+ reset();
+
+ bool direction = (find_flag == HA_READ_KEY_EXACT) ||
+ (find_flag == HA_READ_AFTER_KEY) ||
+ (find_flag == HA_READ_KEY_OR_NEXT);
+
+ // Get current prefix.
+ if ((rc = get_prefix_from_start(find_flag, start_key)) != 0) {
+ return rc;
+ }
+
+ // First try reading from SK in the current prefix.
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ rc = Rdb_iterator_base::seek(find_flag, start_key, full_key_match, end_key,
+ read_current);
+
+ // Check if we're still in our current prefix. If not, we may have missed
+ // some unmaterialized keys, so we have to check PK.
+ if (rc == 0 &&
+ !m_kd->value_matches_prefix(Rdb_iterator_base::key(), cur_prefix_key)) {
+ rc = HA_ERR_END_OF_FILE;
+ }
+
+ bool next_prefix = false;
+
+ if (rc == HA_ERR_END_OF_FILE) {
+ // Nothing in SK, so check PK.
+ rc = read_prefix_from_pk();
+
+ if (rc == HA_ERR_END_OF_FILE) {
+ // Nothing in PK, so move to next prefix.
+ next_prefix = true;
+ } else if (rc == 0) {
+ // Not materialized on disk.
+ m_materialized = false;
+
+ // Seek to correct spot.
+ uchar *start_key_buf = (uchar *)start_key.data();
+
+ // Similar to Rdb_iterator_base::seek, convert start_key into an rocksdb
+ // key that we will actually seek to.
+ auto start_key_guard =
+ create_scope_guard([this, start_key_buf, start_key] {
+ this->m_kd->predecessor(start_key_buf, start_key.size());
+ });
+ if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
+ find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
+ m_kd->successor(start_key_buf, start_key.size());
+ } else {
+ start_key_guard.commit();
+ }
+
+ if (direction) {
+ if (m_kd->m_is_reverse_cf) {
+ // Emulate "SeekForPrev" behaviour.
+ m_records_it = m_records.upper_bound(start_key);
+ if (m_records_it == m_records.begin()) {
+ next_prefix = true;
+ } else {
+ m_records_it--;
+ }
+ } else {
+ m_records_it = m_records.lower_bound(start_key);
+ if (m_records_it == m_records.end()) {
+ next_prefix = true;
+ }
+ }
+ } else {
+ if (m_kd->m_is_reverse_cf) {
+ m_records_it = m_records.upper_bound(start_key);
+ if (m_records_it == m_records.end()) {
+ next_prefix = true;
+ }
+ } else {
+ // Emulate "SeekForPrev" behaviour.
+ m_records_it = m_records.lower_bound(start_key);
+ if (m_records_it == m_records.begin()) {
+ next_prefix = true;
+ } else {
+ m_records_it--;
+ }
+ }
+ }
+ }
+ } else if (rc == 0) {
+ // Found rows in SK, so use them.
+ m_materialized = true;
+ }
+
+ if (next_prefix) {
+ rc = seek_next_prefix(direction);
+ }
+
+ if (!rc) {
+ if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ } else {
+ m_valid = true;
+ }
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::get(const rocksdb::Slice *key,
+ rocksdb::PinnableSlice *value, Rdb_lock_type type,
+ bool skip_ttl_check) {
+ int rc = Rdb_iterator_base::get(key, value, type, skip_ttl_check);
+
+ if (rc == HA_ERR_KEY_NOT_FOUND) {
+ const uint size =
+ m_kd->get_primary_key_tuple(m_table, *m_pkd, key, m_sk_packed_tuple);
+ if (size == RDB_INVALID_KEY_LEN) {
+ return HA_ERR_ROCKSDB_CORRUPT_DATA;
+ }
+
+ rocksdb::Slice pk_key((const char *)m_sk_packed_tuple, size);
+
+ rc = m_iterator_pk.get(&pk_key, value, type, skip_ttl_check);
+ if (rc) return rc;
+
+ // Unpack from PK format
+ rc = m_converter.decode(m_pkd, m_record_buf, &pk_key, value);
+ if (rc) return rc;
+
+ // Repack into SK format
+ uint sk_packed_size = m_kd->pack_record(
+ m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+ false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+ m_converter.get_ttl_bytes_buffer());
+
+ value->PinSelf(
+ rocksdb::Slice((const char *)m_sk_packed_tuple, sk_packed_size));
+ rc = 0;
+ }
+
+ m_valid = false;
+ return rc;
+}
+
+int Rdb_iterator_partial::next_with_direction_in_group(bool direction) {
+ uint tmp;
+ int rc = HA_EXIT_SUCCESS;
+ if (m_materialized) {
+ rc = direction ? Rdb_iterator_base::next() : Rdb_iterator_base::prev();
+
+ if (rc == HA_EXIT_SUCCESS) {
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ if (!m_kd->value_matches_prefix(Rdb_iterator_base::key(),
+ cur_prefix_key)) {
+ return HA_ERR_END_OF_FILE;
+ }
+ }
+ } else {
+ if (direction ^ m_kd->m_is_reverse_cf) {
+ m_records_it++;
+ if (m_records_it == m_records.end()) return HA_ERR_END_OF_FILE;
+ } else {
+ if (m_records_it == m_records.begin()) return HA_ERR_END_OF_FILE;
+ m_records_it--;
+ }
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::next_with_direction(bool direction) {
+ if (!m_valid) return HA_ERR_INTERNAL_ERROR;
+
+ int rc = next_with_direction_in_group(direction);
+
+ if (!rc) {
+ // On success, check if key is still within prefix.
+ if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ }
+ } else if (rc == HA_ERR_END_OF_FILE) {
+ uint tmp;
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ if (m_prefix_tuple.size() >= cur_prefix_key.size()) {
+ DBUG_ASSERT(memcmp(m_prefix_tuple.data(), cur_prefix_key.data(),
+ cur_prefix_key.size()) == 0);
+ return HA_ERR_END_OF_FILE;
+ }
+
+ rc = seek_next_prefix(direction);
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::next() {
+ int rc = next_with_direction(true);
+ if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+ return rc;
+}
+
+int Rdb_iterator_partial::prev() {
+ int rc = next_with_direction(false);
+ if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+ return rc;
+}
+
+void Rdb_iterator_partial::reset() {
+ m_valid = false;
+ m_materialized = false;
+ free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+ m_records.clear();
+ m_iterator_pk.reset();
+ Rdb_iterator_base::reset();
+}
+
+rocksdb::Slice Rdb_iterator_partial::key() {
+ return m_materialized ? Rdb_iterator_base::key() : m_records_it->first;
+}
+
+rocksdb::Slice Rdb_iterator_partial::value() {
+ return m_materialized ? Rdb_iterator_base::value() : m_records_it->second;
+}
+
} // namespace myrocks
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 2a0f5bd5760..164d32c4099 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -75,7 +75,7 @@ class Rdb_iterator_base : public Rdb_iterator {
int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
bool full_key_match, const rocksdb::Slice end_key,
- bool read_current) override;
+ bool read_current = false) override;
int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
Rdb_lock_type type, bool skip_ttl_check = false) override;
@@ -118,4 +118,71 @@ class Rdb_iterator_base : public Rdb_iterator {
rocksdb::Slice m_prefix_tuple;
};
+class Rdb_iterator_partial : public Rdb_iterator_base {
+ private:
+ TABLE *m_table;
+ MEM_ROOT m_mem_root;
+
+ Rdb_iterator_base m_iterator_pk;
+ Rdb_converter m_converter;
+
+ bool m_valid;
+ bool m_materialized;
+
+ const uint m_threshold;
+ const uint m_prefix_keyparts;
+
+ uchar *m_cur_prefix_key;
+ uint m_cur_prefix_key_len;
+
+ uchar *m_record_buf;
+ uchar *m_pack_buffer;
+ uchar *m_sk_packed_tuple;
+
+ Rdb_string_writer m_sk_tails;
+
+ int get_prefix_len(const rocksdb::Slice &start_key, uint *prefix_cnt,
+ uint *prefix_len);
+ int get_prefix_from_start(enum ha_rkey_function find_flag,
+ const rocksdb::Slice &start_key);
+ int get_next_prefix(bool direction);
+ int seek_next_prefix(bool direction);
+ int materialize_prefix();
+ int read_prefix_from_pk();
+ int next_with_direction_in_group(bool direction);
+ int next_with_direction(bool direction);
+
+ struct slice_comparator {
+ slice_comparator(const rocksdb::Comparator *c) : m_comparator(c) {}
+ const rocksdb::Comparator *const m_comparator;
+
+ bool operator()(const rocksdb::Slice &lhs, const rocksdb::Slice &rhs) {
+ return m_comparator->Compare(lhs, rhs) < 0;
+ }
+ };
+
+ std::map<const rocksdb::Slice, const rocksdb::Slice, slice_comparator>
+ m_records;
+ std::map<const rocksdb::Slice, const rocksdb::Slice,
+ slice_comparator>::iterator m_records_it;
+
+ public:
+ Rdb_iterator_partial(THD *thd, uint active_index,
+ const std::shared_ptr<Rdb_key_def> kd,
+ const std::shared_ptr<Rdb_key_def> pkd,
+ const Rdb_tbl_def *tbl_def, TABLE *table);
+ ~Rdb_iterator_partial() override;
+
+ int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
+ bool full_key_match, const rocksdb::Slice end_key,
+ bool read_current = false) override;
+ int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
+ Rdb_lock_type type, bool skip_ttl_check = false) override;
+ int next() override;
+ int prev() override;
+ rocksdb::Slice key() override;
+ rocksdb::Slice value() override;
+ void reset() override;
+};
+
} // namespace myrocks
1
0
revision-id: 4dec5ee8f8150f691bd64b990395dc171b4f5973 (percona-202102-54-g4dec5ee8f81)
parent(s): aaebd623e98e59db3efe1b231307e4142240c485
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:45:56 +0300
message:
Apply patch: Add partial index iterator
Summary:
This adds the partial index iterator. It is a special iterator that sorts groups from the primary key on the fly as needed, and if it exceeds a certain threshold, it will materialize the rows on the secondary key as well.
For point queries, if the secondary key read does not find the row, we simply extract the primary key parts and read from the primary key. This means that point queries don't trigger materialization.
Test Plan: mtr
Reviewers: luqun, herman, yzha, #mysql_eng
Subscribers: phabricatorlinter, pgl
Differential Revision: https://phabricator.intern.facebook.com/D25933178
---
mysql-test/include/diff_queries.inc | 14 +
mysql-test/suite/rocksdb/r/partial_index.result | 146 +++++
.../suite/rocksdb/r/partial_index_assoc.result | 171 ++++++
.../suite/rocksdb/r/partial_index_stress.result | 74 +++
mysql-test/suite/rocksdb/r/rocksdb.result | 12 +
mysql-test/suite/rocksdb/t/partial_index.inc | 156 ++++++
mysql-test/suite/rocksdb/t/partial_index.test | 19 +
.../suite/rocksdb/t/partial_index_assoc-master.opt | 3 +
mysql-test/suite/rocksdb/t/partial_index_assoc.inc | 95 ++++
.../suite/rocksdb/t/partial_index_assoc.test | 58 ++
.../rocksdb/t/partial_index_stress-master.opt | 3 +
mysql-test/suite/rocksdb/t/partial_index_stress.py | 114 ++++
.../suite/rocksdb/t/partial_index_stress.test | 64 +++
storage/rocksdb/ha_rocksdb.cc | 76 ++-
storage/rocksdb/ha_rocksdb.h | 8 +
storage/rocksdb/rdb_datadic.h | 6 +
storage/rocksdb/rdb_iterator.cc | 590 +++++++++++++++++++++
storage/rocksdb/rdb_iterator.h | 68 ++-
18 files changed, 1669 insertions(+), 8 deletions(-)
diff --git a/mysql-test/include/diff_queries.inc b/mysql-test/include/diff_queries.inc
new file mode 100644
index 00000000000..beb75093759
--- /dev/null
+++ b/mysql-test/include/diff_queries.inc
@@ -0,0 +1,14 @@
+--disable_query_log
+
+--output $MYSQL_TMP_DIR/A
+--eval $query1
+
+--output $MYSQL_TMP_DIR/B
+--eval $query2
+
+--enable_query_log
+
+--diff_files $MYSQL_TMP_DIR/A $MYSQL_TMP_DIR/B
+
+--remove_file $MYSQL_TMP_DIR/A
+--remove_file $MYSQL_TMP_DIR/B
diff --git a/mysql-test/suite/rocksdb/r/partial_index.result b/mysql-test/suite/rocksdb/r/partial_index.result
new file mode 100644
index 00000000000..a752d37180c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index.result
@@ -0,0 +1,146 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_assoc.result b/mysql-test/suite/rocksdb/r/partial_index_assoc.result
new file mode 100644
index 00000000000..fbb89d16b35
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_assoc.result
@@ -0,0 +1,171 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(4) NOT NULL DEFAULT '0',
+`data` text COLLATE latin1_bin NOT NULL,
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_stress.result b/mysql-test/suite/rocksdb/r/partial_index_stress.result
new file mode 100644
index 00000000000..88f77bcc63f
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_stress.result
@@ -0,0 +1,74 @@
+set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set global rocksdb_lock_wait_timeout = 100000;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(4) NOT NULL DEFAULT '0',
+`data` text COLLATE latin1_bin NOT NULL,
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table;
+set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index 61dd8184ddf..5c440c88317 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -1656,6 +1656,10 @@ rocksdb_number_sst_entry_singledelete #
rocksdb_number_superversion_acquires #
rocksdb_number_superversion_cleanups #
rocksdb_number_superversion_releases #
+rocksdb_partial_index_groups_materialized #
+rocksdb_partial_index_groups_sorted #
+rocksdb_partial_index_rows_materialized #
+rocksdb_partial_index_rows_sorted #
rocksdb_row_lock_deadlocks #
rocksdb_row_lock_wait_timeouts #
rocksdb_select_bypass_executed #
@@ -1760,6 +1764,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
ROCKSDB_NUMBER_SUPERVERSION_RELEASES
ROCKSDB_NUM_ITERATORS
+ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED
+ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_ROWS_SORTED
ROCKSDB_QUERIES_POINT
ROCKSDB_QUERIES_RANGE
ROCKSDB_ROWS_DELETED
@@ -1883,6 +1891,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
ROCKSDB_NUMBER_SUPERVERSION_RELEASES
ROCKSDB_NUM_ITERATORS
+ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED
+ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_ROWS_SORTED
ROCKSDB_QUERIES_POINT
ROCKSDB_QUERIES_RANGE
ROCKSDB_ROWS_DELETED
diff --git a/mysql-test/suite/rocksdb/t/partial_index.inc b/mysql-test/suite/rocksdb/t/partial_index.inc
new file mode 100644
index 00000000000..7137a696d6a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index.inc
@@ -0,0 +1,156 @@
+eval
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+ PRIMARY KEY (i, j),
+ KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5',
+ KEY ik2 (i, k) COMMENT 'cfname=$cfname'
+) ENGINE=ROCKSDB;
+
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+
+INSERT INTO t values ("11111112", "1", "1", "1");
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "1" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "1" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111112" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111112" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "1%" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "1%" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+DROP TABLE t;
+
+eval
+CREATE TABLE t (i int, j int, k int, l int,
+ PRIMARY KEY (i, j),
+ KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5',
+ KEY ik2 (i, k) COMMENT 'cfname=$cfname'
+) ENGINE=ROCKSDB;
+
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+
+INSERT INTO t values (4, 1, 1, 1);
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 1 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 1 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 4 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 4 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+DROP TABLE t;
diff --git a/mysql-test/suite/rocksdb/t/partial_index.test b/mysql-test/suite/rocksdb/t/partial_index.test
new file mode 100644
index 00000000000..410c772765e
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index.test
@@ -0,0 +1,19 @@
+set optimizer_force_index_for_range = on;
+
+--let $asc=ASC
+--let $cfname=cf
+--source partial_index.inc
+
+--let $asc=ASC
+--let $cfname=rev:cf
+--source partial_index.inc
+
+--let $asc=DESC
+--let $cfname=cf
+--source partial_index.inc
+
+--let $asc=DESC
+--let $cfname=rev:cf
+--source partial_index.inc
+
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt
new file mode 100644
index 00000000000..81bc90b0531
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt
@@ -0,0 +1,3 @@
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000
+--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m};
+
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.inc b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc
new file mode 100644
index 00000000000..d0508a3f40b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc
@@ -0,0 +1,95 @@
+--let $binary_id1=1
+if (`select DATA_TYPE = 'binary' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'id1'`) {
+ --let $binary_id1="1"
+}
+--let $text=`select DATA_TYPE = 'text' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'data'`
+
+# This creates 10 distinct types, with up to 9 distinct id1s per type, to give up to 90 groups.
+--disable_query_log
+let $i=0;
+while ($i < 1000)
+{
+ if ($text) {
+ eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 50)), FLOOR(RAND($i) * 100000), 789);
+ }
+ if (!$text) {
+ eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 20)), FLOOR(RAND($i) * 100000), 789);
+ }
+ inc $i;
+}
+--enable_query_log
+
+let $i=0;
+while ($i < 10) {
+ # This gives a range plan
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --source include/diff_queries.inc
+
+ # This gives a ref plan
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --source include/diff_queries.inc
+
+ inc $i;
+}
+
+let $i=0;
+while ($i < 10) {
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = 1
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = 1
+ --source include/diff_queries.inc
+
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type <= 2
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type <= 2
+ --source include/diff_queries.inc
+
+ inc $i;
+}
+
+# Rebuild the table so that nothing is materialized anymore.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+--let $query1= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type)
+--let $query2= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2)
+--source include/diff_queries.inc
+
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+--let $assert_text = Check that materialized groups are non-zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] > 0
+--source include/assert.inc
+--let $assert_text = Check that materialized rows are non-zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] > 0
+--source include/assert.inc
+DROP TABLE t1, t2;
+
+# Rerun full index scan a second time, and check that no materialization occurs
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+--disable_result_log
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+--enable_result_log
+
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+--let $assert_text = Check that materialized groups are zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] = 0
+--source include/assert.inc
+--let $assert_text = Check that materialized rows are zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] = 0
+--source include/assert.inc
+
+DROP TABLE t1, t2;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.test b/mysql-test/suite/rocksdb/t/partial_index_assoc.test
new file mode 100644
index 00000000000..a559c67f673
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.test
@@ -0,0 +1,58 @@
+# Run the shared partial-index consistency checks (partial_index_assoc.inc)
+# against three schema variants of the assoc table. In each variant,
+# `id1_type` is a regular secondary key and `id1_type2` is a partial index
+# sharing the same column family, with a 2-keypart group prefix and a
+# materialization threshold of 10 rows per group.
+set optimizer_force_index_for_range = on;
+# Variant 1: bigint id1, varchar(255) data column.
+CREATE TABLE `assoc_table` (
+  `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+  `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+  `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `visibility` tinyint(3) NOT NULL DEFAULT '0',
+  `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+  `time` int(10) unsigned NOT NULL DEFAULT '0',
+  `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+  PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+  KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+  KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+
+# Variant 2: binary(16) id1 (exercises fixed-length binary key encoding in
+# the shared group prefix).
+CREATE TABLE `assoc_table` (
+  `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+  `raw_key` text COLLATE latin1_bin,
+  `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+  `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `visibility` tinyint(3) NOT NULL DEFAULT '0',
+  `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+  `time` int(10) unsigned NOT NULL DEFAULT '0',
+  `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+  PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+  KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+  KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+
+# Variant 3: text data column with a 255-byte key prefix (exercises
+# prefix-key columns in the materialized secondary key).
+CREATE TABLE `assoc_table` (
+  `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+  `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+  `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+  `visibility` tinyint(4) NOT NULL DEFAULT '0',
+  `data` text COLLATE latin1_bin NOT NULL,
+  `time` int(10) unsigned NOT NULL DEFAULT '0',
+  `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+  PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+  KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+  KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt
new file mode 100644
index 00000000000..a105847c183
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt
@@ -0,0 +1,3 @@
+--initialize --default_authentication_plugin=mysql_native_password
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000
+--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m};
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.py b/mysql-test/suite/rocksdb/t/partial_index_stress.py
new file mode 100644
index 00000000000..07220d88705
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.py
@@ -0,0 +1,114 @@
+"""
+This script stress tests partial indexes by performing writes while concurrently checking PK/SK consistency.
+
+Usage: partial_index_stress.py user host port db_name table_name
+ num_iters num_threads
+"""
+import MySQLdb
+import random
+import sys
+import threading
+import traceback
+
+def get_query(table_name, binary_id1):
+    """Return one random DML statement against the assoc table.
+
+    1/3 of the time a single-row DELETE, otherwise an INSERT ... ON
+    DUPLICATE KEY UPDATE (upsert).  Key values are drawn from small ranges
+    so concurrent workers collide frequently on the same rows and group
+    prefixes.  When binary_id1 is true (id1 is a binary(16) column) the
+    DELETE quotes the id1 value.
+    """
+    assoc_type = random.randint(1, 2)
+    id1 = random.randint(1, 5)
+    id2 = random.randint(1, 20)
+
+    r = random.randint(1, 3)
+
+    if r == 1:
+        if binary_id1:
+            return """DELETE FROM %s WHERE id1 = "%d" and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type)
+        else:
+            return """DELETE FROM %s WHERE id1 = %d and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type)
+    else:
+        return """INSERT INTO %s VALUES (%d, 0, %d, 0, %d, 1, 'abc', 100, 1) ON DUPLICATE KEY UPDATE time=time+10, version=version+1""" % (table_name, id1, id2, assoc_type)
+
+class Worker(threading.Thread):
+    """Self-starting thread that either writes random DML (check=False) or
+    repeatedly verifies PK/SK row-count consistency (check=True) until
+    `event` is set.  Any exception is captured as a formatted traceback in
+    self.exception for the main thread to inspect after join().
+    """
+    def __init__(self, con, table_name, num_iters, check, event):
+        threading.Thread.__init__(self)
+        self.con = con
+        self.table_name = table_name
+        self.num_iters = num_iters
+        self.check = check
+        self.event = event
+        self.exception = None
+        # Thread starts itself; callers only need to join().
+        self.start()
+
+    def run(self):
+        try:
+            if self.check:
+                self.run_check()
+            else:
+                self.run_write()
+        except Exception as e:
+            # Save the traceback; the main thread reports it after join().
+            self.exception = traceback.format_exc()
+
+    def run_write(self):
+        # Issue num_iters random upserts/deletes, committing each one.
+        cur = self.con.cursor()
+        # Detect whether id1 is binary(16) so generated DELETEs quote it.
+        cur.execute("select data_type from information_schema.columns where table_schema = database() and table_name = '%s' and column_name = 'id1'" % self.table_name);
+        binary_id1 = cur.fetchone()[0] == "binary"
+        cur.execute("SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED")
+        for x in range(self.num_iters):
+            try:
+                cur.execute(get_query(self.table_name, binary_id1))
+                self.con.commit()
+            except MySQLdb.OperationalError as e:
+                self.con.rollback()
+                cur = self.con.cursor()
+                raise e
+
+    def run_check(self):
+        # Until signalled to stop, compare row counts read through the PK
+        # and through the id1_type secondary index; a mismatch means the
+        # partial index machinery lost or duplicated rows.
+        cur = self.con.cursor()
+        while not self.event.is_set():
+            try:
+                cur.execute("SELECT COUNT(*) FROM %s FORCE INDEX(PRIMARY) UNION ALL SELECT COUNT(*) FROM %s FORCE INDEX(id1_type)" % (self.table_name, self.table_name))
+                pk_count = cur.fetchone()[0]
+                sk_count = cur.fetchone()[0]
+                assert pk_count == sk_count, "Count mismatch %d != %d" % (pk_count, sk_count)
+                self.con.commit()
+            except MySQLdb.OperationalError as e:
+                self.con.rollback()
+                cur = self.con.cursor()
+                raise e
+
+if __name__ == '__main__':
+ if len(sys.argv) != 8:
+ print("Usage: partial_index_stress.py user host port db_name " \
+ "table_name num_iters num_threads")
+ sys.exit(1)
+
+ user = sys.argv[1]
+ host = sys.argv[2]
+ port = int(sys.argv[3])
+ db = sys.argv[4]
+ table_name = sys.argv[5]
+ num_iters = int(sys.argv[6])
+ num_workers = int(sys.argv[7])
+
+ done_event = threading.Event();
+
+ worker_failed = False
+ workers = []
+ for i in range(num_workers):
+ w = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ num_iters, False, None)
+ workers.append(w)
+
+ checker = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ num_iters, True, done_event)
+
+ for w in workers:
+ w.join()
+ if w.exception:
+ print("Worker hit an exception:\n%s\n" % w.exception)
+ worker_failed = True
+
+ done_event.set()
+ checker.join()
+
+ if worker_failed:
+ sys.exit(1)
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.test b/mysql-test/suite/rocksdb/t/partial_index_stress.test
new file mode 100644
index 00000000000..c78e8cb980e
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.test
@@ -0,0 +1,64 @@
+#
+# Stress partial indexes by performing writes, and checking that PK/SK are still consistent.
+#
+
+set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set global rocksdb_lock_wait_timeout = 100000;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+ `raw_key` text COLLATE latin1_bin,
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(4) NOT NULL DEFAULT '0',
+ `data` text COLLATE latin1_bin NOT NULL,
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 15c8b289a5e..452d2a9ff8f 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -802,6 +802,11 @@ std::atomic<uint64_t> rocksdb_select_bypass_executed(0);
std::atomic<uint64_t> rocksdb_select_bypass_rejected(0);
std::atomic<uint64_t> rocksdb_select_bypass_failed(0);
+std::atomic<uint64_t> rocksdb_partial_index_groups_sorted(0);
+std::atomic<uint64_t> rocksdb_partial_index_groups_materialized(0);
+std::atomic<uint64_t> rocksdb_partial_index_rows_sorted(0);
+std::atomic<uint64_t> rocksdb_partial_index_rows_materialized(0);
+
static int rocksdb_trace_block_cache_access(
THD *const thd MY_ATTRIBUTE((__unused__)),
struct SYS_VAR *const var MY_ATTRIBUTE((__unused__)), void *const save,
@@ -3111,7 +3116,7 @@ class Rdb_transaction {
virtual void set_sync(bool sync) = 0;
virtual void release_lock(const Rdb_key_def &key_descr,
- const std::string &rowkey) = 0;
+ const std::string &rowkey, bool force = false) = 0;
virtual bool prepare() = 0;
@@ -3714,9 +3719,9 @@ class Rdb_transaction_impl : public Rdb_transaction {
m_rocksdb_tx->GetWriteOptions()->sync = sync;
}
- void release_lock(const Rdb_key_def &key_descr,
- const std::string &rowkey) override {
- if (!THDVAR(m_thd, lock_scanned_rows)) {
+ void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey,
+ bool force) override {
+ if (!THDVAR(m_thd, lock_scanned_rows) || force) {
m_rocksdb_tx->UndoGetForUpdate(key_descr.get_cf(),
rocksdb::Slice(rowkey));
// row_lock_count track row(pk)
@@ -4167,7 +4172,8 @@ class Rdb_writebatch_impl : public Rdb_transaction {
void set_sync(bool sync) override { write_opts.sync = sync; }
void release_lock(const Rdb_key_def &key_descr MY_ATTRIBUTE((unused)),
- const std::string &rowkey MY_ATTRIBUTE((unused))) override {
+ const std::string &rowkey MY_ATTRIBUTE((unused)),
+ bool force MY_ATTRIBUTE((unused))) override {
// Nothing to do here since we don't hold any row locks.
}
@@ -6922,6 +6928,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
active_index = table->s->next_number_index;
const uint8 save_table_status = table->m_status;
+ DBUG_ASSERT(!m_key_descr_arr[active_index_pos()]->is_partial_index());
std::unique_ptr<Rdb_iterator> save_iterator(new Rdb_iterator_base(
ha_thd(), m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
std::swap(m_iterator, save_iterator);
@@ -10203,6 +10210,7 @@ int ha_rocksdb::check_and_lock_sk(
The bloom filter may need to be disabled for this lookup.
*/
+ DBUG_ASSERT(!m_key_descr_arr[key_id]->is_partial_index());
Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[key_id], m_pk_descr,
m_tbl_def);
int rc = HA_EXIT_SUCCESS;
@@ -10608,6 +10616,34 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
bytes_written = old_key_slice.size();
}
+ if (kd.is_partial_index()) {
+ // Obtain shared lock on prefix.
+ int size = kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
+ m_sk_packed_tuple, nullptr, false, 0,
+ kd.partial_index_keyparts());
+ const rocksdb::Slice prefix_slice =
+ rocksdb::Slice((const char *)m_sk_packed_tuple, size);
+
+ const rocksdb::Status s = row_info.tx->get_for_update(
+ kd, prefix_slice, nullptr, false /* exclusive */,
+ false /* do validate */);
+ if (!s.ok()) {
+ return row_info.tx->set_status_error(table_arg->in_use, s, kd, m_tbl_def,
+ m_table_handler);
+ }
+
+ // Check if this prefix has been materialized.
+ Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[kd.get_keyno()],
+ m_pk_descr, m_tbl_def);
+ rc = iter.seek(HA_READ_KEY_EXACT, prefix_slice, false, prefix_slice,
+ true /* read current */);
+
+ // We can skip updating the index, if the prefix is not materialized.
+ if (rc == HA_ERR_END_OF_FILE || rc == HA_ERR_KEY_NOT_FOUND) {
+ return 0;
+ }
+ }
+
new_key_slice = rocksdb::Slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
new_value_slice =
@@ -10897,8 +10933,15 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
DBUG_ASSERT(tx != nullptr);
active_index = idx;
- m_iterator.reset(new Rdb_iterator_base(
- thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ if (idx != table->s->primary_key &&
+ m_key_descr_arr[idx]->is_partial_index()) {
+ m_iterator.reset(
+ new Rdb_iterator_partial(thd, m_key_descr_arr[active_index_pos()],
+ m_pk_descr, m_tbl_def, table));
+ } else {
+ m_iterator.reset(new Rdb_iterator_base(
+ thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ }
// If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
// when accessing the index, so don't acquire the snapshot right away.
@@ -13379,6 +13422,9 @@ int ha_rocksdb::inplace_populate_sk(
THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
for (const auto &index : indexes) {
+ // Skip populating partial indexes for now.
+ if (index->is_partial_index()) continue;
+
bool is_unique_index =
new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
@@ -14210,6 +14256,17 @@ static SHOW_VAR rocksdb_status_vars[] = {
&rocksdb_select_bypass_rejected, SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("select_bypass_failed", &rocksdb_select_bypass_failed,
SHOW_LONGLONG),
+
+ DEF_STATUS_VAR_PTR("partial_index_groups_sorted",
+ &rocksdb_partial_index_groups_sorted, SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_groups_materialized",
+ &rocksdb_partial_index_groups_materialized,
+ SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_rows_sorted",
+ &rocksdb_partial_index_rows_sorted, SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_rows_materialized",
+ &rocksdb_partial_index_rows_materialized, SHOW_LONGLONG),
+
// the variables generated by SHOW_FUNC are sorted only by prefix (first
// arg in the tuple below), so make sure it is unique to make sorting
// deterministic as quick sort is not stable
@@ -15844,6 +15901,11 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
return s;
}
+void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
+ const rocksdb::Slice &key) {
+ tx->release_lock(kd, std::string(key.data(), key.size()));
+}
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 369af0bc9c4..d0baeefe942 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -1166,6 +1166,9 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
rocksdb::PinnableSlice *const value,
bool exclusive);
+void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
+ const rocksdb::Slice &key);
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -1218,4 +1221,9 @@ extern std::atomic<uint64_t> rocksdb_select_bypass_executed;
extern std::atomic<uint64_t> rocksdb_select_bypass_rejected;
extern std::atomic<uint64_t> rocksdb_select_bypass_failed;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
+
} // namespace myrocks
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 2c5828a6b8a..1b12c33d8d7 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -586,6 +586,12 @@ class Rdb_key_def {
uint extract_partial_index_info(const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg);
inline bool is_partial_index() const { return m_partial_index_threshold > 0; }
+ inline uint partial_index_threshold() const {
+ return m_partial_index_threshold;
+ }
+ inline uint partial_index_keyparts() const {
+ return m_partial_index_keyparts;
+ }
static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
static uint32 calculate_index_flag_offset(uint32 index_flags,
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 529cd6dacae..eac051db9a7 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -16,7 +16,10 @@
#include "./rdb_iterator.h"
+/* MySQL includes */
#include "scope_guard.h"
+#include "sql/sql_class.h"
+#include "sql/thr_malloc.h"
namespace myrocks {
@@ -356,4 +359,591 @@ int Rdb_iterator_base::get(const rocksdb::Slice *key,
return rc;
}
+/*
+  Iterator over a partial index: groups whose secondary-key entries are not
+  materialized on disk are sorted on the fly from the primary key (via
+  m_iterator_pk into m_records), and groups exceeding m_threshold rows are
+  materialized into the SK column family.  All scratch buffers are sized to
+  the larger of the SK/PK storage formats and freed in the destructor.
+*/
+Rdb_iterator_partial::Rdb_iterator_partial(
+    THD *thd, const std::shared_ptr<Rdb_key_def> kd,
+    const std::shared_ptr<Rdb_key_def> pkd, const Rdb_tbl_def *tbl_def,
+    TABLE *table)
+    : Rdb_iterator_base(thd, kd, pkd, tbl_def),
+      m_table(table),
+      m_iterator_pk(thd, pkd, pkd, tbl_def),
+      m_converter(thd, tbl_def, table),
+      m_valid(false),
+      m_materialized(false),
+      m_threshold(kd->partial_index_threshold()),
+      m_prefix_keyparts(kd->partial_index_keyparts()),
+      m_cur_prefix_key_len(0),
+      // m_records is ordered with the SK column family comparator so that
+      // in-memory rows iterate in the same order as on-disk rows.
+      m_records(slice_comparator(m_kd->get_cf()->GetComparator())),
+      m_records_it(m_records.end()) {
+  init_sql_alloc(PSI_NOT_INSTRUMENTED, &m_mem_root, 4024, 0);
+  // Decode all fields: rows read from the PK must be fully unpacked before
+  // being repacked into SK format.
+  m_converter.setup_field_decoders(table->read_set, true /* decode all */);
+
+  const uint packed_len =
+      std::max(m_kd->max_storage_fmt_length(), m_pkd->max_storage_fmt_length());
+  m_cur_prefix_key = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+  m_record_buf = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, table->s->reclength, MYF(0)));
+  m_pack_buffer = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+  m_sk_packed_tuple = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+}
+
+// Releases iterator state via reset() and frees all scratch buffers
+// allocated in the constructor.
+Rdb_iterator_partial::~Rdb_iterator_partial() {
+  reset();
+  my_free(m_cur_prefix_key);
+  m_cur_prefix_key = nullptr;
+  my_free(m_record_buf);
+  m_record_buf = nullptr;
+  my_free(m_pack_buffer);
+  m_pack_buffer = nullptr;
+  my_free(m_sk_packed_tuple);
+  m_sk_packed_tuple = nullptr;
+}
+
+/*
+  Parse start_key and report how many of the group keyparts it contains
+  (*prefix_cnt, capped at m_prefix_keyparts) and the byte length of that
+  prefix including the leading index id (*prefix_len).
+  Returns HA_ERR_INTERNAL_ERROR if the key is too short to hold an index id
+  or a keypart fails to parse.
+*/
+int Rdb_iterator_partial::get_prefix_len(const rocksdb::Slice &start_key,
+                                         uint *prefix_cnt, uint *prefix_len) {
+  Rdb_string_reader reader(&start_key);
+  if ((!reader.read(Rdb_key_def::INDEX_ID_SIZE))) {
+    return HA_ERR_INTERNAL_ERROR;
+  }
+
+  for (uint i = 0; i < m_prefix_keyparts; i++) {
+    // Key ended before all group keyparts were seen: partial prefix.
+    if (reader.remaining_bytes() == 0) {
+      *prefix_cnt = i;
+      *prefix_len = reader.get_current_ptr() - start_key.data();
+      return HA_EXIT_SUCCESS;
+    }
+
+    if (m_kd->read_memcmp_key_part(m_table, &reader, i) > 0) {
+      return HA_ERR_INTERNAL_ERROR;
+    }
+  }
+
+  *prefix_cnt = m_prefix_keyparts;
+  *prefix_len = reader.get_current_ptr() - start_key.data();
+
+  return HA_EXIT_SUCCESS;
+}
+
+/*
+ * Determines the correct group prefix implied by start_key, reading from the
+ * primary key when start_key alone is not enough to pin it down.
+ *
+ * Populates m_cur_prefix_key/m_cur_prefix_key_len.  Any error from
+ * get_prefix_len or the PK seek (including HA_ERR_END_OF_FILE when no
+ * further prefix exists) is returned to the caller.
+ */
+int Rdb_iterator_partial::get_prefix_from_start(
+    enum ha_rkey_function find_flag, const rocksdb::Slice &start_key) {
+  int rc = 0;
+  uint prefix_cnt = 0;
+  uint prefix_len = 0;
+
+  rc = get_prefix_len(start_key, &prefix_cnt, &prefix_len);
+  if (rc) {
+    return rc;
+  }
+  // Zero keyparts means the "prefix" is just the index id.
+  DBUG_ASSERT_IMP(prefix_cnt == 0, prefix_len == Rdb_key_def::INDEX_ID_SIZE);
+
+  // There are 2 scenarios where a read is required to determine the prefix:
+  // 1. There are not enough keyparts in the start_key.
+  // 2. An exclusive seek key is provided, meaning that we need to read the next
+  // prefix.
+  if (prefix_cnt < m_prefix_keyparts ||
+      (prefix_len == start_key.size() &&
+       (find_flag == HA_READ_AFTER_KEY || find_flag == HA_READ_BEFORE_KEY))) {
+    uint tmp;
+
+    rocksdb::Slice empty_end_key;
+
+    // Since the PK/SK share the same prefix, the primary key can be constructed
+    // using the secondary key, with the index_id overwritten.
+    memcpy(m_cur_prefix_key, start_key.data(), prefix_len);
+    rocksdb::Slice seek_key((const char *)m_cur_prefix_key, prefix_len);
+    m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+    rc = m_iterator_pk.seek(find_flag, seek_key, false, empty_end_key);
+    if (rc) {
+      return rc;
+    }
+
+    // Extract the full group prefix from the PK row we landed on.
+    rc = get_prefix_len(m_iterator_pk.key(), &prefix_cnt, &prefix_len);
+    if (rc) {
+      return rc;
+    }
+    memcpy(m_cur_prefix_key, m_iterator_pk.key().data(), prefix_len);
+  } else {
+    // start_key already contains the full group prefix; use it directly.
+    memcpy(m_cur_prefix_key, start_key.data(), prefix_len);
+  }
+
+  m_cur_prefix_key_len = prefix_len;
+  return HA_EXIT_SUCCESS;
+}
+
+/*
+  Advance m_cur_prefix_key to the next (direction == true) or previous
+  (direction == false) group prefix, using the current prefix as an
+  exclusive seek key.  Returns HA_ERR_END_OF_FILE if the new prefix falls
+  outside the scan prefix (m_prefix_tuple).
+*/
+int Rdb_iterator_partial::get_next_prefix(bool direction) {
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  uint tmp;
+
+  // HA_READ_AFTER_KEY/HA_READ_BEFORE_KEY make the current prefix exclusive.
+  int rc = get_prefix_from_start(
+      direction ? HA_READ_AFTER_KEY : HA_READ_BEFORE_KEY, cur_prefix_key);
+  // Stamp the SK index id back over whatever get_prefix_from_start left.
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  cur_prefix_key =
+      rocksdb::Slice((const char *)m_cur_prefix_key, m_cur_prefix_key_len);
+  if (!rc && !m_kd->value_matches_prefix(cur_prefix_key, m_prefix_tuple)) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+  return rc;
+}
+/*
+  Move the iterator to the first row of the next/previous group: advance
+  m_cur_prefix_key, then read the group either from the materialized SK
+  (m_materialized = true) or by sorting its PK rows into m_records
+  (m_materialized = false).
+*/
+int Rdb_iterator_partial::seek_next_prefix(bool direction) {
+  rocksdb::Slice empty_end_key;
+  uint tmp;
+
+  // Fetch next prefix using PK.
+  int rc = get_next_prefix(direction);
+  if (rc) return rc;
+
+  // Rdb_iterator_base::seek below will overwrite m_prefix_tuple, so we save a
+  // copy here.
+  size_t prefix_buf_len = m_prefix_tuple.size();
+  uchar *prefix_buf_copy = (uchar *)my_alloca(prefix_buf_len);
+  memcpy(prefix_buf_copy, m_prefix_buf, prefix_buf_len);
+
+  // First try reading from SK in the current prefix.
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  rc = Rdb_iterator_base::seek(
+      direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key,
+      false, empty_end_key);
+
+  // Restore m_prefix_tuple
+  memcpy(m_prefix_buf, prefix_buf_copy, prefix_buf_len);
+  m_prefix_tuple = rocksdb::Slice((char *)m_prefix_buf, prefix_buf_len);
+
+  if (rc == HA_ERR_END_OF_FILE) {
+    // Nothing in SK, so check PK.
+    rc = read_prefix_from_pk();
+
+    if (rc == 0) {
+      // Not materialized on disk, seek to beginning/end of map.
+      // m_records is ordered by the cf comparator, so a reverse cf flips
+      // which end of the map corresponds to the scan direction.
+      m_materialized = false;
+      if (direction ^ m_kd->m_is_reverse_cf) {
+        m_records_it = m_records.begin();
+      } else {
+        m_records_it = m_records.end();
+        m_records_it--;
+      }
+    } else {
+      // The current prefix was determined by reading from PK in
+      // get_next_prefix, so rows must exist within this prefix on the PK.
+      DBUG_ASSERT(rc != HA_ERR_END_OF_FILE);
+    }
+  } else if (rc == 0) {
+    // Found rows in SK, so use them
+    m_materialized = true;
+  }
+
+  return rc;
+}
+
+/*
+  Materialize the current group: repack every PK row of the group into SK
+  format and write the entries into the SK column family, so future scans of
+  this group read the SK directly.  An exclusive lock on the group prefix
+  serializes concurrent materializers; the write batch can therefore skip
+  transaction concurrency control.
+*/
+int Rdb_iterator_partial::materialize_prefix() {
+  // Local arena for the copies of packed keys/values held by the write
+  // batch until it is written.
+  MEM_ROOT mem_root;
+  init_sql_alloc(PSI_NOT_INSTRUMENTED, &mem_root, 4024, 0);
+  uint tmp;
+  Rdb_transaction *const tx = get_tx_from_thd(m_thd);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+
+  auto s =
+      rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, RDB_LOCK_WRITE);
+  if (!s.ok()) {
+    return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+  }
+
+  // It is possible that someone else has already materialized this group
+  // before we locked. Double check if the prefix is still empty.
+  Rdb_iterator_base iter(m_thd, m_kd, m_pkd, m_tbl_def);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+  int rc = iter.seek(HA_READ_KEY_EXACT, cur_prefix_key, false, cur_prefix_key,
+                     true /* read current */);
+  // rc == 0: group already materialized by someone else, return success.
+  // Any other non-EOF rc is a real error.  Only EOF (group still absent
+  // from SK) falls through to do the materialization.
+  if (rc == 0 || rc != HA_ERR_END_OF_FILE) {
+    rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+    return rc;
+  }
+
+  rocksdb::WriteOptions options;
+  options.sync = false;
+  rocksdb::TransactionDBWriteOptimizations optimize;
+  // Safe because the prefix lock above serializes materialization.
+  optimize.skip_concurrency_control = true;
+
+  auto wb = std::unique_ptr<rocksdb::WriteBatch>(new rocksdb::WriteBatch);
+  m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+  rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+                          cur_prefix_key, true /* read current */);
+  size_t num_rows = 0;
+
+  while (!rc) {
+    if (thd_killed(m_thd)) {
+      rc = HA_ERR_QUERY_INTERRUPTED;
+      goto exit;
+    }
+
+    const rocksdb::Slice &rkey = m_iterator_pk.key();
+    const rocksdb::Slice &rval = m_iterator_pk.value();
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+    if (rc) {
+      goto exit;
+    }
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // Copy out of the reusable pack buffers; the batch keeps pointers.
+    const char *key =
+        (const char *)memdup_root(&mem_root, m_sk_packed_tuple, sk_packed_size);
+    const char *val = (const char *)memdup_root(&mem_root, m_sk_tails.ptr(),
+                                                m_sk_tails.get_current_pos());
+
+    s = wb->Put(m_kd->get_cf(), rocksdb::Slice(key, sk_packed_size),
+                rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+    if (!s.ok()) {
+      rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+      goto exit;
+    }
+
+    num_rows++;
+    rc = m_iterator_pk.next();
+  }
+
+  if (rc != HA_ERR_END_OF_FILE) goto exit;
+  rc = HA_EXIT_SUCCESS;
+
+  // Write the whole group atomically, outside the transaction.
+  s = rdb_get_rocksdb_db()->Write(options, optimize, wb.get());
+  if (!s.ok()) {
+    rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+    goto exit;
+  }
+
+  rocksdb_partial_index_groups_materialized++;
+  rocksdb_partial_index_rows_materialized += num_rows;
+
+exit:
+  rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+  return rc;
+}
+
+/*
+  Load every PK row of the current group into m_records, repacked into SK
+  format and sorted by the SK comparator.  If the group exceeds m_threshold
+  rows, materialize it to disk as well.  Returns HA_ERR_END_OF_FILE if the
+  group turns out to be empty.
+*/
+int Rdb_iterator_partial::read_prefix_from_pk() {
+  uint tmp;
+  int rc = 0;
+  size_t num_rows = 0;
+
+  // Reclaim memory from the previously sorted group before loading a new one.
+  free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+  m_records.clear();
+
+  const char *old_proc_info = m_thd->get_proc_info();
+  thd_proc_info(m_thd, "Materializing group in partial index");
+
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  // Since rocksdb does not support reverse prefix seeks, we always seek in the
+  // forwards direction (even if the PK is a reverse cf).
+  rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+                          cur_prefix_key);
+
+  while (!rc) {
+    if (thd_killed(m_thd)) {
+      rc = HA_ERR_QUERY_INTERRUPTED;
+      goto exit;
+    }
+
+    const rocksdb::Slice &rkey = m_iterator_pk.key();
+    const rocksdb::Slice &rval = m_iterator_pk.value();
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+    if (rc) goto exit;
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // Copy into m_mem_root: m_records stores slices, not owned buffers.
+    const char *key = (const char *)memdup_root(&m_mem_root, m_sk_packed_tuple,
+                                                sk_packed_size);
+    const char *val = (const char *)memdup_root(&m_mem_root, m_sk_tails.ptr(),
+                                                m_sk_tails.get_current_pos());
+
+    m_records.emplace(rocksdb::Slice(key, sk_packed_size),
+                      rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+
+    num_rows++;
+    rc = m_iterator_pk.next();
+  }
+
+  if (rc != HA_ERR_END_OF_FILE) goto exit;
+  rc = HA_EXIT_SUCCESS;
+
+  rocksdb_partial_index_groups_sorted++;
+  rocksdb_partial_index_rows_sorted += num_rows;
+
+  // Groups strictly larger than the threshold get written to the SK cf so
+  // future scans skip this sorting step.
+  if (num_rows > m_threshold) {
+    rc = materialize_prefix();
+  } else if (num_rows == 0) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+exit:
+  thd_proc_info(m_thd, old_proc_info);
+  return rc;
+}
+
+int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
+ const rocksdb::Slice start_key,
+ bool full_key_match,
+ const rocksdb::Slice end_key,
+ bool read_current) {
+ int rc = 0;
+ uint tmp;
+
+ DBUG_ASSERT(!read_current);
+ reset();
+
+ bool direction = (find_flag == HA_READ_KEY_EXACT) ||
+ (find_flag == HA_READ_AFTER_KEY) ||
+ (find_flag == HA_READ_KEY_OR_NEXT);
+
+ // Get current prefix.
+ if ((rc = get_prefix_from_start(find_flag, start_key)) != 0) {
+ return rc;
+ }
+
+ // First try reading from SK in the current prefix.
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ rc = Rdb_iterator_base::seek(find_flag, start_key, full_key_match, end_key,
+ read_current);
+
+ // Check if we're still in our current prefix. If not, we may have missed
+ // some unmaterialized keys, so we have to check PK.
+ if (rc == 0 &&
+ !m_kd->value_matches_prefix(Rdb_iterator_base::key(), cur_prefix_key)) {
+ rc = HA_ERR_END_OF_FILE;
+ }
+
+ bool next_prefix = false;
+
+ if (rc == HA_ERR_END_OF_FILE) {
+ // Nothing in SK, so check PK.
+ rc = read_prefix_from_pk();
+
+ if (rc == HA_ERR_END_OF_FILE) {
+ // Nothing in PK, so move to next prefix.
+ next_prefix = true;
+ } else if (rc == 0) {
+ // Not materialized on disk.
+ m_materialized = false;
+
+ // Seek to correct spot.
+ uchar *start_key_buf = (uchar *)start_key.data();
+
+ // Similar to Rdb_iterator_base::seek, convert start_key into an rocksdb
+ // key that we will actually seek to.
+ auto start_key_guard =
+ create_scope_guard([this, start_key_buf, start_key] {
+ this->m_kd->predecessor(start_key_buf, start_key.size());
+ });
+ if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
+ find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
+ m_kd->successor(start_key_buf, start_key.size());
+ } else {
+ start_key_guard.commit();
+ }
+
+ if (direction) {
+ if (m_kd->m_is_reverse_cf) {
+ // Emulate "SeekForPrev" behaviour.
+ m_records_it = m_records.upper_bound(start_key);
+ if (m_records_it == m_records.begin()) {
+ next_prefix = true;
+ } else {
+ m_records_it--;
+ }
+ } else {
+ m_records_it = m_records.lower_bound(start_key);
+ if (m_records_it == m_records.end()) {
+ next_prefix = true;
+ }
+ }
+ } else {
+ if (m_kd->m_is_reverse_cf) {
+ m_records_it = m_records.upper_bound(start_key);
+ if (m_records_it == m_records.end()) {
+ next_prefix = true;
+ }
+ } else {
+ // Emulate "SeekForPrev" behaviour.
+ m_records_it = m_records.lower_bound(start_key);
+ if (m_records_it == m_records.begin()) {
+ next_prefix = true;
+ } else {
+ m_records_it--;
+ }
+ }
+ }
+ }
+ } else if (rc == 0) {
+ // Found rows in SK, so use them.
+ m_materialized = true;
+ }
+
+ if (next_prefix) {
+ rc = seek_next_prefix(direction);
+ }
+
+ if (!rc) {
+ if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ } else {
+ m_valid = true;
+ }
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::get(const rocksdb::Slice *key,
+ rocksdb::PinnableSlice *value, Rdb_lock_type type,
+ bool skip_ttl_check) {
+ int rc = Rdb_iterator_base::get(key, value, type, skip_ttl_check);
+
+ if (rc == HA_ERR_KEY_NOT_FOUND) {
+ const uint size =
+ m_kd->get_primary_key_tuple(m_table, *m_pkd, key, m_sk_packed_tuple);
+ if (size == RDB_INVALID_KEY_LEN) {
+ return HA_ERR_ROCKSDB_CORRUPT_DATA;
+ }
+
+ rocksdb::Slice pk_key((const char *)m_sk_packed_tuple, size);
+
+ rc = m_iterator_pk.get(&pk_key, value, type, skip_ttl_check);
+ if (rc) return rc;
+
+ // Unpack from PK format
+ rc = m_converter.decode(m_pkd, m_record_buf, &pk_key, value);
+ if (rc) return rc;
+
+ // Repack into SK format
+ uint sk_packed_size = m_kd->pack_record(
+ m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+ false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+ m_converter.get_ttl_bytes_buffer());
+
+ value->PinSelf(
+ rocksdb::Slice((const char *)m_sk_packed_tuple, sk_packed_size));
+ rc = 0;
+ }
+
+ m_valid = false;
+ return rc;
+}
+
+int Rdb_iterator_partial::next_with_direction_in_group(bool direction) {
+ uint tmp;
+ int rc = HA_EXIT_SUCCESS;
+ if (m_materialized) {
+ rc = direction ? Rdb_iterator_base::next() : Rdb_iterator_base::prev();
+
+ if (rc == HA_EXIT_SUCCESS) {
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ if (!m_kd->value_matches_prefix(Rdb_iterator_base::key(),
+ cur_prefix_key)) {
+ return HA_ERR_END_OF_FILE;
+ }
+ }
+ } else {
+ if (direction ^ m_kd->m_is_reverse_cf) {
+ m_records_it++;
+ if (m_records_it == m_records.end()) return HA_ERR_END_OF_FILE;
+ } else {
+ if (m_records_it == m_records.begin()) return HA_ERR_END_OF_FILE;
+ m_records_it--;
+ }
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::next_with_direction(bool direction) {
+ if (!m_valid) return HA_ERR_INTERNAL_ERROR;
+
+ int rc = next_with_direction_in_group(direction);
+
+ if (!rc) {
+ // On success, check if key is still within prefix.
+ if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ }
+ } else if (rc == HA_ERR_END_OF_FILE) {
+ uint tmp;
+ rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+ m_cur_prefix_key_len);
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+ if (m_prefix_tuple.size() >= cur_prefix_key.size()) {
+ DBUG_ASSERT(memcmp(m_prefix_tuple.data(), cur_prefix_key.data(),
+ cur_prefix_key.size()) == 0);
+ return HA_ERR_END_OF_FILE;
+ }
+
+ rc = seek_next_prefix(direction);
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_partial::next() {
+ int rc = next_with_direction(true);
+ if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+ return rc;
+}
+
+int Rdb_iterator_partial::prev() {
+ int rc = next_with_direction(false);
+ if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+ return rc;
+}
+
+void Rdb_iterator_partial::reset() {
+ m_valid = false;
+ m_materialized = false;
+ free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+ m_records.clear();
+ m_iterator_pk.reset();
+ Rdb_iterator_base::reset();
+}
+
+rocksdb::Slice Rdb_iterator_partial::key() {
+ return m_materialized ? Rdb_iterator_base::key() : m_records_it->first;
+}
+
+rocksdb::Slice Rdb_iterator_partial::value() {
+ return m_materialized ? Rdb_iterator_base::value() : m_records_it->second;
+}
+
} // namespace myrocks
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 2a0f5bd5760..73c2deb3850 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -75,7 +75,7 @@ class Rdb_iterator_base : public Rdb_iterator {
int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
bool full_key_match, const rocksdb::Slice end_key,
- bool read_current) override;
+ bool read_current = false) override;
int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
Rdb_lock_type type, bool skip_ttl_check = false) override;
@@ -118,4 +118,70 @@ class Rdb_iterator_base : public Rdb_iterator {
rocksdb::Slice m_prefix_tuple;
};
+class Rdb_iterator_partial : public Rdb_iterator_base {
+ private:
+ TABLE *m_table;
+ MEM_ROOT m_mem_root;
+
+ Rdb_iterator_base m_iterator_pk;
+ Rdb_converter m_converter;
+
+ bool m_valid;
+ bool m_materialized;
+
+ const uint m_threshold;
+ const uint m_prefix_keyparts;
+
+ uchar *m_cur_prefix_key;
+ uint m_cur_prefix_key_len;
+
+ uchar *m_record_buf;
+ uchar *m_pack_buffer;
+ uchar *m_sk_packed_tuple;
+
+ Rdb_string_writer m_sk_tails;
+
+ int get_prefix_len(const rocksdb::Slice &start_key, uint *prefix_cnt,
+ uint *prefix_len);
+ int get_prefix_from_start(enum ha_rkey_function find_flag,
+ const rocksdb::Slice &start_key);
+ int get_next_prefix(bool direction);
+ int seek_next_prefix(bool direction);
+ int materialize_prefix();
+ int read_prefix_from_pk();
+ int next_with_direction_in_group(bool direction);
+ int next_with_direction(bool direction);
+
+ struct slice_comparator {
+ slice_comparator(const rocksdb::Comparator *c) : m_comparator(c) {}
+ const rocksdb::Comparator *const m_comparator;
+
+ bool operator()(const rocksdb::Slice &lhs, const rocksdb::Slice &rhs) {
+ return m_comparator->Compare(lhs, rhs) < 0;
+ }
+ };
+
+ std::map<const rocksdb::Slice, const rocksdb::Slice, slice_comparator>
+ m_records;
+ std::map<const rocksdb::Slice, const rocksdb::Slice,
+ slice_comparator>::iterator m_records_it;
+
+ public:
+ Rdb_iterator_partial(THD *thd, const std::shared_ptr<Rdb_key_def> kd,
+ const std::shared_ptr<Rdb_key_def> pkd,
+ const Rdb_tbl_def *tbl_def, TABLE *table);
+ ~Rdb_iterator_partial() override;
+
+ int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
+ bool full_key_match, const rocksdb::Slice end_key,
+ bool read_current = false) override;
+ int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
+ Rdb_lock_type type, bool skip_ttl_check = false) override;
+ int next() override;
+ int prev() override;
+ rocksdb::Slice key() override;
+ rocksdb::Slice value() override;
+ void reset() override;
+};
+
} // namespace myrocks
1
0
revision-id: 0df64d186279f0688d95a5770523f6a53fd16e58 (percona-202102-55-g0df64d18627)
parent(s): 4f9c13c2622f46ae6324c0253989f416a495eaf3
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:44:09 +0300
message:
Apply the Partial Iterator fix from Manuel
---
storage/rocksdb/rdb_iterator.cc | 1 +
1 file changed, 1 insertion(+)
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index eac051db9a7..2c527e7f439 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -636,6 +636,7 @@ int Rdb_iterator_partial::materialize_prefix() {
rocksdb_partial_index_rows_materialized += num_rows;
exit:
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
return rc;
}
1
0
revision-id: 4f9c13c2622f46ae6324c0253989f416a495eaf3 (percona-202102-54-g4f9c13c2622)
parent(s): aaebd623e98e59db3efe1b231307e4142240c485
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:43:56 +0300
message:
Apply patch: Add partial index iterator
Summary:
This adds the partial index iterator. It is a special iterator that sorts groups from the primary key on the fly as needed, and if it exceeds a certain threshold, it will materialize the rows on the secondary key as well.
For point queries, if the secondary key read does not find the key, we simply extract the primary key parts and read from the primary key. This means that point queries don't trigger materialization though.
Test Plan: mtr
Reviewers: luqun, herman, yzha, #mysql_eng
Subscribers: phabricatorlinter, pgl
Differential Revision: https://phabricator.intern.facebook.com/D25933178
---
mysql-test/include/diff_queries.inc | 14 +
mysql-test/suite/rocksdb/r/partial_index.result | 146 +++++
.../suite/rocksdb/r/partial_index_assoc.result | 171 ++++++
.../suite/rocksdb/r/partial_index_stress.result | 74 +++
mysql-test/suite/rocksdb/r/rocksdb.result | 12 +
mysql-test/suite/rocksdb/t/partial_index.inc | 156 ++++++
mysql-test/suite/rocksdb/t/partial_index.test | 19 +
.../suite/rocksdb/t/partial_index_assoc-master.opt | 3 +
mysql-test/suite/rocksdb/t/partial_index_assoc.inc | 95 ++++
.../suite/rocksdb/t/partial_index_assoc.test | 58 ++
.../rocksdb/t/partial_index_stress-master.opt | 3 +
mysql-test/suite/rocksdb/t/partial_index_stress.py | 114 ++++
.../suite/rocksdb/t/partial_index_stress.test | 64 +++
storage/rocksdb/ha_rocksdb.cc | 76 ++-
storage/rocksdb/ha_rocksdb.h | 8 +
storage/rocksdb/rdb_datadic.h | 6 +
storage/rocksdb/rdb_iterator.cc | 590 +++++++++++++++++++++
storage/rocksdb/rdb_iterator.h | 68 ++-
18 files changed, 1669 insertions(+), 8 deletions(-)
diff --git a/mysql-test/include/diff_queries.inc b/mysql-test/include/diff_queries.inc
new file mode 100644
index 00000000000..beb75093759
--- /dev/null
+++ b/mysql-test/include/diff_queries.inc
@@ -0,0 +1,14 @@
+--disable_query_log
+
+--output $MYSQL_TMP_DIR/A
+--eval $query1
+
+--output $MYSQL_TMP_DIR/B
+--eval $query2
+
+--enable_query_log
+
+--diff_files $MYSQL_TMP_DIR/A $MYSQL_TMP_DIR/B
+
+--remove_file $MYSQL_TMP_DIR/A
+--remove_file $MYSQL_TMP_DIR/B
diff --git a/mysql-test/suite/rocksdb/r/partial_index.result b/mysql-test/suite/rocksdb/r/partial_index.result
new file mode 100644
index 00000000000..a752d37180c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index.result
@@ -0,0 +1,146 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+INSERT INTO t values ("11111112", "1", "1", "1");
+DROP TABLE t;
+CREATE TABLE t (i int, j int, k int, l int,
+PRIMARY KEY (i, j),
+KEY ik1 (i, k) COMMENT 'cfname=rev:cf;partial_group_keyparts=1;partial_group_threshold=5',
+KEY ik2 (i, k) COMMENT 'cfname=rev:cf'
+) ENGINE=ROCKSDB;
+Warnings:
+Warning 1831 Duplicate index 'ik2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release.
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+INSERT INTO t values (4, 1, 1, 1);
+DROP TABLE t;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_assoc.result b/mysql-test/suite/rocksdb/r/partial_index_assoc.result
new file mode 100644
index 00000000000..fbb89d16b35
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_assoc.result
@@ -0,0 +1,171 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(4) NOT NULL DEFAULT '0',
+`data` text COLLATE latin1_bin NOT NULL,
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 40
+rocksdb_partial_index_groups_sorted 47
+rocksdb_partial_index_rows_materialized 955
+rocksdb_partial_index_rows_sorted 1000
+include/assert.inc [Check that materialized groups are non-zero.]
+include/assert.inc [Check that materialized rows are non-zero.]
+DROP TABLE t1, t2;
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+variable_name diff
+rocksdb_partial_index_groups_materialized 0
+rocksdb_partial_index_groups_sorted 7
+rocksdb_partial_index_rows_materialized 0
+rocksdb_partial_index_rows_sorted 45
+include/assert.inc [Check that materialized groups are zero.]
+include/assert.inc [Check that materialized rows are zero.]
+DROP TABLE t1, t2;
+DROP TABLE assoc_table;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_stress.result b/mysql-test/suite/rocksdb/r/partial_index_stress.result
new file mode 100644
index 00000000000..88f77bcc63f
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_stress.result
@@ -0,0 +1,74 @@
+set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set global rocksdb_lock_wait_timeout = 100000;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+`raw_key` text COLLATE latin1_bin,
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(3) NOT NULL DEFAULT '0',
+`data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table;
+CREATE TABLE `assoc_table` (
+`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+`id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+`assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+`visibility` tinyint(4) NOT NULL DEFAULT '0',
+`data` text COLLATE latin1_bin NOT NULL,
+`time` int(10) unsigned NOT NULL DEFAULT '0',
+`version` bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table;
+set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index 61dd8184ddf..5c440c88317 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -1656,6 +1656,10 @@ rocksdb_number_sst_entry_singledelete #
rocksdb_number_superversion_acquires #
rocksdb_number_superversion_cleanups #
rocksdb_number_superversion_releases #
+rocksdb_partial_index_groups_materialized #
+rocksdb_partial_index_groups_sorted #
+rocksdb_partial_index_rows_materialized #
+rocksdb_partial_index_rows_sorted #
rocksdb_row_lock_deadlocks #
rocksdb_row_lock_wait_timeouts #
rocksdb_select_bypass_executed #
@@ -1760,6 +1764,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
ROCKSDB_NUMBER_SUPERVERSION_RELEASES
ROCKSDB_NUM_ITERATORS
+ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED
+ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_ROWS_SORTED
ROCKSDB_QUERIES_POINT
ROCKSDB_QUERIES_RANGE
ROCKSDB_ROWS_DELETED
@@ -1883,6 +1891,10 @@ ROCKSDB_NUMBER_SUPERVERSION_ACQUIRES
ROCKSDB_NUMBER_SUPERVERSION_CLEANUPS
ROCKSDB_NUMBER_SUPERVERSION_RELEASES
ROCKSDB_NUM_ITERATORS
+ROCKSDB_PARTIAL_INDEX_GROUPS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_GROUPS_SORTED
+ROCKSDB_PARTIAL_INDEX_ROWS_MATERIALIZED
+ROCKSDB_PARTIAL_INDEX_ROWS_SORTED
ROCKSDB_QUERIES_POINT
ROCKSDB_QUERIES_RANGE
ROCKSDB_ROWS_DELETED
diff --git a/mysql-test/suite/rocksdb/t/partial_index.inc b/mysql-test/suite/rocksdb/t/partial_index.inc
new file mode 100644
index 00000000000..7137a696d6a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index.inc
@@ -0,0 +1,156 @@
+eval
+CREATE TABLE t (i varchar(64), j varchar(64), k varchar(64), l varchar(64),
+ PRIMARY KEY (i, j),
+ KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5',
+ KEY ik2 (i, k) COMMENT 'cfname=$cfname'
+) ENGINE=ROCKSDB;
+
+INSERT INTO t values ("1", "1", "2", "1");
+INSERT INTO t values ("1", "2", "1", "1");
+
+INSERT INTO t values ("11111111", "1", "9", "1");
+INSERT INTO t values ("11111111", "2", "8", "1");
+INSERT INTO t values ("11111111", "3", "7", "1");
+INSERT INTO t values ("11111111", "4", "5", "1");
+INSERT INTO t values ("11111111", "5", "4", "1");
+INSERT INTO t values ("11111111", "6", "2", "1");
+
+INSERT INTO t values ("111111111", "1", "9", "1");
+INSERT INTO t values ("111111111", "2", "2", "1");
+
+INSERT INTO t values ("11111112", "1", "1", "1");
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "1" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "1" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111112" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111112" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k < "5" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "2" AND k < "5" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = "11111111" AND k > "7" AND k < "9" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= "111111110" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= "111111110" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "1%" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "1%" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i LIKE "11111111%" ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+DROP TABLE t;
+
+eval
+CREATE TABLE t (i int, j int, k int, l int,
+ PRIMARY KEY (i, j),
+ KEY ik1 (i, k) COMMENT 'cfname=$cfname;partial_group_keyparts=1;partial_group_threshold=5',
+ KEY ik2 (i, k) COMMENT 'cfname=$cfname'
+) ENGINE=ROCKSDB;
+
+INSERT INTO t values (1, 1, 2, 1);
+INSERT INTO t values (1, 2, 1, 1);
+
+INSERT INTO t values (2, 1, 9, 1);
+INSERT INTO t values (2, 2, 8, 1);
+INSERT INTO t values (2, 3, 7, 1);
+INSERT INTO t values (2, 4, 5, 1);
+INSERT INTO t values (2, 5, 4, 1);
+INSERT INTO t values (2, 6, 2, 1);
+
+INSERT INTO t values (4, 1, 1, 1);
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 1 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 1 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 4 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 4 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k < 5 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 2 AND k < 5 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i = 2 AND k > 7 AND k < 9 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i < 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i < 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i > 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i > 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i >= 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i >= 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+--let $query1= SELECT i, j, k FROM t FORCE INDEX (ik1) WHERE i <= 3 ORDER BY i $asc , k $asc;
+--let $query2= SELECT i, j, k FROM t FORCE INDEX (ik2) WHERE i <= 3 ORDER BY i $asc , k $asc;
+--source include/diff_queries.inc
+
+DROP TABLE t;
diff --git a/mysql-test/suite/rocksdb/t/partial_index.test b/mysql-test/suite/rocksdb/t/partial_index.test
new file mode 100644
index 00000000000..410c772765e
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index.test
@@ -0,0 +1,19 @@
+set optimizer_force_index_for_range = on;
+
+--let $asc=ASC
+--let $cfname=cf
+--source partial_index.inc
+
+--let $asc=ASC
+--let $cfname=rev:cf
+--source partial_index.inc
+
+--let $asc=DESC
+--let $cfname=cf
+--source partial_index.inc
+
+--let $asc=DESC
+--let $cfname=rev:cf
+--source partial_index.inc
+
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt
new file mode 100644
index 00000000000..81bc90b0531
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc-master.opt
@@ -0,0 +1,3 @@
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000
+--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m};
+
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.inc b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc
new file mode 100644
index 00000000000..d0508a3f40b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.inc
@@ -0,0 +1,95 @@
+--let $binary_id1=1
+if (`select DATA_TYPE = 'binary' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'id1'`) {
+ --let $binary_id1="1"
+}
+--let $text=`select DATA_TYPE = 'text' from information_schema.columns where TABLE_NAME = 'assoc_table' and TABLE_SCHEMA = 'test' and COLUMN_NAME = 'data'`
+
+# This creates 10 distinct types, with up to 9 distinct id1s per type, to give up to 90 groups.
+--disable_query_log
+let $i=0;
+while ($i < 1000)
+{
+ if ($text) {
+ eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 50)), FLOOR(RAND($i) * 100000), 789);
+ }
+ if (!$text) {
+ eval INSERT INTO assoc_table VALUES (FLOOR(RAND($i) * 9), 123, $i, 456, FLOOR($i / 100), FLOOR(RAND($i) * 2), REPEAT("1234567890", FLOOR(RAND($i) * 20)), FLOOR(RAND($i) * 100000), 789);
+ }
+ inc $i;
+}
+--enable_query_log
+
+let $i=0;
+while ($i < 10) {
+ # This gives a range plan
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 1 AND time >= 100 AND time <= 50000 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --source include/diff_queries.inc
+
+ # This gives a ref plan
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = $i AND id1 = $binary_id1 AND visibility = 0 AND time >= 0 AND time <= 4294967295 ORDER BY time DESC, id2 DESC LIMIT 10000
+ --source include/diff_queries.inc
+
+ inc $i;
+}
+
+let $i=0;
+while ($i < 10) {
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type = 1
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type = 1
+ --source include/diff_queries.inc
+
+ --let $query1= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type) WHERE assoc_type <= 2
+ --let $query2= SELECT id1, id2, assoc_type, visibility, data, time, version FROM assoc_table FORCE INDEX (id1_type2) WHERE assoc_type <= 2
+ --source include/diff_queries.inc
+
+ inc $i;
+}
+
+# Rebuild the table so that nothing is materialized anymore.
+ALTER TABLE assoc_table ENGINE=ROCKSDB;
+
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+--let $query1= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type)
+--let $query2= SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2)
+--source include/diff_queries.inc
+
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+--let $assert_text = Check that materialized groups are non-zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] > 0
+--source include/assert.inc
+--let $assert_text = Check that materialized rows are non-zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] > 0
+--source include/assert.inc
+DROP TABLE t1, t2;
+
+# Rerun full index scan a second time, and check that no materialization occurs
+CREATE TEMPORARY TABLE t1 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+--disable_result_log
+SELECT id1, id2, assoc_type, visibility, time, version FROM assoc_table FORCE INDEX (id1_type2);
+--enable_result_log
+
+CREATE TEMPORARY TABLE t2 AS
+SELECT * FROM performance_schema.global_status
+WHERE variable_name LIKE 'rocksdb_partial_index%';
+
+SELECT variable_name, t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name);
+--let $assert_text = Check that materialized groups are zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_groups_materialized", diff, 1] = 0
+--source include/assert.inc
+--let $assert_text = Check that materialized rows are zero.
+--let $assert_cond = [SELECT t2.variable_value - t1.variable_value AS diff FROM t1 JOIN t2 USING (variable_name) WHERE variable_name = "rocksdb_partial_index_rows_materialized", diff, 1] = 0
+--source include/assert.inc
+
+DROP TABLE t1, t2;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_assoc.test b/mysql-test/suite/rocksdb/t/partial_index_assoc.test
new file mode 100644
index 00000000000..a559c67f673
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_assoc.test
@@ -0,0 +1,58 @@
+set optimizer_force_index_for_range = on;
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+ `raw_key` text COLLATE latin1_bin,
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(4) NOT NULL DEFAULT '0',
+ `data` text COLLATE latin1_bin NOT NULL,
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+
+--source partial_index_assoc.inc
+
+DROP TABLE assoc_table;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt
new file mode 100644
index 00000000000..a105847c183
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress-master.opt
@@ -0,0 +1,3 @@
+--initialize --default_authentication_plugin=mysql_native_password
+--rocksdb_default_cf_options=write_buffer_size=128m;target_file_size_base=32m;max_bytes_for_level_base=512m;level0_file_num_compaction_trigger=4;level0_slowdown_writes_trigger=20;level0_stop_writes_trigger=30;max_write_buffer_number=4;compression_per_level=kLZ4Compression;bottommost_compression=kZSTD;compression_opts=-14:6:0;block_based_table_factory={cache_index_and_filter_blocks=1;filter_policy=bloomfilter:10:false;whole_key_filtering=0};prefix_extractor=capped:12;level_compaction_dynamic_level_bytes=true;optimize_filters_for_hits=true;memtable_prefix_bloom_size_ratio=0.039;max_compaction_bytes=402653184;report_bg_io_stats=true;compaction_pri=kMinOverlappingRatio;soft_pending_compaction_bytes_limit=20480000000
+--rocksdb_override_cf_options=cf_assoc={prefix_extractor=capped:28};cf_assoc_count={prefix_extractor=capped:20};rev:cf_assoc_id1_type={prefix_extractor=capped:20};cf_fbobj_type_id={prefix_extractor=capped:16};cf_assoc_disagg={prefix_extractor=capped:20};__system__={write_buffer_size=16m};
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.py b/mysql-test/suite/rocksdb/t/partial_index_stress.py
new file mode 100644
index 00000000000..07220d88705
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.py
@@ -0,0 +1,114 @@
+"""
+This script stress tests partial indexes by performing writes while concurrently checking PK/SK consistency.
+
+Usage: partial_index_stress.py user host port db_name table_name
+ num_iters num_threads
+"""
+import MySQLdb
+import random
+import sys
+import threading
+import traceback
+
+def get_query(table_name, binary_id1):
+ assoc_type = random.randint(1, 2)
+ id1 = random.randint(1, 5)
+ id2 = random.randint(1, 20)
+
+ r = random.randint(1, 3)
+
+ if r == 1:
+ if binary_id1:
+ return """DELETE FROM %s WHERE id1 = "%d" and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type)
+ else:
+ return """DELETE FROM %s WHERE id1 = %d and id2 = %d and assoc_type = %d""" % (table_name, id1, id2, assoc_type)
+ else:
+ return """INSERT INTO %s VALUES (%d, 0, %d, 0, %d, 1, 'abc', 100, 1) ON DUPLICATE KEY UPDATE time=time+10, version=version+1""" % (table_name, id1, id2, assoc_type)
+
+class Worker(threading.Thread):
+ def __init__(self, con, table_name, num_iters, check, event):
+ threading.Thread.__init__(self)
+ self.con = con
+ self.table_name = table_name
+ self.num_iters = num_iters
+ self.check = check
+ self.event = event
+ self.exception = None
+ self.start()
+
+ def run(self):
+ try:
+ if self.check:
+ self.run_check()
+ else:
+ self.run_write()
+ except Exception as e:
+ self.exception = traceback.format_exc()
+
+ def run_write(self):
+ cur = self.con.cursor()
+        cur.execute("select data_type from information_schema.columns where table_schema = database() and table_name = '%s' and column_name = 'id1'" % self.table_name)
+ binary_id1 = cur.fetchone()[0] == "binary"
+ cur.execute("SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED")
+ for x in range(self.num_iters):
+ try:
+ cur.execute(get_query(self.table_name, binary_id1))
+ self.con.commit()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ raise e
+
+ def run_check(self):
+ cur = self.con.cursor()
+ while not self.event.is_set():
+ try:
+ cur.execute("SELECT COUNT(*) FROM %s FORCE INDEX(PRIMARY) UNION ALL SELECT COUNT(*) FROM %s FORCE INDEX(id1_type)" % (self.table_name, self.table_name))
+ pk_count = cur.fetchone()[0]
+ sk_count = cur.fetchone()[0]
+ assert pk_count == sk_count, "Count mismatch %d != %d" % (pk_count, sk_count)
+ self.con.commit()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ raise e
+
+if __name__ == '__main__':
+ if len(sys.argv) != 8:
+ print("Usage: partial_index_stress.py user host port db_name " \
+ "table_name num_iters num_threads")
+ sys.exit(1)
+
+ user = sys.argv[1]
+ host = sys.argv[2]
+ port = int(sys.argv[3])
+ db = sys.argv[4]
+ table_name = sys.argv[5]
+ num_iters = int(sys.argv[6])
+ num_workers = int(sys.argv[7])
+
+    done_event = threading.Event()
+
+ worker_failed = False
+ workers = []
+ for i in range(num_workers):
+ w = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ num_iters, False, None)
+ workers.append(w)
+
+ checker = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ num_iters, True, done_event)
+
+ for w in workers:
+ w.join()
+ if w.exception:
+ print("Worker hit an exception:\n%s\n" % w.exception)
+ worker_failed = True
+
+    done_event.set()
+    checker.join()
+    if checker.exception:
+        print("Checker hit an exception:\n%s\n" % checker.exception)
+    if worker_failed or checker.exception: sys.exit(1)
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.test b/mysql-test/suite/rocksdb/t/partial_index_stress.test
new file mode 100644
index 00000000000..c78e8cb980e
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.test
@@ -0,0 +1,64 @@
+#
+# Stress partial indexes by performing writes, and checking that PK/SK are still consistent.
+#
+
+set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set global rocksdb_lock_wait_timeout = 100000;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+ `raw_key` text COLLATE latin1_bin,
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(3) NOT NULL DEFAULT '0',
+ `data` varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+CREATE TABLE `assoc_table` (
+ `id1` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id1_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `id2` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `id2_type` int(10) unsigned NOT NULL DEFAULT '0',
+ `assoc_type` bigint(20) unsigned NOT NULL DEFAULT '0',
+ `visibility` tinyint(4) NOT NULL DEFAULT '0',
+ `data` text COLLATE latin1_bin NOT NULL,
+ `time` int(10) unsigned NOT NULL DEFAULT '0',
+ `version` bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+
+DROP TABLE assoc_table;
+
+set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 15c8b289a5e..452d2a9ff8f 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -802,6 +802,11 @@ std::atomic<uint64_t> rocksdb_select_bypass_executed(0);
std::atomic<uint64_t> rocksdb_select_bypass_rejected(0);
std::atomic<uint64_t> rocksdb_select_bypass_failed(0);
+std::atomic<uint64_t> rocksdb_partial_index_groups_sorted(0);
+std::atomic<uint64_t> rocksdb_partial_index_groups_materialized(0);
+std::atomic<uint64_t> rocksdb_partial_index_rows_sorted(0);
+std::atomic<uint64_t> rocksdb_partial_index_rows_materialized(0);
+
static int rocksdb_trace_block_cache_access(
THD *const thd MY_ATTRIBUTE((__unused__)),
struct SYS_VAR *const var MY_ATTRIBUTE((__unused__)), void *const save,
@@ -3111,7 +3116,7 @@ class Rdb_transaction {
virtual void set_sync(bool sync) = 0;
virtual void release_lock(const Rdb_key_def &key_descr,
- const std::string &rowkey) = 0;
+ const std::string &rowkey, bool force = false) = 0;
virtual bool prepare() = 0;
@@ -3714,9 +3719,9 @@ class Rdb_transaction_impl : public Rdb_transaction {
m_rocksdb_tx->GetWriteOptions()->sync = sync;
}
- void release_lock(const Rdb_key_def &key_descr,
- const std::string &rowkey) override {
- if (!THDVAR(m_thd, lock_scanned_rows)) {
+ void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey,
+ bool force) override {
+ if (!THDVAR(m_thd, lock_scanned_rows) || force) {
m_rocksdb_tx->UndoGetForUpdate(key_descr.get_cf(),
rocksdb::Slice(rowkey));
// row_lock_count track row(pk)
@@ -4167,7 +4172,8 @@ class Rdb_writebatch_impl : public Rdb_transaction {
void set_sync(bool sync) override { write_opts.sync = sync; }
void release_lock(const Rdb_key_def &key_descr MY_ATTRIBUTE((unused)),
- const std::string &rowkey MY_ATTRIBUTE((unused))) override {
+ const std::string &rowkey MY_ATTRIBUTE((unused)),
+ bool force MY_ATTRIBUTE((unused))) override {
// Nothing to do here since we don't hold any row locks.
}
@@ -6922,6 +6928,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
active_index = table->s->next_number_index;
const uint8 save_table_status = table->m_status;
+ DBUG_ASSERT(!m_key_descr_arr[active_index_pos()]->is_partial_index());
std::unique_ptr<Rdb_iterator> save_iterator(new Rdb_iterator_base(
ha_thd(), m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
std::swap(m_iterator, save_iterator);
@@ -10203,6 +10210,7 @@ int ha_rocksdb::check_and_lock_sk(
The bloom filter may need to be disabled for this lookup.
*/
+ DBUG_ASSERT(!m_key_descr_arr[key_id]->is_partial_index());
Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[key_id], m_pk_descr,
m_tbl_def);
int rc = HA_EXIT_SUCCESS;
@@ -10608,6 +10616,34 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
bytes_written = old_key_slice.size();
}
+ if (kd.is_partial_index()) {
+ // Obtain shared lock on prefix.
+ int size = kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
+ m_sk_packed_tuple, nullptr, false, 0,
+ kd.partial_index_keyparts());
+ const rocksdb::Slice prefix_slice =
+ rocksdb::Slice((const char *)m_sk_packed_tuple, size);
+
+ const rocksdb::Status s = row_info.tx->get_for_update(
+ kd, prefix_slice, nullptr, false /* exclusive */,
+ false /* do validate */);
+ if (!s.ok()) {
+ return row_info.tx->set_status_error(table_arg->in_use, s, kd, m_tbl_def,
+ m_table_handler);
+ }
+
+ // Check if this prefix has been materialized.
+ Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[kd.get_keyno()],
+ m_pk_descr, m_tbl_def);
+ rc = iter.seek(HA_READ_KEY_EXACT, prefix_slice, false, prefix_slice,
+ true /* read current */);
+
+ // We can skip updating the index, if the prefix is not materialized.
+ if (rc == HA_ERR_END_OF_FILE || rc == HA_ERR_KEY_NOT_FOUND) {
+ return 0;
+ }
+ }
+
new_key_slice = rocksdb::Slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
new_value_slice =
@@ -10897,8 +10933,15 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
DBUG_ASSERT(tx != nullptr);
active_index = idx;
- m_iterator.reset(new Rdb_iterator_base(
- thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ if (idx != table->s->primary_key &&
+ m_key_descr_arr[idx]->is_partial_index()) {
+ m_iterator.reset(
+ new Rdb_iterator_partial(thd, m_key_descr_arr[active_index_pos()],
+ m_pk_descr, m_tbl_def, table));
+ } else {
+ m_iterator.reset(new Rdb_iterator_base(
+ thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ }
// If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
// when accessing the index, so don't acquire the snapshot right away.
@@ -13379,6 +13422,9 @@ int ha_rocksdb::inplace_populate_sk(
THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
for (const auto &index : indexes) {
+ // Skip populating partial indexes for now.
+ if (index->is_partial_index()) continue;
+
bool is_unique_index =
new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
@@ -14210,6 +14256,17 @@ static SHOW_VAR rocksdb_status_vars[] = {
&rocksdb_select_bypass_rejected, SHOW_LONGLONG),
DEF_STATUS_VAR_PTR("select_bypass_failed", &rocksdb_select_bypass_failed,
SHOW_LONGLONG),
+
+ DEF_STATUS_VAR_PTR("partial_index_groups_sorted",
+ &rocksdb_partial_index_groups_sorted, SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_groups_materialized",
+ &rocksdb_partial_index_groups_materialized,
+ SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_rows_sorted",
+ &rocksdb_partial_index_rows_sorted, SHOW_LONGLONG),
+ DEF_STATUS_VAR_PTR("partial_index_rows_materialized",
+ &rocksdb_partial_index_rows_materialized, SHOW_LONGLONG),
+
// the variables generated by SHOW_FUNC are sorted only by prefix (first
// arg in the tuple below), so make sure it is unique to make sorting
// deterministic as quick sort is not stable
@@ -15844,6 +15901,11 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
return s;
}
+// Release a row lock previously acquired through rdb_tx_get_for_update().
+// Relies on release_lock()'s default force=false, so the lock is retained
+// when the lock_scanned_rows session variable is set.
+void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
+                         const rocksdb::Slice &key) {
+  tx->release_lock(kd, std::string(key.data(), key.size()));
+}
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 369af0bc9c4..d0baeefe942 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -1166,6 +1166,9 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
rocksdb::PinnableSlice *const value,
bool exclusive);
+void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
+ const rocksdb::Slice &key);
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -1218,4 +1221,9 @@ extern std::atomic<uint64_t> rocksdb_select_bypass_executed;
extern std::atomic<uint64_t> rocksdb_select_bypass_rejected;
extern std::atomic<uint64_t> rocksdb_select_bypass_failed;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
+extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
+
} // namespace myrocks
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 2c5828a6b8a..1b12c33d8d7 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -586,6 +586,12 @@ class Rdb_key_def {
uint extract_partial_index_info(const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg);
inline bool is_partial_index() const { return m_partial_index_threshold > 0; }
+  // Group-size threshold above which a partial index group gets
+  // materialized; 0 means this is not a partial index (see
+  // is_partial_index()).
+  inline uint partial_index_threshold() const {
+    return m_partial_index_threshold;
+  }
+  // Number of leading keyparts that define a partial index group prefix.
+  inline uint partial_index_keyparts() const {
+    return m_partial_index_keyparts;
+  }
static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
static uint32 calculate_index_flag_offset(uint32 index_flags,
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 529cd6dacae..eac051db9a7 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -16,7 +16,10 @@
#include "./rdb_iterator.h"
+/* MySQL includes */
#include "scope_guard.h"
+#include "sql/sql_class.h"
+#include "sql/thr_malloc.h"
namespace myrocks {
@@ -356,4 +359,591 @@ int Rdb_iterator_base::get(const rocksdb::Slice *key,
return rc;
}
+// Construct a partial index iterator over secondary key 'kd'; 'pkd' is the
+// primary key used to source rows for unmaterialized groups (m_iterator_pk).
+// Scratch buffers are sized for the larger of the PK/SK storage formats
+// because both key formats are packed into them.
+Rdb_iterator_partial::Rdb_iterator_partial(
+    THD *thd, const std::shared_ptr<Rdb_key_def> kd,
+    const std::shared_ptr<Rdb_key_def> pkd, const Rdb_tbl_def *tbl_def,
+    TABLE *table)
+    : Rdb_iterator_base(thd, kd, pkd, tbl_def),
+      m_table(table),
+      m_iterator_pk(thd, pkd, pkd, tbl_def),
+      m_converter(thd, tbl_def, table),
+      m_valid(false),
+      m_materialized(false),
+      m_threshold(kd->partial_index_threshold()),
+      m_prefix_keyparts(kd->partial_index_keyparts()),
+      m_cur_prefix_key_len(0),
+      m_records(slice_comparator(m_kd->get_cf()->GetComparator())),
+      m_records_it(m_records.end()) {
+  // Arena backing the key/value copies held by m_records.
+  // NOTE(review): block size 4024 looks like a typo for 4096 — confirm.
+  init_sql_alloc(PSI_NOT_INSTRUMENTED, &m_mem_root, 4024, 0);
+  m_converter.setup_field_decoders(table->read_set, true /* decode all */);
+
+  const uint packed_len =
+      std::max(m_kd->max_storage_fmt_length(), m_pkd->max_storage_fmt_length());
+  m_cur_prefix_key = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+  m_record_buf = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, table->s->reclength, MYF(0)));
+  m_pack_buffer = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+  m_sk_packed_tuple = reinterpret_cast<uchar *>(
+      my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+}
+
+Rdb_iterator_partial::~Rdb_iterator_partial() {
+  // reset() clears m_records and the mem_root before the scratch buffers
+  // those entries were packed from are freed.
+  reset();
+  my_free(m_cur_prefix_key);
+  m_cur_prefix_key = nullptr;
+  my_free(m_record_buf);
+  m_record_buf = nullptr;
+  my_free(m_pack_buffer);
+  m_pack_buffer = nullptr;
+  my_free(m_sk_packed_tuple);
+  m_sk_packed_tuple = nullptr;
+}
+
+// Determine how many of the first m_prefix_keyparts keyparts are present in
+// start_key (*prefix_cnt) and the packed byte length of that prefix
+// including the leading index id (*prefix_len).  Returns
+// HA_ERR_INTERNAL_ERROR if the key cannot be parsed.
+int Rdb_iterator_partial::get_prefix_len(const rocksdb::Slice &start_key,
+                                         uint *prefix_cnt, uint *prefix_len) {
+  Rdb_string_reader reader(&start_key);
+  if ((!reader.read(Rdb_key_def::INDEX_ID_SIZE))) {
+    return HA_ERR_INTERNAL_ERROR;
+  }
+
+  for (uint i = 0; i < m_prefix_keyparts; i++) {
+    // Fewer keyparts in start_key than the group prefix needs.
+    if (reader.remaining_bytes() == 0) {
+      *prefix_cnt = i;
+      *prefix_len = reader.get_current_ptr() - start_key.data();
+      return HA_EXIT_SUCCESS;
+    }
+
+    if (m_kd->read_memcmp_key_part(m_table, &reader, i) > 0) {
+      return HA_ERR_INTERNAL_ERROR;
+    }
+  }
+
+  *prefix_cnt = m_prefix_keyparts;
+  *prefix_len = reader.get_current_ptr() - start_key.data();
+
+  return HA_EXIT_SUCCESS;
+}
+
+/*
+ * Determines the correct prefix from start_key by reading from primary key if
+ * needed.
+ *
+ * Populates m_cur_prefix_key/m_cur_prefix_key_len.
+ *
+ * Returns HA_EXIT_SUCCESS, HA_ERR_END_OF_FILE (no further prefix exists on
+ * the PK), or an error code from parsing/seeking.
+ */
+int Rdb_iterator_partial::get_prefix_from_start(
+    enum ha_rkey_function find_flag, const rocksdb::Slice &start_key) {
+  int rc = 0;
+  uint prefix_cnt = 0;
+  uint prefix_len = 0;
+
+  rc = get_prefix_len(start_key, &prefix_cnt, &prefix_len);
+  if (rc) {
+    return rc;
+  }
+  DBUG_ASSERT_IMP(prefix_cnt == 0, prefix_len == Rdb_key_def::INDEX_ID_SIZE);
+
+  // There are 2 scenarios where a read is required to determine the prefix:
+  // 1. There are not enough keyparts in the start_key.
+  // 2. An exclusive seek key is provided, meaning that we need to read the next
+  // prefix.
+  if (prefix_cnt < m_prefix_keyparts ||
+      (prefix_len == start_key.size() &&
+       (find_flag == HA_READ_AFTER_KEY || find_flag == HA_READ_BEFORE_KEY))) {
+    uint tmp;
+
+    rocksdb::Slice empty_end_key;
+
+    // Since the PK/SK share the same prefix, the primary key can be constructed
+    // using the secondary key, with the index_id overwritten.
+    memcpy(m_cur_prefix_key, start_key.data(), prefix_len);
+    rocksdb::Slice seek_key((const char *)m_cur_prefix_key, prefix_len);
+    m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+    rc = m_iterator_pk.seek(find_flag, seek_key, false, empty_end_key);
+    if (rc) {
+      return rc;
+    }
+
+    // Re-derive the prefix length from the PK row we landed on.
+    rc = get_prefix_len(m_iterator_pk.key(), &prefix_cnt, &prefix_len);
+    if (rc) {
+      return rc;
+    }
+    memcpy(m_cur_prefix_key, m_iterator_pk.key().data(), prefix_len);
+  } else {
+    // start_key already contains a full group prefix; use it directly.
+    memcpy(m_cur_prefix_key, start_key.data(), prefix_len);
+  }
+
+  m_cur_prefix_key_len = prefix_len;
+  return HA_EXIT_SUCCESS;
+}
+
+// Advance m_cur_prefix_key to the next (direction=true) or previous
+// (direction=false) group prefix by seeking the PK exclusively past the
+// current prefix.  Returns HA_ERR_END_OF_FILE once the new prefix no longer
+// matches the prefix the caller originally sought (m_prefix_tuple).
+int Rdb_iterator_partial::get_next_prefix(bool direction) {
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  uint tmp;
+
+  int rc = get_prefix_from_start(
+      direction ? HA_READ_AFTER_KEY : HA_READ_BEFORE_KEY, cur_prefix_key);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  cur_prefix_key =
+      rocksdb::Slice((const char *)m_cur_prefix_key, m_cur_prefix_key_len);
+  if (!rc && !m_kd->value_matches_prefix(cur_prefix_key, m_prefix_tuple)) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+  return rc;
+}
+// Move to the first/last row of the next group in the given direction:
+// prefer the materialized rows on the SK; if the SK has none, load the
+// group from the PK into m_records and position the map iterator at the
+// appropriate end.
+int Rdb_iterator_partial::seek_next_prefix(bool direction) {
+  rocksdb::Slice empty_end_key;
+  uint tmp;
+
+  // Fetch next prefix using PK.
+  int rc = get_next_prefix(direction);
+  if (rc) return rc;
+
+  // Rdb_iterator_base::seek below will overwrite m_prefix_tuple, so we save a
+  // copy here.
+  size_t prefix_buf_len = m_prefix_tuple.size();
+  uchar *prefix_buf_copy = (uchar *)my_alloca(prefix_buf_len);
+  memcpy(prefix_buf_copy, m_prefix_buf, prefix_buf_len);
+
+  // First try reading from SK in the current prefix.
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  rc = Rdb_iterator_base::seek(
+      direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key,
+      false, empty_end_key);
+
+  // Restore m_prefix_tuple
+  memcpy(m_prefix_buf, prefix_buf_copy, prefix_buf_len);
+  m_prefix_tuple = rocksdb::Slice((char *)m_prefix_buf, prefix_buf_len);
+
+  if (rc == HA_ERR_END_OF_FILE) {
+    // Nothing in SK, so check PK.
+    rc = read_prefix_from_pk();
+
+    if (rc == 0) {
+      // Not materialized on disk, seek to beginning/end of map.
+      m_materialized = false;
+      if (direction ^ m_kd->m_is_reverse_cf) {
+        m_records_it = m_records.begin();
+      } else {
+        // Safe: read_prefix_from_pk() returned 0, so the map is non-empty.
+        m_records_it = m_records.end();
+        m_records_it--;
+      }
+    } else {
+      // The current prefix was determined by reading from PK in
+      // get_next_prefix, so rows must exist within this prefix on the PK.
+      DBUG_ASSERT(rc != HA_ERR_END_OF_FILE);
+    }
+  } else if (rc == 0) {
+    // Found rows in SK, so use them
+    m_materialized = true;
+  }
+
+  return rc;
+}
+
+// Materialize the current group: copy every PK row of the group into the SK
+// under an exclusive lock on the group prefix.  The write uses a plain
+// WriteBatch with skip_concurrency_control because the prefix lock already
+// serializes writers of this group.
+int Rdb_iterator_partial::materialize_prefix() {
+  // Local arena for the batched SK key/values; freed when it goes out of
+  // scope with the function.
+  MEM_ROOT mem_root;
+  init_sql_alloc(PSI_NOT_INSTRUMENTED, &mem_root, 4024, 0);
+  uint tmp;
+  Rdb_transaction *const tx = get_tx_from_thd(m_thd);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+
+  auto s =
+      rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, RDB_LOCK_WRITE);
+  if (!s.ok()) {
+    return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+  }
+
+  // It is possible that someone else has already materialized this group
+  // before we locked. Double check if the prefix is still empty.
+  Rdb_iterator_base iter(m_thd, m_kd, m_pkd, m_tbl_def);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+  int rc = iter.seek(HA_READ_KEY_EXACT, cur_prefix_key, false, cur_prefix_key,
+                     true /* read current */);
+  // Bail out on success (already materialized) or on any real error.
+  // NOTE(review): the "rc == 0 ||" clause is redundant — 0 is already
+  // != HA_ERR_END_OF_FILE — but harmless.
+  if (rc == 0 || rc != HA_ERR_END_OF_FILE) {
+    rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+    return rc;
+  }
+
+  rocksdb::WriteOptions options;
+  options.sync = false;
+  rocksdb::TransactionDBWriteOptimizations optimize;
+  optimize.skip_concurrency_control = true;
+
+  auto wb = std::unique_ptr<rocksdb::WriteBatch>(new rocksdb::WriteBatch);
+  m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+  rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+                          cur_prefix_key, true /* read current */);
+  size_t num_rows = 0;
+
+  while (!rc) {
+    if (thd_killed(m_thd)) {
+      rc = HA_ERR_QUERY_INTERRUPTED;
+      goto exit;
+    }
+
+    const rocksdb::Slice &rkey = m_iterator_pk.key();
+    const rocksdb::Slice &rval = m_iterator_pk.value();
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+    if (rc) {
+      goto exit;
+    }
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // Copy out of the scratch buffers; the WriteBatch keeps slices alive
+    // until the final Write below.
+    const char *key =
+        (const char *)memdup_root(&mem_root, m_sk_packed_tuple, sk_packed_size);
+    const char *val = (const char *)memdup_root(&mem_root, m_sk_tails.ptr(),
+                                                m_sk_tails.get_current_pos());
+
+    s = wb->Put(m_kd->get_cf(), rocksdb::Slice(key, sk_packed_size),
+                rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+    if (!s.ok()) {
+      rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+      goto exit;
+    }
+
+    num_rows++;
+    rc = m_iterator_pk.next();
+  }
+
+  if (rc != HA_ERR_END_OF_FILE) goto exit;
+  rc = HA_EXIT_SUCCESS;
+
+  s = rdb_get_rocksdb_db()->Write(options, optimize, wb.get());
+  if (!s.ok()) {
+    rc = rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+    goto exit;
+  }
+
+  rocksdb_partial_index_groups_materialized++;
+  rocksdb_partial_index_rows_materialized += num_rows;
+
+exit:
+  rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
+  return rc;
+}
+
+// Read the entire current group from the PK into the in-memory sorted map
+// m_records (keys/values repacked into SK format, backed by m_mem_root).
+// If the group exceeds m_threshold rows, it is also materialized on disk.
+// Returns HA_ERR_END_OF_FILE for an empty group.
+int Rdb_iterator_partial::read_prefix_from_pk() {
+  uint tmp;
+  int rc = 0;
+  size_t num_rows = 0;
+
+  // Drop any rows cached from the previous group before refilling.
+  free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+  m_records.clear();
+
+  const char *old_proc_info = m_thd->get_proc_info();
+  thd_proc_info(m_thd, "Materializing group in partial index");
+
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_pkd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  // Since rocksdb does not support reverse prefix seeks, we always seek in the
+  // forwards direction (even PK is a reverse cf).
+  rc = m_iterator_pk.seek(HA_READ_KEY_EXACT, cur_prefix_key, false,
+                          cur_prefix_key);
+
+  while (!rc) {
+    if (thd_killed(m_thd)) {
+      rc = HA_ERR_QUERY_INTERRUPTED;
+      goto exit;
+    }
+
+    const rocksdb::Slice &rkey = m_iterator_pk.key();
+    const rocksdb::Slice &rval = m_iterator_pk.value();
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &rkey, &rval);
+    if (rc) goto exit;
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // Copy into m_mem_root so the map slices outlive the scratch buffers.
+    const char *key = (const char *)memdup_root(&m_mem_root, m_sk_packed_tuple,
+                                                sk_packed_size);
+    const char *val = (const char *)memdup_root(&m_mem_root, m_sk_tails.ptr(),
+                                                m_sk_tails.get_current_pos());
+
+    m_records.emplace(rocksdb::Slice(key, sk_packed_size),
+                      rocksdb::Slice(val, m_sk_tails.get_current_pos()));
+
+    num_rows++;
+    rc = m_iterator_pk.next();
+  }
+
+  if (rc != HA_ERR_END_OF_FILE) goto exit;
+  rc = HA_EXIT_SUCCESS;
+
+  rocksdb_partial_index_groups_sorted++;
+  rocksdb_partial_index_rows_sorted += num_rows;
+
+  // Large groups get written out to the SK so future reads skip this sort.
+  if (num_rows > m_threshold) {
+    rc = materialize_prefix();
+  } else if (num_rows == 0) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+exit:
+  thd_proc_info(m_thd, old_proc_info);
+  return rc;
+}
+
+// Position the iterator according to find_flag/start_key.  First tries the
+// materialized rows on the SK within the current group; if the SK has none,
+// the group is loaded from the PK into m_records and the map iterator is
+// positioned.  Falls through to the next group when the target position is
+// past the end of this one.
+int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
+                               const rocksdb::Slice start_key,
+                               bool full_key_match,
+                               const rocksdb::Slice end_key,
+                               bool read_current) {
+  int rc = 0;
+  uint tmp;
+
+  DBUG_ASSERT(!read_current);
+  reset();
+
+  // true = forward scan, false = backward scan.
+  bool direction = (find_flag == HA_READ_KEY_EXACT) ||
+                   (find_flag == HA_READ_AFTER_KEY) ||
+                   (find_flag == HA_READ_KEY_OR_NEXT);
+
+  // Get current prefix.
+  if ((rc = get_prefix_from_start(find_flag, start_key)) != 0) {
+    return rc;
+  }
+
+  // First try reading from SK in the current prefix.
+  rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                m_cur_prefix_key_len);
+  m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+  rc = Rdb_iterator_base::seek(find_flag, start_key, full_key_match, end_key,
+                               read_current);
+
+  // Check if we're still in our current prefix. If not, we may have missed
+  // some unmaterialized keys, so we have to check PK.
+  if (rc == 0 &&
+      !m_kd->value_matches_prefix(Rdb_iterator_base::key(), cur_prefix_key)) {
+    rc = HA_ERR_END_OF_FILE;
+  }
+
+  bool next_prefix = false;
+
+  if (rc == HA_ERR_END_OF_FILE) {
+    // Nothing in SK, so check PK.
+    rc = read_prefix_from_pk();
+
+    if (rc == HA_ERR_END_OF_FILE) {
+      // Nothing in PK, so move to next prefix.
+      next_prefix = true;
+    } else if (rc == 0) {
+      // Not materialized on disk.
+      m_materialized = false;
+
+      // Seek to correct spot.
+      uchar *start_key_buf = (uchar *)start_key.data();
+
+      // Similar to Rdb_iterator_base::seek, convert start_key into an rocksdb
+      // key that we will actually seek to.
+      auto start_key_guard =
+          create_scope_guard([this, start_key_buf, start_key] {
+            this->m_kd->predecessor(start_key_buf, start_key.size());
+          });
+      if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
+          find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
+        // The guard undoes this in-place mutation when the block exits.
+        m_kd->successor(start_key_buf, start_key.size());
+      } else {
+        start_key_guard.commit();
+      }
+
+      if (direction) {
+        if (m_kd->m_is_reverse_cf) {
+          // Emulate "SeekForPrev" behaviour.
+          m_records_it = m_records.upper_bound(start_key);
+          if (m_records_it == m_records.begin()) {
+            next_prefix = true;
+          } else {
+            m_records_it--;
+          }
+        } else {
+          m_records_it = m_records.lower_bound(start_key);
+          if (m_records_it == m_records.end()) {
+            next_prefix = true;
+          }
+        }
+      } else {
+        if (m_kd->m_is_reverse_cf) {
+          m_records_it = m_records.upper_bound(start_key);
+          if (m_records_it == m_records.end()) {
+            next_prefix = true;
+          }
+        } else {
+          // Emulate "SeekForPrev" behaviour.
+          m_records_it = m_records.lower_bound(start_key);
+          if (m_records_it == m_records.begin()) {
+            next_prefix = true;
+          } else {
+            m_records_it--;
+          }
+        }
+      }
+    }
+  } else if (rc == 0) {
+    // Found rows in SK, so use them.
+    m_materialized = true;
+  }
+
+  if (next_prefix) {
+    rc = seek_next_prefix(direction);
+  }
+
+  if (!rc) {
+    // Final bounds check against the original seek prefix.
+    if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+      rc = HA_ERR_END_OF_FILE;
+    } else {
+      m_valid = true;
+    }
+  }
+
+  return rc;
+}
+
+// Point lookup.  If the key is absent from the SK, the group may simply be
+// unmaterialized, so fall back to reconstructing the row from the PK (the
+// PK key is extracted from the SK key).  Point lookups therefore never
+// trigger materialization.
+int Rdb_iterator_partial::get(const rocksdb::Slice *key,
+                              rocksdb::PinnableSlice *value, Rdb_lock_type type,
+                              bool skip_ttl_check) {
+  int rc = Rdb_iterator_base::get(key, value, type, skip_ttl_check);
+
+  if (rc == HA_ERR_KEY_NOT_FOUND) {
+    const uint size =
+        m_kd->get_primary_key_tuple(m_table, *m_pkd, key, m_sk_packed_tuple);
+    if (size == RDB_INVALID_KEY_LEN) {
+      return HA_ERR_ROCKSDB_CORRUPT_DATA;
+    }
+
+    rocksdb::Slice pk_key((const char *)m_sk_packed_tuple, size);
+
+    rc = m_iterator_pk.get(&pk_key, value, type, skip_ttl_check);
+    if (rc) return rc;
+
+    // Unpack from PK format
+    rc = m_converter.decode(m_pkd, m_record_buf, &pk_key, value);
+    if (rc) return rc;
+
+    // Repack into SK format
+    uint sk_packed_size = m_kd->pack_record(
+        m_table, m_pack_buffer, m_record_buf, m_sk_packed_tuple, &m_sk_tails,
+        false /* store_row_debug_checksums */, 0 /* hidden_pk_id */, 0, nullptr,
+        m_converter.get_ttl_bytes_buffer());
+
+    // Hand the caller the row as if it had been read from the SK.
+    value->PinSelf(
+        rocksdb::Slice((const char *)m_sk_packed_tuple, sk_packed_size));
+    rc = 0;
+  }
+
+  // get() does not establish an iterator position.
+  m_valid = false;
+  return rc;
+}
+
+// Step one row forward/backward while staying inside the current group.
+// Uses the SK iterator for materialized groups and the in-memory map
+// otherwise.  Returns HA_ERR_END_OF_FILE when the group is exhausted.
+int Rdb_iterator_partial::next_with_direction_in_group(bool direction) {
+  uint tmp;
+  int rc = HA_EXIT_SUCCESS;
+  if (m_materialized) {
+    rc = direction ? Rdb_iterator_base::next() : Rdb_iterator_base::prev();
+
+    if (rc == HA_EXIT_SUCCESS) {
+      rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                    m_cur_prefix_key_len);
+      m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+      // Crossing out of the current prefix ends the group.
+      if (!m_kd->value_matches_prefix(Rdb_iterator_base::key(),
+                                      cur_prefix_key)) {
+        return HA_ERR_END_OF_FILE;
+      }
+    }
+  } else {
+    // Map order follows the cf comparator, so XOR with the reverse-cf flag.
+    if (direction ^ m_kd->m_is_reverse_cf) {
+      m_records_it++;
+      if (m_records_it == m_records.end()) return HA_ERR_END_OF_FILE;
+    } else {
+      if (m_records_it == m_records.begin()) return HA_ERR_END_OF_FILE;
+      m_records_it--;
+    }
+  }
+
+  return rc;
+}
+
+// Step one row in the given direction, moving on to the next group when the
+// current one is exhausted.  Requires a prior successful seek() (m_valid).
+int Rdb_iterator_partial::next_with_direction(bool direction) {
+  if (!m_valid) return HA_ERR_INTERNAL_ERROR;
+
+  int rc = next_with_direction_in_group(direction);
+
+  if (!rc) {
+    // On success, check if key is still within prefix.
+    if (!m_kd->value_matches_prefix(key(), m_prefix_tuple)) {
+      rc = HA_ERR_END_OF_FILE;
+    }
+  } else if (rc == HA_ERR_END_OF_FILE) {
+    uint tmp;
+    rocksdb::Slice cur_prefix_key((const char *)m_cur_prefix_key,
+                                  m_cur_prefix_key_len);
+    m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+
+    // If the seek prefix covers a whole group, there is no further group to
+    // scan within it.
+    if (m_prefix_tuple.size() >= cur_prefix_key.size()) {
+      DBUG_ASSERT(memcmp(m_prefix_tuple.data(), cur_prefix_key.data(),
+                         cur_prefix_key.size()) == 0);
+      return HA_ERR_END_OF_FILE;
+    }
+
+    rc = seek_next_prefix(direction);
+  }
+
+  return rc;
+}
+
+// Forward step; invalidates the iterator at end of scan.
+int Rdb_iterator_partial::next() {
+  int rc = next_with_direction(true);
+  if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+  return rc;
+}
+
+// Backward step; invalidates the iterator at end of scan.
+int Rdb_iterator_partial::prev() {
+  int rc = next_with_direction(false);
+  if (rc == HA_ERR_END_OF_FILE) m_valid = false;
+  return rc;
+}
+
+// Drop all cached state: the in-memory group (map + arena) and both
+// underlying iterators.  m_records must be cleared before the arena that
+// backs its slices is freed... kept in this order deliberately.
+void Rdb_iterator_partial::reset() {
+  m_valid = false;
+  m_materialized = false;
+  free_root(&m_mem_root, MYF(MY_KEEP_PREALLOC));
+  m_records.clear();
+  m_iterator_pk.reset();
+  Rdb_iterator_base::reset();
+}
+
+// Current key, sourced from the SK iterator or the in-memory map.
+rocksdb::Slice Rdb_iterator_partial::key() {
+  return m_materialized ? Rdb_iterator_base::key() : m_records_it->first;
+}
+
+// Current value, sourced from the SK iterator or the in-memory map.
+rocksdb::Slice Rdb_iterator_partial::value() {
+  return m_materialized ? Rdb_iterator_base::value() : m_records_it->second;
+}
+
} // namespace myrocks
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 2a0f5bd5760..73c2deb3850 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -75,7 +75,7 @@ class Rdb_iterator_base : public Rdb_iterator {
int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
bool full_key_match, const rocksdb::Slice end_key,
- bool read_current) override;
+ bool read_current = false) override;
int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
Rdb_lock_type type, bool skip_ttl_check = false) override;
@@ -118,4 +118,70 @@ class Rdb_iterator_base : public Rdb_iterator {
rocksdb::Slice m_prefix_tuple;
};
+// Iterator over a partial index: small groups are read from the primary key
+// and sorted in memory on the fly; groups past a size threshold are
+// materialized to the secondary key and read from there.
+class Rdb_iterator_partial : public Rdb_iterator_base {
+ private:
+  TABLE *m_table;
+  // Arena backing the key/value copies stored in m_records.
+  MEM_ROOT m_mem_root;
+
+  // PK iterator used to source rows for unmaterialized groups.
+  Rdb_iterator_base m_iterator_pk;
+  Rdb_converter m_converter;
+
+  // True after a successful seek(); next()/prev() require it.
+  bool m_valid;
+  // True when the current group is read from the SK rather than m_records.
+  bool m_materialized;
+
+  // Group size above which a group is materialized to the SK.
+  const uint m_threshold;
+  // Number of leading keyparts that define a group prefix.
+  const uint m_prefix_keyparts;
+
+  // Packed prefix of the group the iterator is currently positioned in.
+  uchar *m_cur_prefix_key;
+  uint m_cur_prefix_key_len;
+
+  // Scratch buffers for decoding PK rows and repacking them in SK format.
+  uchar *m_record_buf;
+  uchar *m_pack_buffer;
+  uchar *m_sk_packed_tuple;
+
+  Rdb_string_writer m_sk_tails;
+
+  int get_prefix_len(const rocksdb::Slice &start_key, uint *prefix_cnt,
+                     uint *prefix_len);
+  int get_prefix_from_start(enum ha_rkey_function find_flag,
+                            const rocksdb::Slice &start_key);
+  int get_next_prefix(bool direction);
+  int seek_next_prefix(bool direction);
+  int materialize_prefix();
+  int read_prefix_from_pk();
+  int next_with_direction_in_group(bool direction);
+  int next_with_direction(bool direction);
+
+  // Orders map entries with the column family's comparator so the
+  // in-memory group matches on-disk SK order.
+  struct slice_comparator {
+    slice_comparator(const rocksdb::Comparator *c) : m_comparator(c) {}
+    const rocksdb::Comparator *const m_comparator;
+
+    bool operator()(const rocksdb::Slice &lhs, const rocksdb::Slice &rhs) {
+      return m_comparator->Compare(lhs, rhs) < 0;
+    }
+  };
+
+  // Sorted rows (SK format) of the current unmaterialized group, plus the
+  // current position within it.
+  std::map<const rocksdb::Slice, const rocksdb::Slice, slice_comparator>
+      m_records;
+  std::map<const rocksdb::Slice, const rocksdb::Slice,
+           slice_comparator>::iterator m_records_it;
+
+ public:
+  Rdb_iterator_partial(THD *thd, const std::shared_ptr<Rdb_key_def> kd,
+                       const std::shared_ptr<Rdb_key_def> pkd,
+                       const Rdb_tbl_def *tbl_def, TABLE *table);
+  ~Rdb_iterator_partial() override;
+
+  int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
+           bool full_key_match, const rocksdb::Slice end_key,
+           bool read_current = false) override;
+  int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
+          Rdb_lock_type type, bool skip_ttl_check = false) override;
+  int next() override;
+  int prev() override;
+  rocksdb::Slice key() override;
+  rocksdb::Slice value() override;
+  void reset() override;
+};
+
} // namespace myrocks
1
0
[Commits] aaebd623e98: Apply patch: Support parsing index comments for partial indexes
by psergey 17 May '21
by psergey 17 May '21
17 May '21
revision-id: aaebd623e98e59db3efe1b231307e4142240c485 (percona-202102-53-gaaebd623e98)
parent(s): f286a3586f1fec1f9c7bad5314136e176ea30653
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:43:41 +0300
message:
Apply patch: Support parsing index comments for partial indexes
Summary:
This adds support for parsing the comment section on secondary keys for partial index keywords. This is then populated onto the `Rdb_key_def` structure.
This isn't persisted to the data dictionary because we don't really need this information for compaction (unlike some of our TTL related fields).
Test Plan: mtr
Reviewers: luqun, herman, yzha, #mysql_eng
Subscribers: pgl
Differential Revision: https://phabricator.intern.facebook.com/D25933175
---
.../rocksdb/r/partial_index_validation.result | 237 +++++++++++++++++++++
.../suite/rocksdb/t/partial_index_validation.test | 237 +++++++++++++++++++++
storage/rocksdb/ha_rocksdb.cc | 5 +
storage/rocksdb/rdb_datadic.cc | 160 ++++++++++----
storage/rocksdb/rdb_datadic.h | 15 +-
storage/rocksdb/rdb_global.h | 12 ++
6 files changed, 618 insertions(+), 48 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/partial_index_validation.result b/mysql-test/suite/rocksdb/r/partial_index_validation.result
new file mode 100644
index 00000000000..b0303aab4fa
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/partial_index_validation.result
@@ -0,0 +1,237 @@
+#
+# Negative test cases
+#
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+PRIMARY KEY (c1) COMMENT 'p0_cfname=aaa;p1_cfname=bbb;p2_cfname=ccc',
+KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB
+PARTITION BY LIST(c1) (
+PARTITION p0 VALUES IN (1, 4, 7),
+PARTITION p1 VALUES IN (2, 5, 8),
+PARTITION p2 VALUES IN (3, 6, 9)
+);
+ERROR 42000: Partial indexes not supported for partitioned tables.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+PRIMARY KEY (c1) COMMENT 'p0_cfname=aaa;p1_cfname=bbb;p2_cfname=ccc',
+KEY (c2) COMMENT 'p0_partial_group_keyparts=2;p0_partial_group_threshold=10000'
+) ENGINE=ROCKSDB
+PARTITION BY LIST(c1) (
+PARTITION p0 VALUES IN (1, 4, 7),
+PARTITION p1 VALUES IN (2, 5, 8),
+PARTITION p2 VALUES IN (3, 6, 9)
+);
+ERROR 42000: Partial indexes not supported for partitioned tables.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+PRIMARY KEY (c1) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Primary key cannot be a partial index.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR 42000: Autoincrement key cannot be a partial index.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+PRIMARY KEY (c1),
+UNIQUE KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR 42000: Unique key cannot be a partial index.
+CREATE TABLE t (
+c1 INT,
+c2 INT AUTO_INCREMENT,
+PRIMARY KEY (c1),
+KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR 42000: Autoincrement key cannot be a partial index.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+PRIMARY KEY (c1),
+KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=asdf'
+) ENGINE=ROCKSDB;
+ERROR HY000: Invalid partial index group size threshold.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+PRIMARY KEY (c1),
+KEY (c2) COMMENT 'partial_group_keyparts=asdf;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Invalid number of keyparts in partial index group.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2),
+KEY (c1, c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Too many keyparts in partial index group.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2),
+KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Too many keyparts in partial index group.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2),
+KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=3;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Too many keyparts in partial index group.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2, c3),
+KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=3;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Too many keyparts in partial index group.
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c2, c3, c4),
+KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=3;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Too many keyparts in partial index group.
+CREATE TABLE t (
+c1 INT,
+c2 VARCHAR(255),
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2(5), c3),
+KEY (c1, c2(4), c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+ERROR HY000: Mismatched keyparts in partial index group.
+#
+# Positive test cases
+#
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+c5 INT,
+PRIMARY KEY (c1, c2, c5),
+KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2, c3),
+KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+CREATE TABLE t (
+c1 INT,
+c2 INT,
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2, c3),
+KEY (c1, c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+CREATE TABLE t (
+c1 INT,
+c2 VARCHAR(255),
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2(4), c3),
+KEY (c1, c2(4), c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+CREATE TABLE t (
+c1 INT,
+c2 VARCHAR(255),
+c3 INT,
+c4 INT,
+PRIMARY KEY (c1, c2(4), c3),
+KEY (c1, c2(4), c4) COMMENT 'partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+CREATE TABLE t (
+id1 bigint(20) unsigned NOT NULL DEFAULT '0',
+id1_type int(10) unsigned NOT NULL DEFAULT '0',
+id2 bigint(20) unsigned NOT NULL DEFAULT '0',
+id2_type int(10) unsigned NOT NULL DEFAULT '0',
+assoc_type bigint(20) unsigned NOT NULL DEFAULT '0',
+visibility tinyint(3) NOT NULL DEFAULT '0',
+data varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+time int(10) unsigned NOT NULL DEFAULT '0',
+version bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (assoc_type, id1, id2) COMMENT 'cf_assoc',
+KEY id1_type (assoc_type, id1, visibility, time, id2, version, data) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+DROP TABLE t;
+CREATE TABLE t (
+id1 binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+raw_key text COLLATE latin1_bin,
+id2 bigint(20) unsigned NOT NULL DEFAULT '0',
+id2_type int(10) unsigned NOT NULL DEFAULT '0',
+assoc_type bigint(20) unsigned NOT NULL DEFAULT '0',
+visibility tinyint(3) NOT NULL DEFAULT '0',
+data varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+time int(10) unsigned NOT NULL DEFAULT '0',
+version bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (assoc_type, id1, id2) COMMENT 'cf_assoc',
+KEY id1_type (assoc_type, id1, visibility, time, id2, version, data) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+DROP TABLE t;
+CREATE TABLE t (
+id1 bigint(20) unsigned NOT NULL DEFAULT '0',
+id1_type int(10) unsigned NOT NULL DEFAULT '0',
+id2 bigint(20) unsigned NOT NULL DEFAULT '0',
+id2_type int(10) unsigned NOT NULL DEFAULT '0',
+assoc_type bigint(20) unsigned NOT NULL DEFAULT '0',
+visibility tinyint(4) NOT NULL DEFAULT '0',
+data text COLLATE latin1_bin NOT NULL,
+time int(10) unsigned NOT NULL DEFAULT '0',
+version bigint(20) unsigned NOT NULL DEFAULT '0',
+PRIMARY KEY (assoc_type, id1, id2) COMMENT 'cf_assoc',
+KEY id1_type (assoc_type, id1, visibility, time, id2, version, data(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+Warnings:
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+Warning 1681 Integer display width is deprecated and will be removed in a future release.
+DROP TABLE t;
diff --git a/mysql-test/suite/rocksdb/t/partial_index_validation.test b/mysql-test/suite/rocksdb/t/partial_index_validation.test
new file mode 100644
index 00000000000..92cd859a904
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/partial_index_validation.test
@@ -0,0 +1,237 @@
+
+--echo #
+--echo # Negative test cases
+--echo #
+
+--error ER_NOT_SUPPORTED_YET
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ PRIMARY KEY (c1) COMMENT 'p0_cfname=aaa;p1_cfname=bbb;p2_cfname=ccc',
+ KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB
+PARTITION BY LIST(c1) (
+ PARTITION p0 VALUES IN (1, 4, 7),
+ PARTITION p1 VALUES IN (2, 5, 8),
+ PARTITION p2 VALUES IN (3, 6, 9)
+);
+
+--error ER_NOT_SUPPORTED_YET
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ PRIMARY KEY (c1) COMMENT 'p0_cfname=aaa;p1_cfname=bbb;p2_cfname=ccc',
+ KEY (c2) COMMENT 'p0_partial_group_keyparts=2;p0_partial_group_threshold=10000'
+) ENGINE=ROCKSDB
+PARTITION BY LIST(c1) (
+ PARTITION p0 VALUES IN (1, 4, 7),
+ PARTITION p1 VALUES IN (2, 5, 8),
+ PARTITION p2 VALUES IN (3, 6, 9)
+);
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ PRIMARY KEY (c1) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_NOT_SUPPORTED_YET
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_NOT_SUPPORTED_YET
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ PRIMARY KEY (c1),
+ UNIQUE KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_NOT_SUPPORTED_YET
+CREATE TABLE t (
+ c1 INT,
+ c2 INT AUTO_INCREMENT,
+ PRIMARY KEY (c1),
+ KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ PRIMARY KEY (c1),
+ KEY (c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=asdf'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ PRIMARY KEY (c1),
+ KEY (c2) COMMENT 'partial_group_keyparts=asdf;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2),
+ KEY (c1, c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2),
+ KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2),
+ KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=3;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2, c3),
+ KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=3;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c2, c3, c4),
+ KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=3;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--error ER_WRONG_ARGUMENTS
+CREATE TABLE t (
+ c1 INT,
+ c2 VARCHAR(255),
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2(5), c3),
+ KEY (c1, c2(4), c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+
+--echo #
+--echo # Positive test cases
+--echo #
+
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ c5 INT,
+ PRIMARY KEY (c1, c2, c5),
+ KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2, c3),
+ KEY (c1, c2, c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+
+CREATE TABLE t (
+ c1 INT,
+ c2 INT,
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2, c3),
+ KEY (c1, c2) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+
+CREATE TABLE t (
+ c1 INT,
+ c2 VARCHAR(255),
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2(4), c3),
+ KEY (c1, c2(4), c4) COMMENT 'partial_group_keyparts=2;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+
+CREATE TABLE t (
+ c1 INT,
+ c2 VARCHAR(255),
+ c3 INT,
+ c4 INT,
+ PRIMARY KEY (c1, c2(4), c3),
+ KEY (c1, c2(4), c4) COMMENT 'partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB;
+DROP TABLE t;
+
+CREATE TABLE t (
+ id1 bigint(20) unsigned NOT NULL DEFAULT '0',
+ id1_type int(10) unsigned NOT NULL DEFAULT '0',
+ id2 bigint(20) unsigned NOT NULL DEFAULT '0',
+ id2_type int(10) unsigned NOT NULL DEFAULT '0',
+ assoc_type bigint(20) unsigned NOT NULL DEFAULT '0',
+ visibility tinyint(3) NOT NULL DEFAULT '0',
+ data varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ time int(10) unsigned NOT NULL DEFAULT '0',
+ version bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (assoc_type, id1, id2) COMMENT 'cf_assoc',
+ KEY id1_type (assoc_type, id1, visibility, time, id2, version, data) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+DROP TABLE t;
+
+CREATE TABLE t (
+ id1 binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
+ raw_key text COLLATE latin1_bin,
+ id2 bigint(20) unsigned NOT NULL DEFAULT '0',
+ id2_type int(10) unsigned NOT NULL DEFAULT '0',
+ assoc_type bigint(20) unsigned NOT NULL DEFAULT '0',
+ visibility tinyint(3) NOT NULL DEFAULT '0',
+ data varchar(255) COLLATE latin1_bin NOT NULL DEFAULT '',
+ time int(10) unsigned NOT NULL DEFAULT '0',
+ version bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (assoc_type, id1, id2) COMMENT 'cf_assoc',
+ KEY id1_type (assoc_type, id1, visibility, time, id2, version, data) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+DROP TABLE t;
+
+
+CREATE TABLE t (
+ id1 bigint(20) unsigned NOT NULL DEFAULT '0',
+ id1_type int(10) unsigned NOT NULL DEFAULT '0',
+ id2 bigint(20) unsigned NOT NULL DEFAULT '0',
+ id2_type int(10) unsigned NOT NULL DEFAULT '0',
+ assoc_type bigint(20) unsigned NOT NULL DEFAULT '0',
+ visibility tinyint(4) NOT NULL DEFAULT '0',
+ data text COLLATE latin1_bin NOT NULL,
+ time int(10) unsigned NOT NULL DEFAULT '0',
+ version bigint(20) unsigned NOT NULL DEFAULT '0',
+ PRIMARY KEY (assoc_type, id1, id2) COMMENT 'cf_assoc',
+ KEY id1_type (assoc_type, id1, visibility, time, id2, version, data(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=1;partial_group_threshold=10000'
+) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
+DROP TABLE t;
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index c484dfb5894..15c8b289a5e 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -8251,6 +8251,11 @@ int ha_rocksdb::create_key_def(uint32_t dbnum_arg, const TABLE *const table_arg,
if (!ttl_column.empty()) {
(*new_key_def)->m_ttl_column = ttl_column;
}
+
+ if ((*new_key_def)->extract_partial_index_info(table_arg, tbl_def_arg)) {
+ DBUG_RETURN(HA_EXIT_FAILURE);
+ }
+
// initialize key_def
(*new_key_def)->setup(table_arg, tbl_def_arg);
DBUG_RETURN(HA_EXIT_SUCCESS);
diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc
index a9bdad0620e..c26a9f9437b 100644
--- a/storage/rocksdb/rdb_datadic.cc
+++ b/storage/rocksdb/rdb_datadic.cc
@@ -289,6 +289,8 @@ Rdb_key_def::Rdb_key_def(
m_key_parts(0),
m_ttl_pk_key_part_offset(UINT_MAX),
m_ttl_field_index(UINT_MAX),
+ m_partial_index_keyparts(0),
+ m_partial_index_threshold(0),
m_prefix_extractor(nullptr),
m_maxlength(0) // means 'not intialized'
{
@@ -316,6 +318,8 @@ Rdb_key_def::Rdb_key_def(const Rdb_key_def &k)
m_key_parts(k.m_key_parts),
m_ttl_pk_key_part_offset(k.m_ttl_pk_key_part_offset),
m_ttl_field_index(UINT_MAX),
+ m_partial_index_keyparts(k.m_partial_index_keyparts),
+ m_partial_index_threshold(k.m_partial_index_threshold),
m_prefix_extractor(k.m_prefix_extractor),
m_maxlength(k.m_maxlength) {
mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
@@ -465,6 +469,8 @@ void Rdb_key_def::setup(const TABLE *const tbl,
Rdb_key_def::extract_ttl_col(tbl, tbl_def, &m_ttl_column,
&m_ttl_field_index, true);
+ extract_partial_index_info(tbl, tbl_def);
+
size_t max_len = INDEX_ID_SIZE;
int unpack_len = 0;
int max_part_len = 0;
@@ -701,57 +707,129 @@ uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg,
return HA_EXIT_SUCCESS;
}
-const std::string Rdb_key_def::gen_qualifier_for_table(
- const char *const qualifier, const std::string &partition_name) {
- bool has_partition = !partition_name.empty();
- std::string qualifier_str = "";
+uint Rdb_key_def::extract_partial_index_info(
+ const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg) {
+ // Nothing to parse if this is a hidden PK.
+ if (m_index_type == INDEX_TYPE_HIDDEN_PRIMARY) {
+ return HA_EXIT_SUCCESS;
+ }
- if (!strcmp(qualifier, RDB_CF_NAME_QUALIFIER)) {
- return has_partition ? gen_cf_name_qualifier_for_partition(partition_name)
- : qualifier_str + RDB_CF_NAME_QUALIFIER +
- RDB_QUALIFIER_VALUE_SEP;
- } else if (!strcmp(qualifier, RDB_TTL_DURATION_QUALIFIER)) {
- return has_partition
- ? gen_ttl_duration_qualifier_for_partition(partition_name)
- : qualifier_str + RDB_TTL_DURATION_QUALIFIER +
- RDB_QUALIFIER_VALUE_SEP;
- } else if (!strcmp(qualifier, RDB_TTL_COL_QUALIFIER)) {
- return has_partition ? gen_ttl_col_qualifier_for_partition(partition_name)
- : qualifier_str + RDB_TTL_COL_QUALIFIER +
- RDB_QUALIFIER_VALUE_SEP;
- } else {
- DBUG_ASSERT(0);
+ std::string key_comment(table_arg->key_info[m_keyno].comment.str,
+ table_arg->key_info[m_keyno].comment.length);
+
+ bool per_part_match = false;
+ std::string keyparts_str = Rdb_key_def::parse_comment_for_qualifier(
+ key_comment, table_arg, tbl_def_arg, &per_part_match,
+ RDB_PARTIAL_INDEX_KEYPARTS_QUALIFIER);
+
+ std::string threshold_str = Rdb_key_def::parse_comment_for_qualifier(
+ key_comment, table_arg, tbl_def_arg, &per_part_match,
+ RDB_PARTIAL_INDEX_THRESHOLD_QUALIFIER);
+
+ if (threshold_str.empty()) {
+ m_partial_index_keyparts = 0;
+ m_partial_index_threshold = 0;
+ return HA_EXIT_SUCCESS;
}
- return qualifier_str;
-}
+ if (table_arg->part_info != nullptr) {
+ my_printf_error(ER_NOT_SUPPORTED_YET,
+ "Partial indexes not supported for partitioned tables.",
+ MYF(0));
+ return HA_EXIT_FAILURE;
+ }
-/*
- Formats the string and returns the column family name assignment part for a
- specific partition.
-*/
-const std::string Rdb_key_def::gen_cf_name_qualifier_for_partition(
- const std::string &prefix) {
- DBUG_ASSERT(!prefix.empty());
+ if (is_primary_key()) {
+ my_printf_error(ER_WRONG_ARGUMENTS,
+ "Primary key cannot be a partial index.", MYF(0));
+ return HA_EXIT_FAILURE;
+ }
- return prefix + RDB_PER_PARTITION_QUALIFIER_NAME_SEP + RDB_CF_NAME_QUALIFIER +
- RDB_QUALIFIER_VALUE_SEP;
-}
+ if (table_arg->key_info[m_keyno].flags & HA_NOSAME) {
+ my_printf_error(ER_NOT_SUPPORTED_YET,
+ "Unique key cannot be a partial index.", MYF(0));
+ return HA_EXIT_FAILURE;
+ }
+
+ if (table_arg->s->next_number_index == m_keyno) {
+ my_printf_error(ER_NOT_SUPPORTED_YET,
+ "Autoincrement key cannot be a partial index.", MYF(0));
+ return HA_EXIT_FAILURE;
+ }
+
+ if (table_has_hidden_pk(table_arg)) {
+ my_printf_error(ER_NOT_SUPPORTED_YET,
+ "Table with no primary key cannot have a partial index.",
+ MYF(0));
+ return HA_EXIT_FAILURE;
+ }
+
+ m_partial_index_threshold = std::strtoull(threshold_str.c_str(), nullptr, 0);
+ if (!m_partial_index_threshold) {
+ my_printf_error(ER_WRONG_ARGUMENTS,
+ "Invalid partial index group size threshold.", MYF(0));
+ return HA_EXIT_FAILURE;
+ }
+
+ m_partial_index_keyparts = std::strtoull(keyparts_str.c_str(), nullptr, 0);
+ if (!m_partial_index_keyparts) {
+ my_printf_error(ER_WRONG_ARGUMENTS,
+ "Invalid number of keyparts in partial index group.",
+ MYF(0));
+ return HA_EXIT_FAILURE;
+ }
+
+ uint n_keyparts =
+ std::min(table_arg->key_info[table_arg->s->primary_key].actual_key_parts,
+ table_arg->key_info[m_keyno].actual_key_parts);
+ if (n_keyparts <= m_partial_index_keyparts) {
+ my_printf_error(ER_WRONG_ARGUMENTS,
+ "Too many keyparts in partial index group.", MYF(0));
+ return HA_EXIT_FAILURE;
+ }
-const std::string Rdb_key_def::gen_ttl_duration_qualifier_for_partition(
- const std::string &prefix) {
- DBUG_ASSERT(!prefix.empty());
+ // Verify that PK/SK actually share a common prefix.
+ KEY_PART_INFO *key_part_sk = table_arg->key_info[m_keyno].key_part;
+ KEY_PART_INFO *key_part_pk =
+ table_arg->key_info[table_arg->s->primary_key].key_part;
+
+ n_keyparts = std::min(n_keyparts, m_partial_index_keyparts);
+
+ for (uint i = 0; i < n_keyparts; i++) {
+ if (key_part_sk->fieldnr != key_part_pk->fieldnr ||
+ key_part_sk->field->field_length != key_part_pk->field->field_length) {
+ my_printf_error(ER_WRONG_ARGUMENTS,
+ "Mismatched keyparts in partial index group.", MYF(0));
+ return HA_EXIT_FAILURE;
+ }
+ key_part_sk++;
+ key_part_pk++;
+ }
- return prefix + RDB_PER_PARTITION_QUALIFIER_NAME_SEP +
- RDB_TTL_DURATION_QUALIFIER + RDB_QUALIFIER_VALUE_SEP;
+ return HA_EXIT_SUCCESS;
}
-const std::string Rdb_key_def::gen_ttl_col_qualifier_for_partition(
- const std::string &prefix) {
- DBUG_ASSERT(!prefix.empty());
+const std::string Rdb_key_def::gen_qualifier_for_table(
+ const char *const qualifier, const std::string &partition_name) {
+ bool has_partition = !partition_name.empty();
+ std::string qualifier_str = "";
+
+ if (has_partition) {
+ qualifier_str += partition_name + RDB_PER_PARTITION_QUALIFIER_NAME_SEP;
+ }
- return prefix + RDB_PER_PARTITION_QUALIFIER_NAME_SEP + RDB_TTL_COL_QUALIFIER +
- RDB_QUALIFIER_VALUE_SEP;
+ if (!strcmp(qualifier, RDB_CF_NAME_QUALIFIER) ||
+ !strcmp(qualifier, RDB_TTL_DURATION_QUALIFIER) ||
+ !strcmp(qualifier, RDB_TTL_COL_QUALIFIER) ||
+ !strcmp(qualifier, RDB_PARTIAL_INDEX_KEYPARTS_QUALIFIER) ||
+ !strcmp(qualifier, RDB_PARTIAL_INDEX_THRESHOLD_QUALIFIER)) {
+ qualifier_str += std::string(qualifier) + RDB_QUALIFIER_VALUE_SEP;
+ } else {
+ DBUG_ASSERT(false);
+ return std::string("");
+ }
+
+ return qualifier_str;
}
const std::string Rdb_key_def::parse_comment_for_qualifier(
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index f6917836f23..2c5828a6b8a 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -583,6 +583,10 @@ class Rdb_key_def {
bool skip_checks = false);
inline bool has_ttl() const { return m_ttl_duration > 0; }
+ uint extract_partial_index_info(const TABLE *const table_arg,
+ const Rdb_tbl_def *const tbl_def_arg);
+ inline bool is_partial_index() const { return m_partial_index_threshold > 0; }
+
static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
static uint32 calculate_index_flag_offset(uint32 index_flags,
enum INDEX_FLAG flag,
@@ -593,13 +597,6 @@ class Rdb_key_def {
static const std::string gen_qualifier_for_table(
const char *const qualifier, const std::string &partition_name = "");
- static const std::string gen_cf_name_qualifier_for_partition(
- const std::string &s);
- static const std::string gen_ttl_duration_qualifier_for_partition(
- const std::string &s);
- static const std::string gen_ttl_col_qualifier_for_partition(
- const std::string &s);
-
static const std::string parse_comment_for_qualifier(
const std::string &comment, const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
@@ -926,6 +923,10 @@ class Rdb_key_def {
*/
uint m_ttl_field_index;
+ uint m_partial_index_keyparts;
+
+ uint m_partial_index_threshold;
+
/* Prefix extractor for the column family of the key definiton */
std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
diff --git a/storage/rocksdb/rdb_global.h b/storage/rocksdb/rdb_global.h
index a7c628e541e..9f5c8cf76e8 100644
--- a/storage/rocksdb/rdb_global.h
+++ b/storage/rocksdb/rdb_global.h
@@ -165,6 +165,18 @@ const char *const RDB_TTL_DURATION_QUALIFIER = "ttl_duration";
*/
const char *const RDB_TTL_COL_QUALIFIER = "ttl_col";
+/*
+ Qualifier name for number of prefix keyparts in partial index
+*/
+const char *const RDB_PARTIAL_INDEX_KEYPARTS_QUALIFIER =
+ "partial_group_keyparts";
+
+/*
+ Qualifier name for materialization threshold in partial index
+*/
+const char *const RDB_PARTIAL_INDEX_THRESHOLD_QUALIFIER =
+ "partial_group_threshold";
+
/*
Default, minimal valid, and maximum valid sampling rate values when collecting
statistics about table.
1
0
revision-id: f286a3586f1fec1f9c7bad5314136e176ea30653 (percona-202102-52-gf286a3586f1)
parent(s): 0f4980afd8faa61b23a3008d3cba726881072174
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:42:54 +0300
message:
Apply patch: Implement iterator class
Summary: This abstracts basic iteration over keys (with TTL filtering) into the `Rdb_iterator_base` class. Logic that does ICP or primary key locking is not included.
Test Plan: mtr
Reviewers: luqun, herman, yzha, #mysql_eng
Subscribers: pgl
Differential Revision: https://phabricator.intern.facebook.com/D25933169
---
mysql-test/suite/rocksdb/r/check_flags.result | 2 +-
mysql-test/suite/rocksdb/t/check_flags.test | 2 +-
storage/rocksdb/CMakeLists.txt | 1 +
storage/rocksdb/ha_rocksdb.cc | 676 +++++++-------------------
storage/rocksdb/ha_rocksdb.h | 94 ++--
storage/rocksdb/ha_rocksdb_proto.h | 2 +-
storage/rocksdb/nosql_access.cc | 4 +-
storage/rocksdb/rdb_converter.h | 2 +-
storage/rocksdb/rdb_iterator.cc | 359 ++++++++++++++
storage/rocksdb/rdb_iterator.h | 121 +++++
10 files changed, 703 insertions(+), 560 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/check_flags.result b/mysql-test/suite/rocksdb/r/check_flags.result
index 8ff4153707e..9fe20b968a6 100644
--- a/mysql-test/suite/rocksdb/r/check_flags.result
+++ b/mysql-test/suite/rocksdb/r/check_flags.result
@@ -34,7 +34,7 @@ KILL QUERY $conn1_id;
set debug_sync='now SIGNAL go';
ERROR 70100: Query execution was interrupted
set debug_sync='RESET';
-set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_nwd SIGNAL parked WAIT_FOR go';
SELECT kp1 FROM t3 ORDER BY kp1;
set debug_sync='now WAIT_FOR parked';
KILL QUERY $conn1_id;
diff --git a/mysql-test/suite/rocksdb/t/check_flags.test b/mysql-test/suite/rocksdb/t/check_flags.test
index 58dc1f4f8da..c100dce8afc 100644
--- a/mysql-test/suite/rocksdb/t/check_flags.test
+++ b/mysql-test/suite/rocksdb/t/check_flags.test
@@ -91,7 +91,7 @@ set debug_sync='RESET';
connection conn1;
-set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_nwd SIGNAL parked WAIT_FOR go';
send SELECT kp1 FROM t3 ORDER BY kp1;
connection default;
diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt
index 3fc21fb97cc..135a6af62df 100644
--- a/storage/rocksdb/CMakeLists.txt
+++ b/storage/rocksdb/CMakeLists.txt
@@ -125,6 +125,7 @@ SET(ROCKSDB_SOURCES
ha_rocksdb.cc ha_rocksdb.h ha_rocksdb_proto.h
logger.h
rdb_datadic.cc rdb_datadic.h
+ rdb_iterator.cc rdb_iterator.h
rdb_cf_options.cc rdb_cf_options.h
rdb_cf_manager.cc rdb_cf_manager.h
rdb_converter.cc rdb_converter.h
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 2f41ba40a17..c484dfb5894 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -94,6 +94,7 @@
#include "./rdb_datadic.h"
#include "./rdb_i_s.h"
#include "./rdb_index_merge.h"
+#include "./rdb_iterator.h"
#include "./rdb_mutex_wrapper.h"
#include "./rdb_psi.h"
#include "./rdb_threads.h"
@@ -2975,8 +2976,9 @@ class Rdb_transaction {
}
int set_status_error(THD *const thd, const rocksdb::Status &s,
- const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
- Rdb_table_handler *const table_handler) {
+ const Rdb_key_def &kd, const Rdb_tbl_def *const tbl_def,
+ Rdb_table_handler *const table_handler
+ MY_ATTRIBUTE((unused))) {
DBUG_ASSERT(!s.ok());
DBUG_ASSERT(tbl_def != nullptr);
@@ -2993,7 +2995,8 @@ class Rdb_transaction {
rocksdb_rollback_on_timeout);
m_detailed_error.copy(timeout_message(
"index", tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
- table_handler->m_lock_wait_timeout_counter.inc();
+ /* TODO(yzha) - row stats are gone in 8.0
+ table_handler->m_lock_wait_timeout_counter.inc(); */
rocksdb_row_lock_wait_timeouts++;
return HA_ERR_LOCK_WAIT_TIMEOUT;
@@ -3002,7 +3005,8 @@ class Rdb_transaction {
if (s.IsDeadlock()) {
my_core::thd_mark_transaction_to_rollback(thd, 1 /* whole transaction */);
m_detailed_error = String();
- table_handler->m_deadlock_counter.inc();
+ /* TODO(yzha) - row stats are gone in 8.0
+ table_handler->m_deadlock_counter.inc(); */
rocksdb_row_lock_deadlocks++;
return HA_ERR_LOCK_DEADLOCK;
} else if (s.IsBusy()) {
@@ -3017,7 +3021,8 @@ class Rdb_transaction {
user_host_buff, thd->query());
}
m_detailed_error = String(" (snapshot conflict)", system_charset_info);
- table_handler->m_deadlock_counter.inc();
+ /* TODO(yzha) - row stats are gone in 8.0
+ table_handler->m_deadlock_counter.inc(); */
return HA_ERR_ROCKSDB_STATUS_BUSY;
}
@@ -3530,7 +3535,7 @@ class Rdb_transaction {
rocksdb::Iterator *get_iterator(
rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
- bool fill_cache, const rocksdb::Slice &eq_cond_lower_bound,
+ const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
bool create_snapshot = true) {
// Make sure we are not doing both read_current (which implies we don't
@@ -3542,6 +3547,7 @@ class Rdb_transaction {
if (create_snapshot) acquire_snapshot(true);
rocksdb::ReadOptions options = m_read_opts;
+ const bool fill_cache = !THDVAR(get_thd(), skip_fill_cache);
if (skip_bloom_filter) {
const bool enable_iterate_bounds =
@@ -6787,7 +6793,7 @@ static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
// If the iterator is not valid it might be because of EOF but might be due
// to IOError or corruption. The good practice is always check it.
// https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
-inline bool is_valid_iterator(rocksdb::Iterator *scan_it) {
+bool is_valid_iterator(rocksdb::Iterator *scan_it) {
if (scan_it->Valid()) {
return true;
} else {
@@ -6915,6 +6921,11 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
const int save_active_index = active_index;
active_index = table->s->next_number_index;
const uint8 save_table_status = table->m_status;
+
+ std::unique_ptr<Rdb_iterator> save_iterator(new Rdb_iterator_base(
+ ha_thd(), m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ std::swap(m_iterator, save_iterator);
+
ulonglong last_val = 0;
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
@@ -6966,7 +6977,7 @@ ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
(Why don't we use index_init/index_end? class handler defines index_init
as private, for some reason).
*/
- release_scan_iterator();
+ std::swap(m_iterator, save_iterator);
return last_val;
}
@@ -7012,41 +7023,44 @@ int ha_rocksdb::load_hidden_pk_value() {
active_index = MAX_KEY;
const uint8 save_table_status = table->m_status;
+ std::unique_ptr<Rdb_iterator> save_iterator(new Rdb_iterator_base(
+ ha_thd(), m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+ std::swap(m_iterator, save_iterator);
+
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
const bool is_new_snapshot = !tx->has_snapshot();
longlong hidden_pk_id = 1;
+ longlong old = 0;
+ int rc = 0;
// Do a lookup.
if (!index_last(table->record[0])) {
/*
Decode PK field from the key
*/
- auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
- if (err) {
- if (is_new_snapshot) {
- tx->release_snapshot();
- }
- return err;
+ rc = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
+ if (rc) {
+ goto exit;
}
hidden_pk_id++;
}
- longlong old = m_tbl_def->m_hidden_pk_val;
+ old = m_tbl_def->m_hidden_pk_val;
while (old < hidden_pk_id &&
!m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
}
+exit:
if (is_new_snapshot) {
tx->release_snapshot();
}
table->m_status = save_table_status;
active_index = save_active_index;
+ std::swap(m_iterator, save_iterator);
- release_scan_iterator();
-
- return HA_EXIT_SUCCESS;
+ return rc;
}
/* Get PK value from m_tbl_def->m_hidden_pk_info. */
@@ -7125,11 +7139,6 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
my_core::TABLE_SHARE *const table_arg)
: handler(hton, table_arg),
m_table_handler(nullptr),
- m_scan_it(nullptr),
- m_scan_it_skips_bloom(false),
- m_scan_it_snapshot(nullptr),
- m_scan_it_lower_bound(nullptr),
- m_scan_it_upper_bound(nullptr),
m_tbl_def(nullptr),
m_pk_descr(nullptr),
m_key_descr_arr(nullptr),
@@ -7137,7 +7146,6 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
m_pk_packed_tuple(nullptr),
m_sk_packed_tuple(nullptr),
m_end_key_packed_tuple(nullptr),
- m_sk_match_prefix(nullptr),
m_sk_packed_tuple_old(nullptr),
m_dup_sk_packed_tuple(nullptr),
m_dup_sk_packed_tuple_old(nullptr),
@@ -7203,12 +7211,13 @@ bool ha_rocksdb::init_with_fields() {
rows within a transaction, etc, because the compaction filter ignores
snapshots when filtering keys.
*/
-bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
- const rocksdb::Slice &ttl_rec_val,
- const int64_t curr_ts) {
+bool rdb_should_hide_ttl_rec(const Rdb_key_def &kd,
+ const rocksdb::Slice &ttl_rec_val,
+ Rdb_transaction *tx) {
DBUG_ASSERT(kd.has_ttl());
DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);
- THD *thd = ha_thd();
+ THD *thd = tx->get_thd();
+ const int64_t curr_ts = tx->m_snapshot_timestamp;
/*
Curr_ts can only be 0 if there are no snapshots open.
@@ -7223,7 +7232,7 @@ bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
DBUG_ASSERT(false);
push_warning_printf(thd, Sql_condition::SL_WARNING, ER_WRONG_ARGUMENTS,
"TTL read filtering called with no snapshot.");
- update_row_stats(ROWS_UNFILTERED_NO_SNAPSHOT);
+ rdb_update_global_stats(ROWS_UNFILTERED_NO_SNAPSHOT, 1);
return false;
}
@@ -7263,7 +7272,7 @@ bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
bool is_hide_ttl =
ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
if (is_hide_ttl) {
- update_row_stats(ROWS_FILTERED);
+ rdb_update_global_stats(ROWS_FILTERED, 1);
/* increment examined row count when rows are skipped */
thd->inc_examined_row_count(1);
@@ -7381,8 +7390,6 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
m_sk_packed_tuple = reinterpret_cast<uchar *>(
my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
- m_sk_match_prefix = reinterpret_cast<uchar *>(
- my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
m_sk_packed_tuple_old = reinterpret_cast<uchar *>(
my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
m_end_key_packed_tuple = reinterpret_cast<uchar *>(
@@ -7390,11 +7397,6 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
m_pack_buffer = reinterpret_cast<uchar *>(
my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
- m_scan_it_lower_bound = reinterpret_cast<uchar *>(
- my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
- m_scan_it_upper_bound = reinterpret_cast<uchar *>(
- my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
-
/*
If inplace alter is happening, allocate special buffers for unique
secondary index duplicate checking.
@@ -7408,8 +7410,7 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
if (m_pk_packed_tuple == nullptr || m_sk_packed_tuple == nullptr ||
m_sk_packed_tuple_old == nullptr || m_end_key_packed_tuple == nullptr ||
- m_pack_buffer == nullptr || m_scan_it_upper_bound == nullptr ||
- m_scan_it_lower_bound == nullptr ||
+ m_pack_buffer == nullptr ||
(alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
m_dup_sk_packed_tuple_old == nullptr))) {
// One or more of the above allocations failed. Clean up and exit
@@ -7428,9 +7429,6 @@ void ha_rocksdb::free_key_buffers() {
my_free(m_sk_packed_tuple);
m_sk_packed_tuple = nullptr;
- my_free(m_sk_match_prefix);
- m_sk_match_prefix = nullptr;
-
my_free(m_sk_packed_tuple_old);
m_sk_packed_tuple_old = nullptr;
@@ -7446,12 +7444,6 @@ void ha_rocksdb::free_key_buffers() {
my_free(m_dup_sk_packed_tuple_old);
m_dup_sk_packed_tuple_old = nullptr;
- my_free(m_scan_it_lower_bound);
- m_scan_it_lower_bound = nullptr;
-
- my_free(m_scan_it_upper_bound);
- m_scan_it_upper_bound = nullptr;
-
release_blob_buffer();
}
@@ -7576,6 +7568,7 @@ int ha_rocksdb::close(void) {
m_pk_descr = nullptr;
m_key_descr_arr = nullptr;
m_converter = nullptr;
+ m_iterator = nullptr;
free_key_buffers();
if (m_table_handler != nullptr) {
@@ -8738,173 +8731,6 @@ bool ha_rocksdb::check_keyread_allowed(bool &pk_can_be_decoded,
return true;
}
-int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
- rocksdb::Iterator *const iter,
- const rocksdb::Slice &key_slice,
- const int64_t ttl_filter_ts) {
- THD *thd = ha_thd();
- /*
- We are looking for the first record such that
- index_tuple= lookup_tuple.
- lookup_tuple may be a prefix of the index.
- */
- rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
-
- while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
- if (thd && thd->killed) {
- return HA_ERR_QUERY_INTERRUPTED;
- }
- /*
- If TTL is enabled we need to check if the given key has already expired
- from the POV of the current transaction. If it has, try going to the next
- key.
- */
- if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
- rocksdb_smart_next(kd.m_is_reverse_cf, iter);
- continue;
- }
-
- return HA_EXIT_SUCCESS;
- }
-
- /*
- Got a record that is not equal to the lookup value, or even a record
- from another table.index.
- */
- return HA_ERR_KEY_NOT_FOUND;
-}
-
-int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
- const bool full_key_match,
- const rocksdb::Slice &key_slice) {
- THD *thd = ha_thd();
- /*
- We are looking for the first record such that
-
- index_tuple $LT lookup_tuple
-
- with HA_READ_BEFORE_KEY, $LT = '<',
- with HA_READ_PREFIX_LAST_OR_PREV, $LT = '<='
- with HA_READ_PREFIX_LAST, $LT = '=='
-
- Symmetry with read_after_key is possible if rocksdb supported prefix seeks.
- */
- rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
-
- while (is_valid_iterator(m_scan_it)) {
- if (thd && thd->killed) {
- return HA_ERR_QUERY_INTERRUPTED;
- }
- /*
- We are using full key and we've hit an exact match.
- */
- if ((full_key_match &&
- kd.value_matches_prefix(m_scan_it->key(), key_slice))) {
- rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
- continue;
- }
-
- return HA_EXIT_SUCCESS;
- }
-
- return HA_ERR_KEY_NOT_FOUND;
-}
-
-int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
- const rocksdb::Slice &key_slice) {
- /*
- We are looking for the first record such that
-
- index_tuple $GT lookup_tuple
-
- with HA_READ_AFTER_KEY, $GT = '>',
- with HA_READ_KEY_OR_NEXT, $GT = '>='
- with HA_READ_KEY_EXACT, $GT = '=='
- */
- rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
-
- return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
-}
-
-int ha_rocksdb::position_to_correct_key(const Rdb_key_def &kd,
- const enum ha_rkey_function &find_flag,
- const bool full_key_match,
- const rocksdb::Slice &key_slice,
- bool *const move_forward) {
- int rc = 0;
-
- *move_forward = true;
-
- switch (find_flag) {
- case HA_READ_KEY_EXACT:
- case HA_READ_AFTER_KEY:
- case HA_READ_KEY_OR_NEXT:
- rc = read_after_key(kd, key_slice);
- break;
- case HA_READ_BEFORE_KEY:
- case HA_READ_PREFIX_LAST:
- case HA_READ_PREFIX_LAST_OR_PREV:
- *move_forward = false;
- rc = read_before_key(kd, full_key_match, key_slice);
- break;
- case HA_READ_KEY_OR_PREV:
- case HA_READ_PREFIX:
- /* These flags are not used by the SQL layer, so we don't support them
- * yet. */
- rc = HA_ERR_UNSUPPORTED;
- break;
- default:
- DBUG_ASSERT(0);
- break;
- }
-
- return rc;
-}
-
-int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
- const enum ha_rkey_function &find_flag,
- const rocksdb::Slice &slice,
- const int bytes_changed_by_succ,
- const key_range *const end_key) {
- if (find_flag == HA_READ_KEY_EXACT) return slice.size();
-
- if (find_flag == HA_READ_PREFIX_LAST) {
- /*
- We have made the kd.successor(m_sk_packed_tuple) call above.
-
- The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
- */
- return slice.size() - bytes_changed_by_succ;
- }
-
- if (end_key) {
- uint end_key_packed_size = 0;
- end_key_packed_size =
- kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
- end_key->key, end_key->keypart_map);
-
- /*
- Calculating length of the equal conditions here. 4 byte index id is
- included.
- Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
- WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
- WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
- Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
- WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
- */
- rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
- end_key_packed_size);
- return slice.difference_offset(end_slice);
- }
-
- /*
- On range scan without any end key condition, there is no
- eq cond, and eq cond length is the same as index_id size (8 bytes).
- Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
- WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
- */
- return Rdb_key_def::INDEX_ID_SIZE;
-}
/**
@note
@@ -9088,7 +8914,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
next/prev anyway. To avoid correctness issues, just free the
iterator.
*/
- release_scan_iterator();
+ m_iterator->reset();
DBUG_RETURN(rc);
} else {
/*
@@ -9124,7 +8950,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
rc = get_row_by_rowid(buf, m_last_rowkey.ptr(),
m_last_rowkey.length());
- release_scan_iterator();
+ m_iterator->reset();
DBUG_RETURN(rc);
}
@@ -9143,7 +8969,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
update_row_stats(ROWS_READ);
}
- release_scan_iterator();
+ m_iterator->reset();
DBUG_RETURN(rc);
}
}
@@ -9153,28 +8979,23 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
key, keypart_map);
}
- if (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST) {
- m_sk_match_length = packed_size;
- memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
- } else {
- kd.get_infimum_key(m_sk_match_prefix, &m_sk_match_length);
- }
-
- int bytes_changed_by_succ = 0;
- if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
- find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
- /* See below */
- bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
- }
-
rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
packed_size);
- const uint eq_cond_len =
- calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, end_range);
+ rocksdb::Slice end_slice;
+ if (end_range && find_flag != HA_READ_KEY_EXACT &&
+ find_flag != HA_READ_PREFIX_LAST) {
+ uint end_key_packed_size = 0;
+ end_key_packed_size =
+ kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
+ end_range->key, end_range->keypart_map);
+ end_slice =
+ rocksdb::Slice((char *)m_end_key_packed_tuple, end_key_packed_size);
+ }
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
const bool is_new_snapshot = !tx->has_snapshot();
+
// Loop as long as we get a deadlock error AND we end up creating the
// snapshot here (i.e. it did not exist prior to this)
for (;;) {
@@ -9187,15 +9008,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
This will open the iterator and position it at a record that's equal or
greater than the lookup tuple.
*/
- setup_scan_iterator(kd, &slice, eq_cond_len);
-
- /*
- Once we are positioned on from above, move to the position we really
- want: See storage/rocksdb/rocksdb-range-access.txt
- */
- bool move_forward;
- rc = position_to_correct_key(kd, find_flag, using_full_key, slice,
- &move_forward);
+ rc = m_iterator->seek(find_flag, slice, using_full_key, end_slice);
if (rc) {
break;
@@ -9206,7 +9019,10 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
then we have all the rows we need. For a secondary key we now need to
lookup the primary key.
*/
- rc = index_next_with_direction_intern(buf, move_forward, true);
+ bool direction = (find_flag == HA_READ_KEY_EXACT) ||
+ (find_flag == HA_READ_AFTER_KEY) ||
+ (find_flag == HA_READ_KEY_OR_NEXT);
+ rc = index_next_with_direction_intern(buf, direction, true);
if (!should_recreate_snapshot(rc, is_new_snapshot)) {
break; /* Exit the loop */
@@ -9214,7 +9030,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
// release the snapshot and iterator so they will be regenerated
tx->release_snapshot();
- release_scan_iterator();
+ m_iterator->reset();
}
if (!rc) {
@@ -9247,7 +9063,14 @@ int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
DBUG_ENTER_FUNC();
ha_statistic_increment(&System_status_var::ha_read_key_count);
- DBUG_RETURN(index_read_intern(buf, key, keypart_map, find_flag));
+ int rc = index_read_intern(buf, key, keypart_map, find_flag);
+
+ // The SQL layer generally expects HA_ERR_KEY_NOT_FOUND for this call.
+ if (rc == HA_ERR_END_OF_FILE) {
+ rc = HA_ERR_KEY_NOT_FOUND;
+ }
+
+ DBUG_RETURN(rc);
}
/**
@@ -9324,7 +9147,7 @@ int ha_rocksdb::check(THD *const thd MY_ATTRIBUTE((__unused__)),
table_name, rows, res);
goto error;
}
- rocksdb::Slice key = m_scan_it->key();
+ rocksdb::Slice key = m_iterator->key();
sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
&my_charset_bin);
@@ -9470,16 +9293,9 @@ rocksdb::Status ha_rocksdb::get_for_update(
Rdb_transaction *const tx, const Rdb_key_def &key_descr,
const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
-
+ DBUG_ASSERT(value == nullptr);
bool exclusive = m_lock_rows != RDB_LOCK_READ;
- bool do_validate = my_core::thd_tx_isolation(ha_thd()) > ISO_READ_COMMITTED;
- rocksdb::Status s =
- tx->get_for_update(key_descr, key, value, exclusive, do_validate);
-
-#ifndef DBUG_OFF
- ++rocksdb_num_get_for_update_calls;
-#endif
- return s;
+ return rdb_tx_get_for_update(tx, key_descr, key, value, exclusive);
}
bool ha_rocksdb::is_blind_delete_enabled() {
@@ -9522,9 +9338,6 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
};);
- bool found;
- rocksdb::Status s;
-
/* Pretend row found without looking up */
if (skip_lookup) {
/* TODO(yzha) - rows stas are gone in 8.0
@@ -9535,11 +9348,9 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
DBUG_RETURN(0);
}
- if (m_lock_rows == RDB_LOCK_NONE) {
- tx->acquire_snapshot(true);
- s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
- } else if (m_insert_with_update && m_dup_key_found &&
- m_pk_descr->get_keyno() == m_dupp_errkey) {
+ if (m_insert_with_update && m_dup_key_found &&
+ m_pk_descr->get_keyno() == m_dupp_errkey) {
+ DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
DBUG_ASSERT(m_dup_key_tuple.length() == key_slice.size());
DBUG_ASSERT(
memcmp(m_dup_key_tuple.ptr(), key_slice.data(), key_slice.size()) == 0);
@@ -9548,43 +9359,26 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
// m_dup_key_retrieved_record during write_row already, so just move it
// over.
m_retrieved_record = std::move(m_dup_key_retrieved_record);
- s = rocksdb::Status::OK();
+ rc = HA_EXIT_SUCCESS;
} else {
- s = get_for_update(tx, *m_pk_descr, key_slice, &m_retrieved_record);
+ tx->acquire_snapshot(false);
+ Rdb_iterator_base iter(ha_thd(), m_pk_descr, m_pk_descr, m_tbl_def);
+ rc = iter.get(&key_slice, &m_retrieved_record, m_lock_rows, skip_ttl_check);
}
- DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
- dbug_change_status_to_corrupted(&s););
-
- if (!s.IsNotFound() && !s.ok()) {
- DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
- m_table_handler));
- }
- found = !s.IsNotFound();
-
- table->m_status = STATUS_NOT_FOUND;
- if (found) {
- /* If we found the record, but it's expired, pretend we didn't find it. */
- if (!skip_ttl_check && m_pk_descr->has_ttl() &&
- should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
- tx->m_snapshot_timestamp)) {
- DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
- }
-
+ if (!rc) {
m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
rc = convert_record_from_storage_format(&key_slice, buf);
if (!rc) {
table->m_status = 0;
}
- } else {
- /*
- Note: we don't need to unlock the row. It is intentional that we keep
- locks on rows that don't exist.
- */
- rc = HA_ERR_KEY_NOT_FOUND;
}
+ /*
+ Note: we don't need to unlock the row. It is intentional that we keep
+ locks on rows that don't exist.
+ */
DBUG_RETURN(rc);
}
@@ -9615,23 +9409,9 @@ int ha_rocksdb::records_from_index(ha_rows *num_rows, uint index) {
int ha_rocksdb::get_row_by_sk(uchar *buf, const Rdb_key_def &kd,
const rocksdb::Slice *key) {
DBUG_ENTER_FUNC();
- Rdb_transaction *const tx = get_or_create_tx(table->in_use);
-
- auto s = tx->get(kd.get_cf(), *key, &m_retrieved_record);
-
- if (!s.IsNotFound() && !s.ok()) {
- DBUG_RETURN(
- tx->set_status_error(table->in_use, s, kd, m_tbl_def, m_table_handler));
- }
- if (s.IsNotFound()) {
- DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
- }
-
- if (kd.has_ttl() &&
- should_hide_ttl_rec(kd, m_retrieved_record, tx->m_snapshot_timestamp)) {
- DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
- }
+ int rc = m_iterator->get(key, &m_retrieved_record, RDB_LOCK_NONE);
+ if (rc) DBUG_RETURN(rc);
const uint size =
kd.get_primary_key_tuple(table, *m_pk_descr, key, m_pk_packed_tuple);
@@ -9641,7 +9421,7 @@ int ha_rocksdb::get_row_by_sk(uchar *buf, const Rdb_key_def &kd,
m_last_rowkey.copy((const char *)m_pk_packed_tuple, size, &my_charset_bin);
- int rc = secondary_index_read(active_index, buf, &m_retrieved_record);
+ rc = secondary_index_read(active_index, buf, &m_retrieved_record);
if (!rc) {
table->m_status = 0;
}
@@ -9700,9 +9480,6 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
THD *thd = ha_thd();
int rc = 0;
const Rdb_key_def &kd = *m_key_descr_arr[active_index_pos()];
- Rdb_transaction *const tx = get_or_create_tx(thd);
- rocksdb::Slice prefix_tuple(reinterpret_cast<char *>(m_sk_match_prefix),
- m_sk_match_length);
table->m_status = STATUS_NOT_FOUND;
/* TODO(yzha) - row stats are gone in 8.0
@@ -9715,8 +9492,8 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
break;
}
- DBUG_ASSERT(m_scan_it);
- if (m_scan_it == nullptr) {
+ DBUG_ASSERT(m_iterator != nullptr);
+ if (m_iterator == nullptr) {
rc = HA_ERR_INTERNAL_ERROR;
break;
}
@@ -9725,31 +9502,18 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
skip_next = false;
} else {
if (move_forward) {
- rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
+ rc = m_iterator->next();
} else {
- rocksdb_smart_prev(kd.m_is_reverse_cf, m_scan_it);
+ rc = m_iterator->prev();
}
}
- if (!is_valid_iterator(m_scan_it)) {
- rc = HA_ERR_END_OF_FILE;
+ if (rc == HA_ERR_END_OF_FILE) {
break;
}
- const rocksdb::Slice &key = m_scan_it->key();
- const rocksdb::Slice &value = m_scan_it->value();
-
- // Outside our range, return EOF.
- if (!kd.value_matches_prefix(key, prefix_tuple)) {
- rc = HA_ERR_END_OF_FILE;
- break;
- }
-
- // Record is not visible due to TTL, move to next record.
- if (m_pk_descr->has_ttl() &&
- should_hide_ttl_rec(kd, value, tx->m_snapshot_timestamp)) {
- continue;
- }
+ const rocksdb::Slice &key = m_iterator->key();
+ const rocksdb::Slice &value = m_iterator->value();
if (active_index == table->s->primary_key) {
if (m_lock_rows != RDB_LOCK_NONE) {
@@ -9958,15 +9722,14 @@ bool ha_rocksdb::skip_unique_check() const {
use_read_free_rpl();
}
-bool ha_rocksdb::commit_in_the_middle() {
+bool commit_in_the_middle(THD *thd) {
// It does not make sense to use write unprepared with commit in the middle,
// since both handle large transactions by flushing the write batches onto
// disk.
//
// For the two to work together, we would need to assign a new xid after
// committing.
- return (THDVAR(table->in_use, bulk_load) ||
- THDVAR(table->in_use, commit_in_the_middle)) &&
+ return (THDVAR(thd, bulk_load) || THDVAR(thd, commit_in_the_middle)) &&
rocksdb_write_policy != rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED;
}
@@ -9976,7 +9739,7 @@ bool ha_rocksdb::commit_in_the_middle() {
@retval false if bulk commit was skipped or succeeded
*/
bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
- return commit_in_the_middle() &&
+ return commit_in_the_middle(table->in_use) &&
tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
tx->flush_batch();
}
@@ -10200,6 +9963,10 @@ void ha_rocksdb::set_last_rowkey(
}
}
+void ha_rocksdb::set_last_rowkey(const char *str, size_t len) {
+ m_last_rowkey.copy(str, len, &my_charset_bin);
+}
+
/**
Collect update data for primary key
@@ -10294,28 +10061,18 @@ int ha_rocksdb::check_and_lock_unique_pk(const uint key_id,
2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
-> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
*/
- const rocksdb::Status s =
- get_for_update(row_info.tx, *m_pk_descr, row_info.new_pk_slice,
- ignore_pk_unique_check ? nullptr : pslice);
- if (!s.ok() && !s.IsNotFound()) {
- return row_info.tx->set_status_error(
- table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
- }
+ Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[key_id], m_pk_descr,
+ m_tbl_def);
- bool key_found = ignore_pk_unique_check ? false : !s.IsNotFound();
+ int rc = iter.get(&row_info.new_pk_slice,
+ ignore_pk_unique_check ? nullptr : pslice, m_lock_rows);
- /*
- If the pk key has ttl, we may need to pretend the row wasn't
- found if it is already expired.
- */
- DBUG_ASSERT(row_info.tx->has_snapshot() &&
- row_info.tx->m_snapshot_timestamp != 0);
- if (key_found && m_pk_descr->has_ttl() &&
- should_hide_ttl_rec(*m_pk_descr, *pslice,
- row_info.tx->m_snapshot_timestamp)) {
- key_found = false;
+ if (rc && rc != HA_ERR_KEY_NOT_FOUND) {
+ return rc;
}
+ bool key_found = ignore_pk_unique_check ? false : (rc == HA_EXIT_SUCCESS);
+
if (key_found && row_info.old_data == nullptr && m_insert_with_update) {
// In INSERT ON DUPLICATE KEY UPDATE ... case, if the insert failed
// due to a duplicate key, remember the last key and skip the check
@@ -10441,53 +10198,30 @@ int ha_rocksdb::check_and_lock_sk(
The bloom filter may need to be disabled for this lookup.
*/
- uchar lower_bound_buf[Rdb_key_def::INDEX_ID_SIZE];
- uchar upper_bound_buf[Rdb_key_def::INDEX_ID_SIZE];
- rocksdb::Slice lower_bound_slice;
- rocksdb::Slice upper_bound_slice;
+ Rdb_iterator_base iter(ha_thd(), m_key_descr_arr[key_id], m_pk_descr,
+ m_tbl_def);
+ int rc = HA_EXIT_SUCCESS;
- const rocksdb::Status s =
- get_for_update(row_info.tx, kd, new_slice,
- all_parts_used ? &m_retrieved_record : nullptr);
- if (!s.ok() && !s.IsNotFound()) {
- return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
- m_table_handler);
+ rc = iter.get(&new_slice, all_parts_used ? &m_retrieved_record : nullptr,
+ m_lock_rows);
+ if (rc && rc != HA_ERR_KEY_NOT_FOUND) {
+ return rc;
}
- rocksdb::Iterator *iter = nullptr;
+ if (!all_parts_used) {
+ rc = iter.seek(HA_READ_KEY_EXACT, new_slice, false /* full_key_match */,
+ new_slice, true /* read current */);
- if (all_parts_used) {
- *found = !s.IsNotFound();
- if (*found && kd.has_ttl() &&
- should_hide_ttl_rec(kd, m_retrieved_record,
- row_info.tx->m_snapshot_timestamp)) {
- *found = false;
+ if (rc && rc != HA_ERR_END_OF_FILE) {
+ return rc;
}
- } else {
- const bool total_order_seek = !check_bloom_and_set_bounds(
- ha_thd(), kd, new_slice, Rdb_key_def::INDEX_ID_SIZE, lower_bound_buf,
- upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
- const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
-
- iter = row_info.tx->get_iterator(kd.get_cf(), total_order_seek, fill_cache,
- lower_bound_slice, upper_bound_slice,
- true /* read current data */,
- false /* acquire snapshot */);
- /*
- Need to scan the transaction to see if there is a duplicate key.
- Also need to scan RocksDB and verify the key has not been deleted
- in the transaction.
- */
- DBUG_ASSERT(row_info.tx->has_snapshot() &&
- row_info.tx->m_snapshot_timestamp != 0);
- *found =
- !read_key_exact(kd, iter, new_slice, row_info.tx->m_snapshot_timestamp);
}
- int rc = HA_EXIT_SUCCESS;
+ *found = (rc == HA_EXIT_SUCCESS);
+ rc = HA_EXIT_SUCCESS;
if (*found && m_insert_with_update) {
- const rocksdb::Slice &rkey = all_parts_used ? new_slice : iter->key();
+ const rocksdb::Slice &rkey = all_parts_used ? new_slice : iter.key();
uint pk_size =
kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
if (pk_size == RDB_INVALID_KEY_LEN) {
@@ -10503,7 +10237,6 @@ int ha_rocksdb::check_and_lock_sk(
}
}
- delete iter;
return rc;
}
@@ -11040,10 +10773,12 @@ int ha_rocksdb::update_write_row(const uchar *const old_data,
0x0000b3eb003f65c5e78857, and lower bound would be
0x0000b3eb003f65c5e78859. These cover given eq condition range.
*/
-void ha_rocksdb::setup_iterator_bounds(
- const Rdb_key_def &kd, const rocksdb::Slice &eq_cond, size_t bound_len,
- uchar *const lower_bound, uchar *const upper_bound,
- rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
+static void setup_iterator_bounds(const Rdb_key_def &kd,
+ const rocksdb::Slice &eq_cond,
+ size_t bound_len, uchar *const lower_bound,
+ uchar *const upper_bound,
+ rocksdb::Slice *lower_bound_slice,
+ rocksdb::Slice *upper_bound_slice) {
// If eq_cond is shorter than Rdb_key_def::INDEX_NUMBER_SIZE, we should be
// able to get better bounds just by using index id directly.
if (eq_cond.size() <= Rdb_key_def::INDEX_ID_SIZE) {
@@ -11070,91 +10805,6 @@ void ha_rocksdb::setup_iterator_bounds(
}
}
-/*
- Open a cursor
-*/
-
-void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
- rocksdb::Slice *const slice,
- const uint eq_cond_len) {
- DBUG_ASSERT(slice->size() >= eq_cond_len);
-
- Rdb_transaction *const tx = get_or_create_tx(table->in_use);
-
- bool skip_bloom = true;
-
- const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
- // The size of m_scan_it_lower_bound (and upper) is technically
- // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers. Rather
- // than recalculating that number, we pass in the max of eq_cond_len and
- // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
- // max_packed_sk_len, hence ensuring no buffer overrun.
- //
- // See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
- // used.
- if (check_bloom_and_set_bounds(
- ha_thd(), kd, eq_cond,
- std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_ID_SIZE),
- m_scan_it_lower_bound, m_scan_it_upper_bound,
- &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
- skip_bloom = false;
- }
-
- /*
- In some cases, setup_scan_iterator() is called multiple times from
- the same query but bloom filter can not always be used.
- Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
- (id1, id2).
- select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
- '100');
- In this case, setup_scan_iterator() is called twice, the first time is for
- (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
- '100').
- If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
- for the
- first condition but not for the second condition.
- If bloom filter condition is changed, currently it is necessary to destroy
- and
- re-create Iterator.
- */
- if (m_scan_it_skips_bloom != skip_bloom) {
- release_scan_iterator();
- }
-
- /*
- SQL layer can call rnd_init() multiple times in a row.
- In that case, re-use the iterator, but re-position it at the table start.
- */
- if (!m_scan_it) {
- const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
- if (commit_in_the_middle()) {
- DBUG_ASSERT(m_scan_it_snapshot == nullptr);
- m_scan_it_snapshot = rdb->GetSnapshot();
-
- auto read_opts = rocksdb::ReadOptions();
- // TODO(mung): set based on WHERE conditions
- read_opts.total_order_seek = true;
- read_opts.snapshot = m_scan_it_snapshot;
- m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
- } else {
- m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
- m_scan_it_lower_bound_slice,
- m_scan_it_upper_bound_slice);
- }
- m_scan_it_skips_bloom = skip_bloom;
- }
-}
-
-void ha_rocksdb::release_scan_iterator() {
- delete m_scan_it;
- m_scan_it = nullptr;
-
- if (m_scan_it_snapshot) {
- rdb->ReleaseSnapshot(m_scan_it_snapshot);
- m_scan_it_snapshot = nullptr;
- }
-}
-
/**
@return
HA_EXIT_SUCCESS OK
@@ -11241,6 +10891,10 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
Rdb_transaction *const tx = get_or_create_tx(thd);
DBUG_ASSERT(tx != nullptr);
+ active_index = idx;
+ m_iterator.reset(new Rdb_iterator_base(
+ thd, m_key_descr_arr[active_index_pos()], m_pk_descr, m_tbl_def));
+
// If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
// when accessing the index, so don't acquire the snapshot right away.
// Otherwise acquire the snapshot immediately.
@@ -11257,8 +10911,7 @@ int ha_rocksdb::index_end() {
DBUG_ENTER_FUNC();
m_need_build_decoder = false;
-
- release_scan_iterator();
+ m_iterator = nullptr;
active_index = MAX_KEY;
in_range_check_pushed_down = false;
@@ -15141,14 +14794,14 @@ bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
void rdb_update_global_stats(const operation_type &type, uint count,
- bool is_system_table) {
+ Rdb_tbl_def *td) {
DBUG_ASSERT(type < ROWS_MAX);
if (count == 0) {
return;
}
- if (is_system_table) {
+ if (td && td->m_is_mysql_system_table) {
global_stats.system_rows[type].add(count);
} else {
global_stats.rows[type].add(count);
@@ -16132,13 +15785,33 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) {
rocksdb::Iterator *rdb_tx_get_iterator(
Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
- bool skip_bloom_filter, bool fill_cache,
- const rocksdb::Slice &lower_bound_slice,
+ bool skip_bloom_filter, const rocksdb::Slice &lower_bound_slice,
const rocksdb::Slice &upper_bound_slice, bool read_current,
bool create_snapshot) {
- return tx->get_iterator(column_family, skip_bloom_filter, fill_cache,
- lower_bound_slice, upper_bound_slice, read_current,
- create_snapshot);
+ return tx->get_iterator(column_family, skip_bloom_filter, lower_bound_slice,
+ upper_bound_slice, read_current, create_snapshot);
+}
+
+rocksdb::Iterator *rdb_tx_get_iterator(
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ const rocksdb::Slice &eq_cond_lower_bound,
+ const rocksdb::Slice &eq_cond_upper_bound,
+ const rocksdb::Snapshot **snapshot, bool read_current,
+ bool create_snapshot) {
+ if (commit_in_the_middle(thd)) {
+ DBUG_ASSERT(*snapshot == nullptr);
+ *snapshot = rdb->GetSnapshot();
+
+ auto read_opts = rocksdb::ReadOptions();
+ // TODO(mung): set based on WHERE conditions
+ read_opts.total_order_seek = true;
+ read_opts.snapshot = *snapshot;
+ return rdb->NewIterator(read_opts, cf);
+ } else {
+ Rdb_transaction *tx = get_tx_from_thd(thd);
+ return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound,
+ eq_cond_upper_bound, read_current, create_snapshot);
+ }
}
bool rdb_tx_started(Rdb_transaction *tx) { return tx->is_tx_started(); }
@@ -16150,6 +15823,22 @@ rocksdb::Status rdb_tx_get(Rdb_transaction *tx,
return tx->get(column_family, key, value);
}
+rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const rocksdb::Slice &key,
+ rocksdb::PinnableSlice *const value,
+ bool exclusive) {
+ bool do_validate =
+ my_core::thd_tx_isolation(tx->get_thd()) > ISO_READ_COMMITTED;
+ rocksdb::Status s =
+ tx->get_for_update(kd, key, value, exclusive, do_validate);
+
+#ifndef DBUG_OFF
+ ++rocksdb_num_get_for_update_calls;
+#endif
+ return s;
+}
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -16158,6 +15847,12 @@ void rdb_tx_multi_get(Rdb_transaction *tx,
tx->multi_get(column_family, num_keys, keys, values, statuses, sorted_input);
}
+int rdb_tx_set_status_error(Rdb_transaction *tx, const rocksdb::Status &s,
+ const Rdb_key_def &kd,
+ const Rdb_tbl_def *const tbl_def) {
+ return tx->set_status_error(tx->get_thd(), s, kd, tbl_def, nullptr);
+}
+
/****************************************************************************
* Multi-Range-Read implementation based on RocksDB's MultiGet() call
***************************************************************************/
@@ -16685,8 +16380,7 @@ int ha_rocksdb::multi_range_read_next(char **range_info) {
/* If we found the record, but it's expired, pretend we didn't find it. */
if (m_pk_descr->has_ttl() &&
- should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
- tx->m_snapshot_timestamp)) {
+ rdb_should_hide_ttl_rec(*m_pk_descr, m_retrieved_record, tx)) {
continue;
}
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 9c3c3927498..369af0bc9c4 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -77,6 +77,7 @@
namespace myrocks {
class Rdb_converter;
+class Rdb_iterator;
class Rdb_key_def;
class Rdb_tbl_def;
class Rdb_transaction;
@@ -134,6 +135,8 @@ enum table_cardinality_scan_type {
SCAN_TYPE_FULL_TABLE,
};
+enum Rdb_lock_type { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE };
+
class Mrr_rowid_source;
uint32_t rocksdb_perf_context_level(THD *const thd);
@@ -148,20 +151,6 @@ class ha_rocksdb : public my_core::handler {
Rdb_table_handler *m_table_handler; ///< Open table handler
- /* Iterator used for range scans and for full table/index scans */
- rocksdb::Iterator *m_scan_it;
-
- /* Whether m_scan_it was created with skip_bloom=true */
- bool m_scan_it_skips_bloom;
-
- const rocksdb::Snapshot *m_scan_it_snapshot;
-
- /* Buffers used for upper/lower bounds for m_scan_it. */
- uchar *m_scan_it_lower_bound;
- uchar *m_scan_it_upper_bound;
- rocksdb::Slice m_scan_it_lower_bound_slice;
- rocksdb::Slice m_scan_it_upper_bound_slice;
-
Rdb_tbl_def *m_tbl_def;
/* Primary Key encoder from KeyTupleFormat to StorageFormat */
@@ -197,13 +186,6 @@ class ha_rocksdb : public my_core::handler {
Rdb_string_writer m_sk_tails;
Rdb_string_writer m_pk_unpack_info;
- /*
- ha_rockdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save here
- mem-comparable form of the index lookup tuple.
- */
- uchar *m_sk_match_prefix;
- uint m_sk_match_length;
-
/* Second buffers, used by UPDATE. */
uchar *m_sk_packed_tuple_old;
Rdb_string_writer m_sk_tails_old;
@@ -221,6 +203,8 @@ class ha_rocksdb : public my_core::handler {
/* class to convert between Mysql format and RocksDB format*/
std::unique_ptr<Rdb_converter> m_converter;
+ std::unique_ptr<Rdb_iterator> m_iterator;
+
/*
Pointer to the original TTL timestamp value (8 bytes) during UPDATE.
*/
@@ -269,7 +253,7 @@ class ha_rocksdb : public my_core::handler {
uint m_total_blob_buffer_allocated = 0;
/* Type of locking to apply to rows */
- enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows;
+ Rdb_lock_type m_lock_rows;
/* true means we're doing an index-only read. false means otherwise. */
bool m_keyread_only;
@@ -327,17 +311,6 @@ class ha_rocksdb : public my_core::handler {
int secondary_index_read(const int keyno, uchar *const buf,
const rocksdb::Slice *value)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- static void setup_iterator_bounds(const Rdb_key_def &kd,
- const rocksdb::Slice &eq_cond,
- size_t bound_len, uchar *const lower_bound,
- uchar *const upper_bound,
- rocksdb::Slice *lower_bound_slice,
- rocksdb::Slice *upper_bound_slice);
- static bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
- const rocksdb::Slice &eq_cond);
- void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice,
- const uint eq_cond_len) MY_ATTRIBUTE((__nonnull__));
- void release_scan_iterator(void);
rocksdb::Status get_for_update(Rdb_transaction *const tx,
const Rdb_key_def &kd,
@@ -373,7 +346,6 @@ class ha_rocksdb : public my_core::handler {
MY_ATTRIBUTE((__warn_unused_result__));
bool is_blind_delete_enabled();
bool skip_unique_check() const;
- bool commit_in_the_middle() MY_ATTRIBUTE((__warn_unused_result__));
bool do_bulk_commit(Rdb_transaction *const tx)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
bool has_hidden_pk(const TABLE *const table) const
@@ -382,6 +354,7 @@ class ha_rocksdb : public my_core::handler {
void update_row_stats(const operation_type &type, ulonglong count = 1);
void set_last_rowkey(const uchar *const old_data);
+ void set_last_rowkey(const char *str, size_t len);
int alloc_key_buffers(const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg,
@@ -666,6 +639,8 @@ class ha_rocksdb : public my_core::handler {
THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
size_t bound_len, uchar *const lower_bound, uchar *const upper_bound,
rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice);
+ static bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
+ const rocksdb::Slice &eq_cond);
private:
// true <=> The scan uses the default MRR implementation, just redirect all
@@ -680,6 +655,7 @@ class ha_rocksdb : public my_core::handler {
friend class Mrr_rowid_source;
friend class Mrr_pk_scan_rowid_source;
friend class Mrr_sec_key_rowid_source;
+ friend class Rdb_iterator;
// MRR parameters and output values
rocksdb::Slice *mrr_keys;
@@ -778,11 +754,6 @@ class ha_rocksdb : public my_core::handler {
int compare_keys(const KEY *const old_key, const KEY *const new_key) const
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- bool should_hide_ttl_rec(const Rdb_key_def &kd,
- const rocksdb::Slice &ttl_rec_val,
- const int64_t curr_ts)
- MY_ATTRIBUTE((__warn_unused_result__));
-
int index_read_intern(uchar *const buf, const uchar *const key,
key_part_map keypart_map,
enum ha_rkey_function find_flag)
@@ -834,29 +805,6 @@ class ha_rocksdb : public my_core::handler {
const bool pk_changed)
MY_ATTRIBUTE((__warn_unused_result__));
- int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter,
- const rocksdb::Slice &key_slice,
- const int64_t ttl_filter_ts)
- MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- int read_before_key(const Rdb_key_def &kd, const bool using_full_key,
- const rocksdb::Slice &key_slice)
- MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice)
- MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- int position_to_correct_key(const Rdb_key_def &kd,
- const enum ha_rkey_function &find_flag,
- const bool full_key_match,
- const rocksdb::Slice &key_slice,
- bool *const move_forward)
- MY_ATTRIBUTE((__warn_unused_result__));
-
- int calc_eq_cond_len(const Rdb_key_def &kd,
- const enum ha_rkey_function &find_flag,
- const rocksdb::Slice &slice,
- const int bytes_changed_by_succ,
- const key_range *const end_key)
- MY_ATTRIBUTE((__warn_unused_result__));
-
Rdb_tbl_def *get_table_if_exists(const char *const tablename)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
void read_thd_vars(THD *const thd) MY_ATTRIBUTE((__nonnull__));
@@ -1196,15 +1144,28 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx);
rocksdb::Iterator *rdb_tx_get_iterator(
Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
- bool skip_bloom, bool fill_cache, const rocksdb::Slice &lower_bound_slice,
+ bool skip_bloom, const rocksdb::Slice &lower_bound_slice,
const rocksdb::Slice &upper_bound_slice, bool read_current = false,
bool create_snapshot = true);
+rocksdb::Iterator *rdb_tx_get_iterator(
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ const rocksdb::Slice &eq_cond_lower_bound,
+ const rocksdb::Slice &eq_cond_upper_bound,
+ const rocksdb::Snapshot **snapshot, bool read_current = false,
+ bool create_snapshot = true);
+
rocksdb::Status rdb_tx_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const rocksdb::Slice &key,
rocksdb::PinnableSlice *const value);
+rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const rocksdb::Slice &key,
+ rocksdb::PinnableSlice *const value,
+ bool exclusive);
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -1244,7 +1205,14 @@ inline void rocksdb_smart_prev(bool seek_backward,
// https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
bool is_valid_iterator(rocksdb::Iterator *scan_it);
+bool rdb_should_hide_ttl_rec(const Rdb_key_def &kd,
+ const rocksdb::Slice &ttl_rec_val,
+ Rdb_transaction *tx);
+
bool rdb_tx_started(Rdb_transaction *tx);
+int rdb_tx_set_status_error(Rdb_transaction *tx, const rocksdb::Status &s,
+ const Rdb_key_def &kd,
+ const Rdb_tbl_def *const tbl_def);
extern std::atomic<uint64_t> rocksdb_select_bypass_executed;
extern std::atomic<uint64_t> rocksdb_select_bypass_rejected;
diff --git a/storage/rocksdb/ha_rocksdb_proto.h b/storage/rocksdb/ha_rocksdb_proto.h
index e72d666a781..58eee57ac27 100644
--- a/storage/rocksdb/ha_rocksdb_proto.h
+++ b/storage/rocksdb/ha_rocksdb_proto.h
@@ -105,7 +105,7 @@ bool rdb_sync_wal_supported();
enum operation_type : int;
void rdb_update_global_stats(const operation_type &type, uint count,
- bool is_system_table = false);
+ Rdb_tbl_def *td = nullptr);
class Rdb_dict_manager;
Rdb_dict_manager *rdb_get_dict_manager(void)
diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc
index 1788486ee9f..f13b94b32f1 100644
--- a/storage/rocksdb/nosql_access.cc
+++ b/storage/rocksdb/nosql_access.cc
@@ -678,8 +678,8 @@ class select_exec {
bool use_bloom,
const rocksdb::Slice &lower_bound,
const rocksdb::Slice &upper_bound) {
- return rdb_tx_get_iterator(m_tx, cf, !use_bloom, true /* fill_cache */,
- lower_bound, upper_bound);
+ return rdb_tx_get_iterator(m_tx, cf, !use_bloom, lower_bound,
+ upper_bound);
}
rocksdb::Status get(rocksdb::ColumnFamilyHandle *cf,
diff --git a/storage/rocksdb/rdb_converter.h b/storage/rocksdb/rdb_converter.h
index 2e6f1ed9689..e121215c2e8 100644
--- a/storage/rocksdb/rdb_converter.h
+++ b/storage/rocksdb/rdb_converter.h
@@ -173,12 +173,12 @@ class Rdb_converter {
}
const MY_BITMAP *get_lookup_bitmap() { return &m_lookup_bitmap; }
+ private:
int decode_value_header_for_pk(Rdb_string_reader *reader,
const std::shared_ptr<Rdb_key_def> &pk_def,
rocksdb::Slice *unpack_slice);
- private:
void setup_field_encoders();
void get_storage_type(Rdb_field_encoder *const encoder, const uint kp);
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
new file mode 100644
index 00000000000..529cd6dacae
--- /dev/null
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -0,0 +1,359 @@
+/*
+ Copyright (c) 2020, Facebook, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#include "./rdb_iterator.h"
+
+#include "scope_guard.h"
+
+namespace myrocks {
+
+Rdb_iterator::~Rdb_iterator() {}
+
+Rdb_iterator_base::Rdb_iterator_base(THD *thd,
+ const std::shared_ptr<Rdb_key_def> kd,
+ const std::shared_ptr<Rdb_key_def> pkd,
+ const Rdb_tbl_def *tbl_def)
+ : m_kd(kd),
+ m_pkd(pkd),
+ m_tbl_def(tbl_def),
+ m_thd(thd),
+ m_scan_it(nullptr),
+ m_scan_it_skips_bloom(false),
+ m_scan_it_snapshot(nullptr),
+ m_scan_it_lower_bound(nullptr),
+ m_scan_it_upper_bound(nullptr),
+ m_prefix_buf(nullptr) {}
+
+Rdb_iterator_base::~Rdb_iterator_base() {
+ release_scan_iterator();
+ my_free(m_scan_it_lower_bound);
+ m_scan_it_lower_bound = nullptr;
+ my_free(m_scan_it_upper_bound);
+ m_scan_it_upper_bound = nullptr;
+ my_free(m_prefix_buf);
+ m_prefix_buf = nullptr;
+}
+
+int Rdb_iterator_base::read_before_key(const bool full_key_match,
+ const rocksdb::Slice &key_slice) {
+ /*
+ We are looking for the first record such that
+
+ index_tuple $LT lookup_tuple
+
+ with HA_READ_BEFORE_KEY, $LT = '<',
+ with HA_READ_PREFIX_LAST_OR_PREV, $LT = '<='
+ with HA_READ_PREFIX_LAST, $LT = '=='
+
+    Symmetry with read_after_key would be possible if rocksdb supported prefix seeks.
+ */
+ rocksdb_smart_seek(!m_kd->m_is_reverse_cf, m_scan_it, key_slice);
+
+ while (is_valid_iterator(m_scan_it)) {
+ if (thd_killed(m_thd)) {
+ return HA_ERR_QUERY_INTERRUPTED;
+ }
+ /*
+ We are using full key and we've hit an exact match.
+ */
+ if ((full_key_match &&
+ m_kd->value_matches_prefix(m_scan_it->key(), key_slice))) {
+ rocksdb_smart_next(!m_kd->m_is_reverse_cf, m_scan_it);
+ continue;
+ }
+
+ return HA_EXIT_SUCCESS;
+ }
+
+ return HA_ERR_END_OF_FILE;
+}
+
+int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
+ /*
+ We are looking for the first record such that
+
+ index_tuple $GT lookup_tuple
+
+ with HA_READ_AFTER_KEY, $GT = '>',
+ with HA_READ_KEY_OR_NEXT, $GT = '>='
+ with HA_READ_KEY_EXACT, $GT = '=='
+ */
+ rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice);
+
+ return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE;
+}
+
+void Rdb_iterator_base::release_scan_iterator() {
+ delete m_scan_it;
+ m_scan_it = nullptr;
+
+ if (m_scan_it_snapshot) {
+ auto rdb = rdb_get_rocksdb_db();
+ rdb->ReleaseSnapshot(m_scan_it_snapshot);
+ m_scan_it_snapshot = nullptr;
+ }
+}
+
+void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
+ const uint eq_cond_len,
+ bool read_current) {
+ DBUG_ASSERT(slice->size() >= eq_cond_len);
+
+ bool skip_bloom = true;
+
+ const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
+
+ // The size of m_scan_it_lower_bound (and upper) is technically
+ // max_packed_sk_len as calculated in ha_rocksdb::alloc_key_buffers. Rather
+ // than recalculating that number, we pass in the max of eq_cond_len and
+ // Rdb_key_def::INDEX_NUMBER_SIZE which is guaranteed to be smaller than
+ // max_packed_sk_len, hence ensuring no buffer overrun.
+ //
+ // See setup_iterator_bounds on how the bound_len parameter is
+ // used.
+ if (ha_rocksdb::check_bloom_and_set_bounds(
+ m_thd, *m_kd, eq_cond,
+ std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_ID_SIZE),
+ m_scan_it_lower_bound, m_scan_it_upper_bound,
+ &m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
+ skip_bloom = false;
+ }
+
+ /*
+ In some cases, setup_scan_iterator() is called multiple times from
+ the same query but bloom filter can not always be used.
+ Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
+ (id1, id2).
+ select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
+ '100');
+ In this case, setup_scan_iterator() is called twice, the first time is for
+ (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
+ '100').
+  If the prefix bloom filter length is 24 bytes, the prefix bloom filter
+  can be used for the first condition but not for the second condition.
+
+  If the bloom filter condition is changed, it is currently necessary
+  to destroy and re-create the iterator, since the bloom filter setting
+  is fixed when the iterator is created.
+ */
+ if (m_scan_it_skips_bloom != skip_bloom) {
+ release_scan_iterator();
+ }
+
+ /*
+ SQL layer can call rnd_init() multiple times in a row.
+ In that case, re-use the iterator, but re-position it at the table start.
+ */
+ if (!m_scan_it) {
+ m_scan_it = rdb_tx_get_iterator(
+ m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice,
+ m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current,
+ !read_current);
+ m_scan_it_skips_bloom = skip_bloom;
+ }
+}
+
+int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag,
+ const rocksdb::Slice &start_key,
+ const int bytes_changed_by_succ,
+ const rocksdb::Slice &end_key) {
+ if (find_flag == HA_READ_KEY_EXACT) return start_key.size();
+
+ if (find_flag == HA_READ_PREFIX_LAST) {
+ /*
+      We have made the kd.successor(start_key) call in seek() above.
+
+ The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
+ */
+ return start_key.size() - bytes_changed_by_succ;
+ }
+
+ if (!end_key.empty()) {
+ /*
+ Calculating length of the equal conditions here. 4 byte index id is
+ included.
+ Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
+ WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
+ WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
+ Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
+ WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
+ */
+ return start_key.difference_offset(end_key);
+ }
+
+ /*
+ On range scan without any end key condition, there is no
+ eq cond, and eq cond length is the same as index_id size (4 bytes).
+ Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
+ WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
+ */
+ return Rdb_key_def::INDEX_ID_SIZE;
+}
+
+int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
+ int rc = 0;
+ const auto &kd = *m_kd;
+ Rdb_transaction *const tx = get_tx_from_thd(m_thd);
+
+ for (;;) {
+ DEBUG_SYNC(m_thd, "rocksdb.check_flags_nwd");
+ if (thd_killed(m_thd)) {
+ rc = HA_ERR_QUERY_INTERRUPTED;
+ break;
+ }
+
+ DBUG_ASSERT(m_scan_it != nullptr);
+ if (m_scan_it == nullptr) {
+ rc = HA_ERR_INTERNAL_ERROR;
+ break;
+ }
+
+ if (skip_next) {
+ skip_next = false;
+ } else {
+ if (move_forward) {
+ rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
+ } else {
+ rocksdb_smart_prev(kd.m_is_reverse_cf, m_scan_it);
+ }
+ }
+
+ if (!is_valid_iterator(m_scan_it)) {
+ rc = HA_ERR_END_OF_FILE;
+ break;
+ }
+
+ const rocksdb::Slice &key = m_scan_it->key();
+ const rocksdb::Slice &value = m_scan_it->value();
+
+ // Outside our range, return EOF.
+ if (!kd.value_matches_prefix(key, m_prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ break;
+ }
+
+ // Record is not visible due to TTL, move to next record.
+ if (m_pkd->has_ttl() && rdb_should_hide_ttl_rec(kd, value, tx)) {
+ continue;
+ }
+
+ break;
+ }
+
+ return rc;
+}
+
+int Rdb_iterator_base::seek(enum ha_rkey_function find_flag,
+ const rocksdb::Slice start_key, bool full_key_match,
+ const rocksdb::Slice end_key, bool read_current) {
+ int rc = 0;
+
+ uint prefix_key_len;
+
+ if (!m_prefix_buf) {
+ const uint packed_len = m_kd->max_storage_fmt_length();
+ m_scan_it_lower_bound = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+ m_scan_it_upper_bound = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+ m_prefix_buf = reinterpret_cast<uchar *>(
+ my_malloc(PSI_NOT_INSTRUMENTED, packed_len, MYF(0)));
+ }
+
+ if (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST) {
+ memcpy(m_prefix_buf, start_key.data(), start_key.size());
+ prefix_key_len = start_key.size();
+ } else {
+ m_kd->get_infimum_key(m_prefix_buf, &prefix_key_len);
+ }
+ m_prefix_tuple = rocksdb::Slice((char *)m_prefix_buf, prefix_key_len);
+
+ int bytes_changed_by_succ = 0;
+ uchar *start_key_buf = (uchar *)start_key.data();
+ // We need to undo mutating the start key in case of retries using the same
+ // buffer.
+ auto start_key_guard = create_scope_guard([this, start_key_buf, start_key] {
+ this->m_kd->predecessor(start_key_buf, start_key.size());
+ });
+ if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
+ find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
+ bytes_changed_by_succ = m_kd->successor(start_key_buf, start_key.size());
+ } else {
+ start_key_guard.commit();
+ }
+
+ const uint eq_cond_len =
+ calc_eq_cond_len(find_flag, start_key, bytes_changed_by_succ, end_key);
+
+ /*
+ This will open the iterator and position it at a record that's equal or
+ greater than the lookup tuple.
+ */
+ setup_scan_iterator(&start_key, eq_cond_len, read_current);
+
+ /*
+ Once we are positioned on from above, move to the position we really
+ want: See storage/rocksdb/rocksdb-range-access.txt
+ */
+ bool direction = (find_flag == HA_READ_KEY_EXACT) ||
+ (find_flag == HA_READ_AFTER_KEY) ||
+ (find_flag == HA_READ_KEY_OR_NEXT);
+ if (direction) {
+ rc = read_after_key(start_key);
+ } else {
+ rc = read_before_key(full_key_match, start_key);
+ }
+
+ if (rc) {
+ return rc;
+ }
+
+ rc = next_with_direction(direction, true);
+ return rc;
+}
+
+int Rdb_iterator_base::get(const rocksdb::Slice *key,
+ rocksdb::PinnableSlice *value, Rdb_lock_type type,
+ bool skip_ttl_check) {
+ int rc = HA_EXIT_SUCCESS;
+ Rdb_transaction *const tx = get_tx_from_thd(m_thd);
+ rocksdb::Status s;
+ if (type == RDB_LOCK_NONE) {
+ s = rdb_tx_get(tx, m_kd->get_cf(), *key, value);
+ } else {
+ s = rdb_tx_get_for_update(tx, *m_kd, *key, value, type == RDB_LOCK_WRITE);
+ }
+
+ DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
+ { s = rocksdb::Status::Corruption(); });
+
+ if (!s.IsNotFound() && !s.ok()) {
+ return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
+ }
+
+ if (s.IsNotFound()) {
+ return HA_ERR_KEY_NOT_FOUND;
+ }
+
+ if (!skip_ttl_check && m_kd->has_ttl() &&
+ rdb_should_hide_ttl_rec(*m_kd, *value, tx)) {
+ return HA_ERR_KEY_NOT_FOUND;
+ }
+
+ return rc;
+}
+
+} // namespace myrocks
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
new file mode 100644
index 00000000000..2a0f5bd5760
--- /dev/null
+++ b/storage/rocksdb/rdb_iterator.h
@@ -0,0 +1,121 @@
+/*
+ Copyright (c) 2020, Facebook, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#pragma once
+
+// MySQL header files
+#include "sql/debug_sync.h"
+#include "sql/handler.h"
+
+// MyRocks header files
+#include "./ha_rocksdb.h"
+#include "./ha_rocksdb_proto.h"
+#include "./rdb_converter.h"
+#include "./rdb_datadic.h"
+
+namespace myrocks {
+
+class Rdb_iterator {
+ public:
+ virtual ~Rdb_iterator() = 0;
+
+ /*
+  The find_flag determines which logical direction the table is scanned in.
+  start_key is inclusive when scanning forwards, but exclusive when scanning
+  backwards. full_key_match indicates whether the seek key may match the
+  full key.
+  Once rocksdb supports prefix seeks, the API can be simplified since
+  full_key_match is no longer needed.
+ */
+ virtual int seek(enum ha_rkey_function find_flag,
+ const rocksdb::Slice start_key, bool full_key_match,
+ const rocksdb::Slice end_key, bool read_current = false) = 0;
+ virtual int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
+ Rdb_lock_type type, bool skip_ttl_check = false) = 0;
+ virtual int next() = 0;
+ virtual int prev() = 0;
+ virtual rocksdb::Slice key() = 0;
+ virtual rocksdb::Slice value() = 0;
+ virtual void reset() = 0;
+};
+
+class Rdb_iterator_base : public Rdb_iterator {
+ private:
+ int read_before_key(const bool full_key_match,
+ const rocksdb::Slice &key_slice);
+ int read_after_key(const rocksdb::Slice &key_slice);
+ void release_scan_iterator();
+ void setup_scan_iterator(const rocksdb::Slice *const slice,
+ const uint eq_cond_len, bool read_current);
+ int calc_eq_cond_len(enum ha_rkey_function find_flag,
+ const rocksdb::Slice &start_key,
+ const int bytes_changed_by_succ,
+ const rocksdb::Slice &end_key);
+ int next_with_direction(bool move_forward, bool skip_next);
+
+ public:
+ Rdb_iterator_base(THD *thd, const std::shared_ptr<Rdb_key_def> kd,
+ const std::shared_ptr<Rdb_key_def> pkd,
+ const Rdb_tbl_def *tbl_def);
+
+ ~Rdb_iterator_base() override;
+
+ int seek(enum ha_rkey_function find_flag, const rocksdb::Slice start_key,
+ bool full_key_match, const rocksdb::Slice end_key,
+ bool read_current) override;
+ int get(const rocksdb::Slice *key, rocksdb::PinnableSlice *value,
+ Rdb_lock_type type, bool skip_ttl_check = false) override;
+
+ int next() override { return next_with_direction(true, false); }
+
+ int prev() override { return next_with_direction(false, false); }
+
+ rocksdb::Slice key() override { return m_scan_it->key(); }
+
+ rocksdb::Slice value() override { return m_scan_it->value(); }
+
+ void reset() override { release_scan_iterator(); }
+
+ protected:
+ friend class Rdb_iterator;
+ const std::shared_ptr<Rdb_key_def> m_kd;
+
+ // Rdb_key_def of the primary key
+ const std::shared_ptr<Rdb_key_def> m_pkd;
+
+ const Rdb_tbl_def *m_tbl_def;
+
+ THD *m_thd;
+
+ /* Iterator used for range scans and for full table/index scans */
+ rocksdb::Iterator *m_scan_it;
+
+ /* Whether m_scan_it was created with skip_bloom=true */
+ bool m_scan_it_skips_bloom;
+
+ const rocksdb::Snapshot *m_scan_it_snapshot;
+
+ /* Buffers used for upper/lower bounds for m_scan_it. */
+ uchar *m_scan_it_lower_bound;
+ uchar *m_scan_it_upper_bound;
+ rocksdb::Slice m_scan_it_lower_bound_slice;
+ rocksdb::Slice m_scan_it_upper_bound_slice;
+
+ uchar *m_prefix_buf;
+ rocksdb::Slice m_prefix_tuple;
+};
+
+} // namespace myrocks
1
0
17 May '21
revision-id: 0f4980afd8faa61b23a3008d3cba726881072174 (percona-202102-51-g0f4980afd8f)
parent(s): d89e160443823d0f07691e99067e15b06d532ccc
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:37:01 +0300
message:
Apply patch: Use Get for secondary key point lookups
Summary: Today for secondary keys where we have the full key, we are still using an iterator to read the key when a point query would suffice. This is more efficient, but also makes the bloom filter code cleaner.
Test Plan: mtr
Reviewers: luqun, herman, yzha, #mysql_eng
Subscribers: pgl
Differential Revision: https://phabricator.intern.facebook.com/D25906968
---
mysql-test/suite/rocksdb/r/bloomfilter.result | 120 +++++++-------
mysql-test/suite/rocksdb/r/rocksdb_range.result | 12 ++
.../suite/rocksdb/t/bloomfilter_load_select.inc | 2 +
mysql-test/suite/rocksdb/t/rocksdb_range.test | 8 +
storage/rocksdb/ha_rocksdb.cc | 179 +++++++++++++++------
storage/rocksdb/ha_rocksdb.h | 20 ++-
storage/rocksdb/nosql_access.cc | 5 +-
7 files changed, 226 insertions(+), 120 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/bloomfilter.result b/mysql-test/suite/rocksdb/r/bloomfilter.result
index 7c5b479da2e..cd2d2b671f9 100644
--- a/mysql-test/suite/rocksdb/r/bloomfilter.result
+++ b/mysql-test/suite/rocksdb/r/bloomfilter.result
@@ -121,28 +121,28 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=23 and id4=115 and id5=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=500 and id4=2500 and id5=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=601 and id4=3005 and id5=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=1;
count(*)
@@ -240,14 +240,14 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=12 and id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2_id3) where id2=1 and id3='1';
count(*)
@@ -331,42 +331,42 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=23 and id4=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=500 and id4=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=601 and id4=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='1' and id4=1;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2) where id2=1;
count(*)
@@ -519,28 +519,28 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=23 and id4=115 and id5=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=500 and id4=2500 and id5=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=601 and id4=3005 and id5=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=1;
count(*)
@@ -638,14 +638,14 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=12 and id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2_id3) where id2=1 and id3='1';
count(*)
@@ -729,42 +729,42 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=23 and id4=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=500 and id4=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=601 and id4=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='1' and id4=1;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2) where id2=1;
count(*)
@@ -917,28 +917,28 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=23 and id4=115 and id5=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=500 and id4=2500 and id5=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=601 and id4=3005 and id5=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=1;
count(*)
@@ -1036,14 +1036,14 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=12 and id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2_id3) where id2=1 and id3='1';
count(*)
@@ -1127,42 +1127,42 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=23 and id4=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=500 and id4=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=601 and id4=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='1' and id4=1;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2) where id2=1;
count(*)
@@ -1315,28 +1315,28 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=23 and id4=115 and id5=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=500 and id4=2500 and id5=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=601 and id4=3005 and id5=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=1;
count(*)
@@ -1434,14 +1434,14 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=12 and id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2_id3) where id2=1 and id3='1';
count(*)
@@ -1525,42 +1525,42 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=23 and id4=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=500 and id4=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=601 and id4=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='1' and id4=1;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2) where id2=1;
count(*)
@@ -1713,28 +1713,28 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=23 and id4=115 and id5=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=500 and id4=2500 and id5=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id4_id5) where id2=601 and id4=3005 and id5=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=1;
count(*)
@@ -1832,14 +1832,14 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2_id3) where id2=12 and id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2_id3) where id2=1 and id3='1';
count(*)
@@ -1923,42 +1923,42 @@ count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=23 and id4=115;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=500 and id4=2500;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id2) where id2=601 and id4=3005;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='1' and id4=1;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t2 force index (id3_id4) where id3='12' and id4=60;
count(*)
1
call bloom_end();
checked
-true
+false
call bloom_start();
select count(*) from t1 force index (id2) where id2=1;
count(*)
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_range.result b/mysql-test/suite/rocksdb/r/rocksdb_range.result
index a3d7839712d..4352f579759 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_range.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_range.result
@@ -1,6 +1,7 @@
select * from information_schema.engines where engine = 'rocksdb';
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
ROCKSDB DEFAULT RocksDB storage engine YES YES YES
+set optimizer_force_index_for_range = on;
drop table if exists t0,t1,t2,t3,t4,t5;
create table t0 (a int) engine=myisam;
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
@@ -64,6 +65,16 @@ Note 1003 /* select#1 */ select `test`.`t2`.`pk` AS `pk`,`test`.`t2`.`a` AS `a`,
select * from t2 force index (a) where a=3 and pk=33;
pk a b
33 3 33
+explain
+select * from t2 force index (a) where a=3 and pk in (33, 34);
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t2 NULL range a a 8 NULL # # Using index condition
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t2`.`pk` AS `pk`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` FORCE INDEX (`a`) where ((`test`.`t2`.`a` = 3) and (`test`.`t2`.`pk` in (33,34)))
+select * from t2 force index (a) where a=3 and pk in (33, 34);
+pk a b
+33 3 33
+34 3 34
select * from t2 force index (a) where a=99 and pk=99;
pk a b
select * from t2 force index (a) where a=0 and pk=0;
@@ -316,3 +327,4 @@ Note 1003 /* select#1 */ select `test`.`t5`.`pk` AS `pk`,`test`.`t5`.`a` AS `a`,
select * from t5 where a=5 and b in (4) order by c desc;
pk a b c
drop table t0,t1,t2,t3,t4,t5;
+set optimizer_force_index_for_range = off;
diff --git a/mysql-test/suite/rocksdb/t/bloomfilter_load_select.inc b/mysql-test/suite/rocksdb/t/bloomfilter_load_select.inc
index 1f1a4b9810f..504507f9a35 100644
--- a/mysql-test/suite/rocksdb/t/bloomfilter_load_select.inc
+++ b/mysql-test/suite/rocksdb/t/bloomfilter_load_select.inc
@@ -187,3 +187,5 @@ call bloom_start();
select count(*) from t2 force index (id3_id5) where id3='240' and id5=1200;
call bloom_end();
+--remove_file $MYSQL_TMP_DIR/insert_t1.sql
+--remove_file $MYSQL_TMP_DIR/insert_t2.sql
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_range.test b/mysql-test/suite/rocksdb/t/rocksdb_range.test
index 2f5928e6351..d9a39d67de4 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_range.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_range.test
@@ -5,6 +5,8 @@
#
select * from information_schema.engines where engine = 'rocksdb';
+set optimizer_force_index_for_range = on;
+
--disable_warnings
drop table if exists t0,t1,t2,t3,t4,t5;
--enable_warnings
@@ -46,6 +48,11 @@ explain
select * from t2 force index (a) where a=3 and pk=33;
select * from t2 force index (a) where a=3 and pk=33;
+--replace_column 10 # 11 #
+explain
+select * from t2 force index (a) where a=3 and pk in (33, 34);
+select * from t2 force index (a) where a=3 and pk in (33, 34);
+
select * from t2 force index (a) where a=99 and pk=99;
select * from t2 force index (a) where a=0 and pk=0;
select * from t2 force index (a) where a=-1;
@@ -191,3 +198,4 @@ select * from t5 where a=5 and b in (4) order by c desc;
select * from t5 where a=5 and b in (4) order by c desc;
drop table t0,t1,t2,t3,t4,t5;
+set optimizer_force_index_for_range = off;
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 91791b337af..2f41ba40a17 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -8974,7 +8974,8 @@ ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
m_scan_it points at the index key-value pair that we should read the (pk,row)
pair for.
*/
-int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
+int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf,
+ const rocksdb::Slice *value) {
DBUG_ASSERT(table != nullptr);
if (m_iteration_only) {
@@ -8983,7 +8984,6 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
}
int rc = 0;
- rocksdb::Slice value = m_scan_it->value();
#ifndef DBUG_OFF
bool save_keyread_only = m_keyread_only;
@@ -8992,7 +8992,7 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
bool covered_lookup =
(m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
m_key_descr_arr[keyno]->covers_lookup(
- &value, m_converter->get_lookup_bitmap());
+ value, m_converter->get_lookup_bitmap());
#ifndef DBUG_OFF
m_keyread_only = save_keyread_only;
#endif
@@ -9045,6 +9045,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
const Rdb_key_def &kd = *m_key_descr_arr[active_index_pos()];
bool using_full_key = false;
+ m_full_key_lookup = false;
uint packed_size;
@@ -9060,6 +9061,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
Handle some special cases when we do exact key lookups.
*/
if (find_flag == HA_READ_KEY_EXACT && using_full_key) {
+ m_full_key_lookup = true;
if (active_index == table->s->primary_key) {
/*
Equality lookup over primary key, using full tuple.
@@ -9122,6 +9124,26 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
rc = get_row_by_rowid(buf, m_last_rowkey.ptr(),
m_last_rowkey.length());
+ release_scan_iterator();
+ DBUG_RETURN(rc);
+ }
+
+ if (using_full_key) {
+ packed_size = kd.pack_index_tuple(
+ table, m_pack_buffer, m_sk_packed_tuple, key, keypart_map);
+ rocksdb::Slice key_slice(
+ reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
+
+ rc = get_row_by_sk(buf, kd, &key_slice);
+
+ if (!rc) {
+ /* TODO(yzha) - row stats are gone in 8.0
+ stats.rows_read++;
+ stats.rows_index_first++; */
+ update_row_stats(ROWS_READ);
+ }
+
+ release_scan_iterator();
DBUG_RETURN(rc);
}
}
@@ -9165,9 +9187,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
This will open the iterator and position it at a record that's equal or
greater than the lookup tuple.
*/
- setup_scan_iterator(kd, &slice,
- using_full_key && (find_flag == HA_READ_KEY_EXACT),
- eq_cond_len);
+ setup_scan_iterator(kd, &slice, eq_cond_len);
/*
Once we are positioned on from above, move to the position we really
@@ -9584,6 +9604,50 @@ int ha_rocksdb::records_from_index(ha_rows *num_rows, uint index) {
return count;
}
+/*
+ The analogous function to ha_rocksdb::get_row_by_rowid for performing
+ secondary key lookups.
+
+ @return
+ HA_EXIT_SUCCESS OK
+ other HA_ERR error code (can be SE-specific)
+*/
+int ha_rocksdb::get_row_by_sk(uchar *buf, const Rdb_key_def &kd,
+ const rocksdb::Slice *key) {
+ DBUG_ENTER_FUNC();
+ Rdb_transaction *const tx = get_or_create_tx(table->in_use);
+
+ auto s = tx->get(kd.get_cf(), *key, &m_retrieved_record);
+
+ if (!s.IsNotFound() && !s.ok()) {
+ DBUG_RETURN(
+ tx->set_status_error(table->in_use, s, kd, m_tbl_def, m_table_handler));
+ }
+
+ if (s.IsNotFound()) {
+ DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+ }
+
+ if (kd.has_ttl() &&
+ should_hide_ttl_rec(kd, m_retrieved_record, tx->m_snapshot_timestamp)) {
+ DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+ }
+
+ const uint size =
+ kd.get_primary_key_tuple(table, *m_pk_descr, key, m_pk_packed_tuple);
+ if (size == RDB_INVALID_KEY_LEN) {
+ DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
+ }
+
+ m_last_rowkey.copy((const char *)m_pk_packed_tuple, size, &my_charset_bin);
+
+ int rc = secondary_index_read(active_index, buf, &m_retrieved_record);
+ if (!rc) {
+ table->m_status = 0;
+ }
+ DBUG_RETURN(rc);
+}
+
/**
@return
HA_EXIT_SUCCESS OK
@@ -9596,6 +9660,26 @@ int ha_rocksdb::index_next(uchar *const buf) {
DBUG_RETURN(index_next_with_direction_intern(buf, true, false));
}
+/**
+ @return
+ HA_EXIT_SUCCESS OK
+ other HA_ERR error code (can be SE-specific)
+*/
+int ha_rocksdb::index_next_same(uchar *const buf,
+ const uchar *key MY_ATTRIBUTE((unused)),
+ uint keylen MY_ATTRIBUTE((unused))) {
+ DBUG_ENTER_FUNC();
+
+ if (m_full_key_lookup) {
+#ifndef DBUG_OFF
+ uint len = calculate_key_len(table, active_index, HA_WHOLE_KEY);
+ DBUG_ASSERT(len == keylen);
+#endif
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+ }
+ DBUG_RETURN(index_next(buf));
+}
+
/**
@return
HA_EXIT_SUCCESS OK
@@ -9706,7 +9790,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
&my_charset_bin);
- rc = secondary_index_read(active_index, buf);
+ rc = secondary_index_read(active_index, buf, &value);
}
if (!should_skip_invalidated_record(rc)) {
@@ -10362,35 +10446,48 @@ int ha_rocksdb::check_and_lock_sk(
rocksdb::Slice lower_bound_slice;
rocksdb::Slice upper_bound_slice;
- const bool total_order_seek = !check_bloom_and_set_bounds(
- ha_thd(), kd, new_slice, all_parts_used, Rdb_key_def::INDEX_ID_SIZE,
- lower_bound_buf, upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
- const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
-
- const rocksdb::Status s = get_for_update(row_info.tx, kd, new_slice, nullptr);
+ const rocksdb::Status s =
+ get_for_update(row_info.tx, kd, new_slice,
+ all_parts_used ? &m_retrieved_record : nullptr);
if (!s.ok() && !s.IsNotFound()) {
return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
m_table_handler);
}
- rocksdb::Iterator *const iter = row_info.tx->get_iterator(
- kd.get_cf(), total_order_seek, fill_cache, lower_bound_slice,
- upper_bound_slice, true /* read current data */,
- false /* acquire snapshot */);
- /*
- Need to scan the transaction to see if there is a duplicate key.
- Also need to scan RocksDB and verify the key has not been deleted
- in the transaction.
- */
- DBUG_ASSERT(row_info.tx->has_snapshot() &&
- row_info.tx->m_snapshot_timestamp != 0);
- *found =
- !read_key_exact(kd, iter, new_slice, row_info.tx->m_snapshot_timestamp);
+ rocksdb::Iterator *iter = nullptr;
+
+ if (all_parts_used) {
+ *found = !s.IsNotFound();
+ if (*found && kd.has_ttl() &&
+ should_hide_ttl_rec(kd, m_retrieved_record,
+ row_info.tx->m_snapshot_timestamp)) {
+ *found = false;
+ }
+ } else {
+ const bool total_order_seek = !check_bloom_and_set_bounds(
+ ha_thd(), kd, new_slice, Rdb_key_def::INDEX_ID_SIZE, lower_bound_buf,
+ upper_bound_buf, &lower_bound_slice, &upper_bound_slice);
+ const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
+
+ iter = row_info.tx->get_iterator(kd.get_cf(), total_order_seek, fill_cache,
+ lower_bound_slice, upper_bound_slice,
+ true /* read current data */,
+ false /* acquire snapshot */);
+ /*
+ Need to scan the transaction to see if there is a duplicate key.
+ Also need to scan RocksDB and verify the key has not been deleted
+ in the transaction.
+ */
+ DBUG_ASSERT(row_info.tx->has_snapshot() &&
+ row_info.tx->m_snapshot_timestamp != 0);
+ *found =
+ !read_key_exact(kd, iter, new_slice, row_info.tx->m_snapshot_timestamp);
+ }
int rc = HA_EXIT_SUCCESS;
if (*found && m_insert_with_update) {
- const rocksdb::Slice &rkey = iter->key();
+ const rocksdb::Slice &rkey = all_parts_used ? new_slice : iter->key();
uint pk_size =
kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
if (pk_size == RDB_INVALID_KEY_LEN) {
@@ -10979,7 +11076,6 @@ void ha_rocksdb::setup_iterator_bounds(
void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
rocksdb::Slice *const slice,
- const bool use_all_keys,
const uint eq_cond_len) {
DBUG_ASSERT(slice->size() >= eq_cond_len);
@@ -10997,7 +11093,7 @@ void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
// See ha_rocksdb::setup_iterator_bounds on how the bound_len parameter is
// used.
if (check_bloom_and_set_bounds(
- ha_thd(), kd, eq_cond, use_all_keys,
+ ha_thd(), kd, eq_cond,
std::max(eq_cond_len, (uint)Rdb_key_def::INDEX_ID_SIZE),
m_scan_it_lower_bound, m_scan_it_upper_bound,
&m_scan_it_lower_bound_slice, &m_scan_it_upper_bound_slice)) {
@@ -14955,10 +15051,9 @@ bool ha_rocksdb::can_assume_tracked(THD *thd) {
bool ha_rocksdb::check_bloom_and_set_bounds(
THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
- const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
- uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
- rocksdb::Slice *upper_bound_slice) {
- bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
+ size_t bound_len, uchar *const lower_bound, uchar *const upper_bound,
+ rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice) {
+ bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond);
if (!can_use_bloom && (THDVAR(thd, enable_iterate_bounds))) {
setup_iterator_bounds(kd, eq_cond, bound_len, lower_bound, upper_bound,
lower_bound_slice, upper_bound_slice);
@@ -14981,12 +15076,9 @@ bool ha_rocksdb::check_bloom_and_set_bounds(
@param kd
@param eq_cond Equal condition part of the key. This always includes
system index id (4 bytes).
- @param use_all_keys True if all key parts are set with equal conditions.
- This is aware of extended keys.
*/
bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
- const rocksdb::Slice &eq_cond,
- const bool use_all_keys) {
+ const rocksdb::Slice &eq_cond) {
bool can_use = false;
if (THDVAR(thd, skip_bloom_filter_on_read)) {
@@ -15013,18 +15105,7 @@ bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
shorter require all parts of the key to be available
for the short key match.
*/
- if ((use_all_keys && prefix_extractor->InRange(eq_cond)) ||
- prefix_extractor->SameResultWhenAppended(eq_cond)) {
- can_use = true;
- } else {
- can_use = false;
- }
- } else {
- /*
- if prefix extractor is not defined, all key parts have to be
- used by eq_cond.
- */
- if (use_all_keys) {
+ if (prefix_extractor->SameResultWhenAppended(eq_cond)) {
can_use = true;
} else {
can_use = false;
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 11913a3aec6..9c3c3927498 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -278,6 +278,8 @@ class ha_rocksdb : public my_core::handler {
bool m_iteration_only;
bool m_rnd_scan_started;
+ bool m_full_key_lookup = false;
+
/*
true means INSERT ON DUPLICATE KEY UPDATE. In such case we can optimize by
remember the failed attempt (if there is one that violates uniqueness check)
@@ -322,7 +324,8 @@ class ha_rocksdb : public my_core::handler {
const TABLE *const old_table_arg = nullptr,
const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const
MY_ATTRIBUTE((__nonnull__(2, 3), __warn_unused_result__));
- int secondary_index_read(const int keyno, uchar *const buf)
+ int secondary_index_read(const int keyno, uchar *const buf,
+ const rocksdb::Slice *value)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
static void setup_iterator_bounds(const Rdb_key_def &kd,
const rocksdb::Slice &eq_cond,
@@ -331,11 +334,9 @@ class ha_rocksdb : public my_core::handler {
rocksdb::Slice *lower_bound_slice,
rocksdb::Slice *upper_bound_slice);
static bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
- const rocksdb::Slice &eq_cond,
- const bool use_all_keys);
+ const rocksdb::Slice &eq_cond);
void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice,
- const bool use_all_keys, const uint eq_cond_len)
- MY_ATTRIBUTE((__nonnull__));
+ const uint eq_cond_len) MY_ATTRIBUTE((__nonnull__));
void release_scan_iterator(void);
rocksdb::Status get_for_update(Rdb_transaction *const tx,
@@ -354,6 +355,8 @@ class ha_rocksdb : public my_core::handler {
return get_row_by_rowid(buf, reinterpret_cast<const char *>(rowid),
rowid_size, skip_lookup, skip_ttl_check);
}
+ int get_row_by_sk(uchar *buf, const Rdb_key_def &kd,
+ const rocksdb::Slice *key);
void load_auto_incr_value();
ulonglong load_auto_incr_value_from_index();
@@ -628,6 +631,8 @@ class ha_rocksdb : public my_core::handler {
int index_next(uchar *const buf) override
MY_ATTRIBUTE((__warn_unused_result__));
+ int index_next_same(uchar *const buf, const uchar *key, uint keylen) override
+ MY_ATTRIBUTE((__warn_unused_result__));
int index_prev(uchar *const buf) override
MY_ATTRIBUTE((__warn_unused_result__));
@@ -659,9 +664,8 @@ class ha_rocksdb : public my_core::handler {
static bool check_bloom_and_set_bounds(
THD *thd, const Rdb_key_def &kd, const rocksdb::Slice &eq_cond,
- const bool use_all_keys, size_t bound_len, uchar *const lower_bound,
- uchar *const upper_bound, rocksdb::Slice *lower_bound_slice,
- rocksdb::Slice *upper_bound_slice);
+ size_t bound_len, uchar *const lower_bound, uchar *const upper_bound,
+ rocksdb::Slice *lower_bound_slice, rocksdb::Slice *upper_bound_slice);
private:
// true <=> The scan uses the default MRR implementation, just redirect all
diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc
index b01100b41e6..1788486ee9f 100644
--- a/storage/rocksdb/nosql_access.cc
+++ b/storage/rocksdb/nosql_access.cc
@@ -1506,9 +1506,8 @@ bool INLINE_ATTR select_exec::setup_iterator(txn_wrapper *txn,
m_lower_bound_buf.reserve(bound_len);
m_upper_bound_buf.reserve(bound_len);
bool use_bloom = ha_rocksdb::check_bloom_and_set_bounds(
- m_thd, *m_key_def, eq_slice, m_use_full_key, bound_len,
- m_lower_bound_buf.data(), m_upper_bound_buf.data(), &m_lower_bound_slice,
- &m_upper_bound_slice);
+ m_thd, *m_key_def, eq_slice, bound_len, m_lower_bound_buf.data(),
+ m_upper_bound_buf.data(), &m_lower_bound_slice, &m_upper_bound_slice);
rocksdb::Iterator *it = txn->get_iterator(
m_key_def->get_cf(), use_bloom, m_lower_bound_slice, m_upper_bound_slice);
if (it == nullptr) {
1
0
revision-id: d89e160443823d0f07691e99067e15b06d532ccc (percona-202102-50-gd89e1604438)
parent(s): 25ecd858d747918007eb17d9f46199592ccf664f
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:30:04 +0300
message:
Reset iterator after point lookup
Summary: Although I did not find an example of this happening today, it seems like the SQL layer can "index_next" after a point lookup. However, since we don't position our iterator for point lookups, we would be potentially returning incorrect results (eg. if we had a previous valid iterator from before the point lookup). It seems safer to just reset the iterator after a point lookup.
Test Plan: mtr
Reviewers: luqun, herman, yzha
Subscribers: pgl, vinaybhat
Differential Revision: https://phabricator.intern.facebook.com/D23399585
---
storage/rocksdb/ha_rocksdb.cc | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 3f50f0b6380..91791b337af 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -9078,6 +9078,15 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
stats.rows_index_first++; */
update_row_stats(ROWS_READ);
}
+ /*
+ If the SQL layer calls index_read_map, it expects the iterator to be
+ positioned accordingly, so that next/prev can work as expected. In
+ this case, we calling DB::Get directly without positioning an
+ iterator, so it is incorrect for the SQL layer to be calling
+ next/prev anyway. To avoid correctness issues, just free the
+ iterator.
+ */
+ release_scan_iterator();
DBUG_RETURN(rc);
} else {
/*
@@ -9622,6 +9631,12 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
break;
}
+ DBUG_ASSERT(m_scan_it);
+ if (m_scan_it == nullptr) {
+ rc = HA_ERR_INTERNAL_ERROR;
+ break;
+ }
+
if (skip_next) {
skip_next = false;
} else {
@@ -9632,15 +9647,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
}
}
- if (!m_scan_it || !is_valid_iterator(m_scan_it)) {
- /*
- We can get here when SQL layer has called
-
- h->index_init(PRIMARY);
- h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
-
- In this case, we should return EOF.
- */
+ if (!is_valid_iterator(m_scan_it)) {
rc = HA_ERR_END_OF_FILE;
break;
}
1
0
[Commits] 25ecd858d74: Apply patch: Combine rnd_next/index_next/position_to_correct_key
by psergey 17 May '21
by psergey 17 May '21
17 May '21
revision-id: 25ecd858d747918007eb17d9f46199592ccf664f (percona-202102-49-g25ecd858d74)
parent(s): b9b6ca04c17cde7d866e1dd21ad926cefadcecee
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:29:28 +0300
message:
Apply patch: Combine rnd_next/index_next/position_to_correct_key
Summary:
This combines the iteration codepaths so that we consolidate the places where we perform the snapshot refresh loop/bloomfilter calculations/ttl/icp into a single location.
Roughly, I'm doing the following:
- `rnd_next`/`index_next` as well as `index_read_map` are now all calling into `index_next_with_direction_intern`, which will perform the required iteration.
- In `index_read_map` filtering logic was split between `position_to_correct_key` and `read_row_from_secondary_key`. The first function did some TTL checks, and the second function did ICP. Instead `position_to_correct_key` is now solely responsible for seeking to the correct key without regard for ttl (or any other filtering) logic. With better rocksdb primitives (ie. prefix seek), we wouldn't even need these functions. All filtering logic is now consolidated into `index_next_with_direction_intern`.
- There are checks to see if the current key is in the current index id, as well as checks against `m_sk_match_prefix_buf` to see if the current key is within the current check. Conceptually, these two checks are doing the same thing, and can be merged together. This probably saves some memcmps.
Test Plan: mtr
Reviewers: luqun, herman, yzha
Subscribers: pgl, vinaybhat
Differential Revision: https://phabricator.intern.facebook.com/D23399596
---
mysql-test/suite/rocksdb/r/check_flags.result | 10 +-
.../rocksdb/r/issue243_transactionStatus.result | 2 +-
mysql-test/suite/rocksdb/t/check_flags.test | 10 +-
storage/rocksdb/ha_rocksdb.cc | 730 ++++++---------------
storage/rocksdb/ha_rocksdb.h | 45 +-
5 files changed, 216 insertions(+), 581 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/check_flags.result b/mysql-test/suite/rocksdb/r/check_flags.result
index 46ffdcfe6a4..8ff4153707e 100644
--- a/mysql-test/suite/rocksdb/r/check_flags.result
+++ b/mysql-test/suite/rocksdb/r/check_flags.result
@@ -6,35 +6,35 @@ CREATE TABLE t3 (id INT, kp1 INT, PRIMARY KEY (id), KEY(kp1)) ENGINE=ROCKSDB COM
INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4), (5,5);
INSERT INTO t2 SELECT * FROM t1;
INSERT INTO t3 SELECT * FROM t1;
-set debug_sync='rocksdb.check_flags_rmi SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_iri SIGNAL parked WAIT_FOR go';
SELECT value FROM t1 WHERE value = 3;
set debug_sync='now WAIT_FOR parked';
KILL QUERY $conn1_id;
set debug_sync='now SIGNAL go';
ERROR 70100: Query execution was interrupted
set debug_sync='RESET';
-set debug_sync='rocksdb.check_flags_rmi_scan SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_iri_scan SIGNAL parked WAIT_FOR go';
SELECT DISTINCT(id) FROM t1 WHERE value = 5 AND id IN (1, 3, 5);
set debug_sync='now WAIT_FOR parked';
KILL QUERY $conn1_id;
set debug_sync='now SIGNAL go';
ERROR 70100: Query execution was interrupted
set debug_sync='RESET';
-set debug_sync='rocksdb.check_flags_inwd SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
SELECT value FROM t1 WHERE value > 3;
set debug_sync='now WAIT_FOR parked';
KILL QUERY $conn1_id;
set debug_sync='now SIGNAL go';
ERROR 70100: Query execution was interrupted
set debug_sync='RESET';
-set debug_sync='rocksdb.check_flags_rnwd SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
SELECT id FROM t2;
set debug_sync='now WAIT_FOR parked';
KILL QUERY $conn1_id;
set debug_sync='now SIGNAL go';
ERROR 70100: Query execution was interrupted
set debug_sync='RESET';
-set debug_sync='rocksdb.check_flags_rke SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
SELECT kp1 FROM t3 ORDER BY kp1;
set debug_sync='now WAIT_FOR parked';
KILL QUERY $conn1_id;
diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus.result
index 2dce49f9e39..d640835ff40 100644
--- a/mysql-test/suite/rocksdb/r/issue243_transactionStatus.result
+++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus.result
@@ -149,7 +149,7 @@ LIST OF SNAPSHOTS FOR EACH SESSION:
---SNAPSHOT, ACTIVE NUM sec
MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION
SHOW ENGINE rocksdb TRANSACTION STATUS
-lock count 5, write count 7
+lock count 4, write count 7
insert count 2, update count 1, delete count 1
----------LATEST DETECTED DEADLOCKS----------
-----------------------------------------
diff --git a/mysql-test/suite/rocksdb/t/check_flags.test b/mysql-test/suite/rocksdb/t/check_flags.test
index ab02712e585..58dc1f4f8da 100644
--- a/mysql-test/suite/rocksdb/t/check_flags.test
+++ b/mysql-test/suite/rocksdb/t/check_flags.test
@@ -18,7 +18,7 @@ INSERT INTO t2 SELECT * FROM t1;
INSERT INTO t3 SELECT * FROM t1;
connection conn1;
-set debug_sync='rocksdb.check_flags_rmi SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_iri SIGNAL parked WAIT_FOR go';
send SELECT value FROM t1 WHERE value = 3;
connection default;
@@ -36,7 +36,7 @@ connection conn1;
set debug_sync='RESET';
connection conn1;
-set debug_sync='rocksdb.check_flags_rmi_scan SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_iri_scan SIGNAL parked WAIT_FOR go';
send SELECT DISTINCT(id) FROM t1 WHERE value = 5 AND id IN (1, 3, 5);
connection default;
@@ -54,7 +54,7 @@ connection conn1;
set debug_sync='RESET';
connection conn1;
-set debug_sync='rocksdb.check_flags_inwd SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
send SELECT value FROM t1 WHERE value > 3;
connection default;
@@ -72,7 +72,7 @@ connection conn1;
set debug_sync='RESET';
connection conn1;
-set debug_sync='rocksdb.check_flags_rnwd SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
send SELECT id FROM t2;
connection default;
@@ -91,7 +91,7 @@ set debug_sync='RESET';
connection conn1;
-set debug_sync='rocksdb.check_flags_rke SIGNAL parked WAIT_FOR go';
+set debug_sync='rocksdb.check_flags_inwdi SIGNAL parked WAIT_FOR go';
send SELECT kp1 FROM t3 ORDER BY kp1;
connection default;
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index b7f2cd29acb..3f50f0b6380 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -7138,7 +7138,6 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
m_sk_packed_tuple(nullptr),
m_end_key_packed_tuple(nullptr),
m_sk_match_prefix(nullptr),
- m_sk_match_prefix_buf(nullptr),
m_sk_packed_tuple_old(nullptr),
m_dup_sk_packed_tuple(nullptr),
m_dup_sk_packed_tuple_old(nullptr),
@@ -7273,24 +7272,6 @@ bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
return is_hide_ttl;
}
-int ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
- rocksdb::Iterator *const iter,
- bool seek_backward) {
- if (kd.has_ttl()) {
- THD *thd = ha_thd();
- while (iter->Valid() &&
- should_hide_ttl_rec(
- kd, iter->value(),
- get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
- if (thd && thd->killed) {
- return HA_ERR_QUERY_INTERRUPTED;
- }
- rocksdb_smart_next(seek_backward, iter);
- }
- }
- return HA_EXIT_SUCCESS;
-}
-
#ifndef DBUG_OFF
void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
std::string str(on_disk_rec->data(), on_disk_rec->size());
@@ -7400,7 +7381,7 @@ int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
m_sk_packed_tuple = reinterpret_cast<uchar *>(
my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
- m_sk_match_prefix_buf = reinterpret_cast<uchar *>(
+ m_sk_match_prefix = reinterpret_cast<uchar *>(
my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
m_sk_packed_tuple_old = reinterpret_cast<uchar *>(
my_malloc(PSI_NOT_INSTRUMENTED, max_packed_sk_len, MYF(0)));
@@ -7447,8 +7428,8 @@ void ha_rocksdb::free_key_buffers() {
my_free(m_sk_packed_tuple);
m_sk_packed_tuple = nullptr;
- my_free(m_sk_match_prefix_buf);
- m_sk_match_prefix_buf = nullptr;
+ my_free(m_sk_match_prefix);
+ m_sk_match_prefix = nullptr;
my_free(m_sk_packed_tuple_old);
m_sk_packed_tuple_old = nullptr;
@@ -8770,7 +8751,6 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
- DEBUG_SYNC(thd, "rocksdb.check_flags_rke");
if (thd && thd->killed) {
return HA_ERR_QUERY_INTERRUPTED;
}
@@ -8796,12 +8776,18 @@ int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
const bool full_key_match,
- const rocksdb::Slice &key_slice,
- const int64_t ttl_filter_ts) {
+ const rocksdb::Slice &key_slice) {
THD *thd = ha_thd();
/*
- We are looking for record with the biggest t.key such that
- t.key < lookup_tuple.
+ We are looking for the first record such that
+
+ index_tuple $LT lookup_tuple
+
+ with HA_READ_BEFORE_KEY, $LT = '<',
+ with HA_READ_PREFIX_LAST_OR_PREV, $LT = '<='
+ with HA_READ_PREFIX_LAST, $LT = '=='
+
+ Symmetry with read_after_key is possible if rocksdb supported prefix seeks.
*/
rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
@@ -8810,16 +8796,10 @@ int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
return HA_ERR_QUERY_INTERRUPTED;
}
/*
- We are using full key and we've hit an exact match, or...
-
- If TTL is enabled we need to check if the given key has already expired
- from the POV of the current transaction. If it has, try going to the next
- key.
+ We are using full key and we've hit an exact match.
*/
if ((full_key_match &&
- kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
- (kd.has_ttl() &&
- should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
+ kd.value_matches_prefix(m_scan_it->key(), key_slice))) {
rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
continue;
}
@@ -8831,9 +8811,7 @@ int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
}
int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
- const rocksdb::Slice &key_slice,
- const int64_t ttl_filter_ts) {
- THD *thd = ha_thd();
+ const rocksdb::Slice &key_slice) {
/*
We are looking for the first record such that
@@ -8841,22 +8819,10 @@ int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
with HA_READ_AFTER_KEY, $GT = '>',
with HA_READ_KEY_OR_NEXT, $GT = '>='
+ with HA_READ_KEY_EXACT, $GT = '=='
*/
rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
- /*
- If TTL is enabled we need to check if the given key has already expired
- from the POV of the current transaction. If it has, try going to the next
- key.
- */
- while (is_valid_iterator(m_scan_it) && kd.has_ttl() &&
- should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
- if (thd && thd->killed) {
- return HA_ERR_QUERY_INTERRUPTED;
- }
- rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
- }
-
return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
}
@@ -8864,68 +8830,29 @@ int ha_rocksdb::position_to_correct_key(const Rdb_key_def &kd,
const enum ha_rkey_function &find_flag,
const bool full_key_match,
const rocksdb::Slice &key_slice,
- bool *const move_forward,
- const int64_t ttl_filter_ts) {
+ bool *const move_forward) {
int rc = 0;
*move_forward = true;
switch (find_flag) {
case HA_READ_KEY_EXACT:
- rc = read_key_exact(kd, m_scan_it, key_slice, ttl_filter_ts);
+ case HA_READ_AFTER_KEY:
+ case HA_READ_KEY_OR_NEXT:
+ rc = read_after_key(kd, key_slice);
break;
case HA_READ_BEFORE_KEY:
+ case HA_READ_PREFIX_LAST:
+ case HA_READ_PREFIX_LAST_OR_PREV:
*move_forward = false;
- rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
- if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
- /* The record we've got is not from this index */
- rc = HA_ERR_KEY_NOT_FOUND;
- }
- break;
- case HA_READ_AFTER_KEY:
- case HA_READ_KEY_OR_NEXT:
- rc = read_after_key(kd, key_slice, ttl_filter_ts);
- if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
- /* The record we've got is not from this index */
- rc = HA_ERR_KEY_NOT_FOUND;
- }
+ rc = read_before_key(kd, full_key_match, key_slice);
break;
case HA_READ_KEY_OR_PREV:
case HA_READ_PREFIX:
- /* This flag is not used by the SQL layer, so we don't support it yet. */
+ /* These flags are not used by the SQL layer, so we don't support them
+ * yet. */
rc = HA_ERR_UNSUPPORTED;
break;
- case HA_READ_PREFIX_LAST:
- case HA_READ_PREFIX_LAST_OR_PREV:
- *move_forward = false;
- /*
- Find the last record with the specified index prefix lookup.
- - HA_READ_PREFIX_LAST requires that the record has the
- prefix=lookup (if there are no such records,
- HA_ERR_KEY_NOT_FOUND should be returned).
- - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
- records with prefix=lookup, we should return the last record
- before that.
- */
- rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
- if (rc == 0) {
- const rocksdb::Slice &rkey = m_scan_it->key();
- if (!kd.covers_key(rkey)) {
- /* The record we've got is not from this index */
- rc = HA_ERR_KEY_NOT_FOUND;
- } else if (find_flag == HA_READ_PREFIX_LAST) {
- rocksdb::Slice lookup_tuple(
- reinterpret_cast<char *>(m_sk_match_prefix_buf),
- m_sk_match_length);
-
- // We need to compare the key we've got with the original search
- // prefix.
- if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
- rc = HA_ERR_KEY_NOT_FOUND;
- }
- }
- }
- break;
default:
DBUG_ASSERT(0);
break;
@@ -8979,83 +8906,6 @@ int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
return Rdb_key_def::INDEX_ID_SIZE;
}
-int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
- int rc;
- const rocksdb::Slice &rkey = m_scan_it->key();
- const uint pk_size = rkey.size();
- const char *pk_data = rkey.data();
-
- memcpy(m_pk_packed_tuple, pk_data, pk_size);
- m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
-
- if (m_lock_rows != RDB_LOCK_NONE) {
- DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
- /* We need to put a lock and re-read */
- rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
- } else {
- /* Unpack from the row we've read */
- const rocksdb::Slice &value = m_scan_it->value();
- rc = convert_record_from_storage_format(&rkey, &value, buf);
- }
-
- return rc;
-}
-
-int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
- const Rdb_key_def &kd,
- bool move_forward) {
- int rc = 0;
- uint pk_size = 0;
-
- // Due to MRR, now an index-only scan have pushed index condition.
- // (If it does, we follow non-index only code path here, except that
- // we don't fetch the row).
- bool have_icp = (pushed_idx_cond && pushed_idx_cond_keyno == active_index);
- if (have_icp) {
- if (kd.m_is_reverse_cf) move_forward = !move_forward;
- rc = find_icp_matching_index_rec(move_forward, buf);
- if (rc) return (rc);
- }
-
- /* Get the key columns and primary key value */
- const rocksdb::Slice &rkey = m_scan_it->key();
- const rocksdb::Slice &value = m_scan_it->value();
-
-#ifndef DBUG_OFF
- bool save_keyread_only = m_keyread_only;
-#endif
- DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });
-
- bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
- kd.covers_lookup(&value, m_converter->get_lookup_bitmap());
-
-#ifndef DBUG_OFF
- m_keyread_only = save_keyread_only;
-#endif
-
- pk_size =
- kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
- if (pk_size == RDB_INVALID_KEY_LEN) {
- rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
- } else {
- if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
- rc = kd.unpack_record(table, buf, &rkey, &value,
- m_converter->get_verify_row_debug_checksums());
- inc_covered_sk_lookup();
- } else {
- DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
- rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
- }
- }
-
- if (!rc) {
- m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
- &my_charset_bin);
- }
-
- return rc;
-}
-
/**
@note
The problem with this function is that SQL layer calls it, when
@@ -9118,7 +8968,7 @@ ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
/**
@brief
- Read next index tuple through the secondary index.
+ Read from primary key if secondary key is not covering.
@details
m_scan_it points at the index key-value pair that we should read the (pk,row)
@@ -9127,61 +8977,34 @@ ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
DBUG_ASSERT(table != nullptr);
- /* TODO(yzha) - rows_requested is gone in 8.0
- stats.rows_requested++; */
-
- /* Use STATUS_NOT_FOUND when record not found or some error occurred */
- table->m_status = STATUS_NOT_FOUND;
-
- if (is_valid_iterator(m_scan_it)) {
- rocksdb::Slice key = m_scan_it->key();
-
- /* Check if we've ran out of records of this index */
- if (m_key_descr_arr[keyno]->covers_key(key)) {
- if (m_iteration_only) {
- table->m_status = 0;
- return 0;
- }
-
- int rc = 0;
-
- // TODO: We could here check if we have ran out of range we're scanning
- const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
- table, *m_pk_descr, &key, m_pk_packed_tuple);
- if (size == RDB_INVALID_KEY_LEN) {
- return HA_ERR_ROCKSDB_CORRUPT_DATA;
- }
+ if (m_iteration_only) {
+ table->m_status = 0;
+ return 0;
+ }
- m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
- &my_charset_bin);
+ int rc = 0;
+ rocksdb::Slice value = m_scan_it->value();
- rocksdb::Slice value = m_scan_it->value();
- bool covered_lookup =
- (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
+#ifndef DBUG_OFF
+ bool save_keyread_only = m_keyread_only;
+ DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });
+#endif
+ bool covered_lookup =
+ (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
m_key_descr_arr[keyno]->covers_lookup(
&value, m_converter->get_lookup_bitmap());
- if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
- rc = m_key_descr_arr[keyno]->unpack_record(
- table, buf, &key, &value,
- m_converter->get_verify_row_debug_checksums());
- inc_covered_sk_lookup();
- } else {
- DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
- rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
- }
+#ifndef DBUG_OFF
+ m_keyread_only = save_keyread_only;
+#endif
- if (!rc) {
- table->m_status = 0;
- /* TODO(yzha) - rows stats are gone in 8.0
- stats.rows_read++;
- stats.rows_index_next++;
- */
- update_row_stats(ROWS_READ);
- }
- return rc;
- }
+ if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
+ inc_covered_sk_lookup();
+ } else {
+ DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete_sk");
+ rc = get_row_by_rowid(buf, m_last_rowkey.ptr(), m_last_rowkey.length());
}
- return HA_ERR_END_OF_FILE;
+
+ return rc;
}
/*
@@ -9211,9 +9034,10 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
while (1) { debug_i++; });
int rc = 0;
+ table->m_status = STATUS_NOT_FOUND;
THD *thd = ha_thd();
- DEBUG_SYNC(thd, "rocksdb.check_flags_rmi");
+ DEBUG_SYNC(thd, "rocksdb.check_flags_iri");
if (thd && thd->killed) {
rc = HA_ERR_QUERY_INTERRUPTED;
DBUG_RETURN(rc);
@@ -9222,15 +9046,12 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
const Rdb_key_def &kd = *m_key_descr_arr[active_index_pos()];
bool using_full_key = false;
- /* By default, we don't need the retrieved records to match the prefix */
- m_sk_match_prefix = nullptr;
- /* TODO(yzha) - row stats are gone in 8.0
- stats.rows_requested++; */
-
uint packed_size;
if (!key) {
// If no key is passed in, then we are doing a full index scan.
+ //
+ // Just use current index id as the search key.
kd.get_infimum_key(m_sk_packed_tuple, &packed_size);
} else {
const uint actual_key_parts = kd.get_key_parts();
@@ -9247,6 +9068,8 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
const uint size = kd.pack_index_tuple(
table, m_pack_buffer, m_pk_packed_tuple, key, keypart_map);
bool skip_lookup = is_blind_delete_enabled();
+ /* TODO(yzha) - row stats are gone in 8.0
+ stats.rows_requested++; */
rc = get_row_by_rowid(buf, m_pk_packed_tuple, size, skip_lookup, false);
if (!rc && !skip_lookup) {
@@ -9300,33 +9123,10 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
}
if (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST) {
- bool have_icp = pushed_idx_cond && pushed_idx_cond_keyno == active_index;
- if (HA_READ_PREFIX_LAST || have_icp) {
- /*
- Save a copy of m_sk_packed_tuple for prefix matching,
-
- This is used in position_to_correct_key for the HA_READ_PREFIX_LAST
- flag.
- */
- m_sk_match_length = packed_size;
- memcpy(m_sk_match_prefix_buf, m_sk_packed_tuple, packed_size);
-
- /*
- We are doing a point index lookup, and ICP is enabled. It is possible
- that this call will be followed by ha_rocksdb->index_next_same() call.
-
- Do what InnoDB does: save the lookup tuple now. We will need it in
- index_next_same/find_icp_matching_index_rec in order to stop scanning
- as soon as index record doesn't match the lookup tuple.
-
- When not using ICP, handler::index_next_same() will make sure that rows
- that don't match the lookup prefix are not returned.
- row matches the lookup prefix.
- */
- if (have_icp) {
- m_sk_match_prefix = m_sk_match_prefix_buf;
- }
- }
+ m_sk_match_length = packed_size;
+ memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
+ } else {
+ kd.get_infimum_key(m_sk_match_prefix, &m_sk_match_length);
}
int bytes_changed_by_succ = 0;
@@ -9347,7 +9147,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
// Loop as long as we get a deadlock error AND we end up creating the
// snapshot here (i.e. it did not exist prior to this)
for (;;) {
- DEBUG_SYNC(thd, "rocksdb.check_flags_rmi_scan");
+ DEBUG_SYNC(thd, "rocksdb.check_flags_iri_scan");
if (thd && thd->killed) {
rc = HA_ERR_QUERY_INTERRUPTED;
break;
@@ -9366,7 +9166,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
*/
bool move_forward;
rc = position_to_correct_key(kd, find_flag, using_full_key, slice,
- &move_forward, tx->m_snapshot_timestamp);
+ &move_forward);
if (rc) {
break;
@@ -9377,11 +9177,7 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
then we have all the rows we need. For a secondary key we now need to
lookup the primary key.
*/
- if (active_index == table->s->primary_key) {
- rc = read_row_from_primary_key(buf);
- } else {
- rc = read_row_from_secondary_key(buf, kd, move_forward);
- }
+ rc = index_next_with_direction_intern(buf, move_forward, true);
if (!should_recreate_snapshot(rc, is_new_snapshot)) {
break; /* Exit the loop */
@@ -9392,18 +9188,10 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
release_scan_iterator();
}
- if (rc) {
- /*
- This status is returned on any error
- the only possible error condition is record-not-found
- */
- table->m_status = STATUS_NOT_FOUND;
- } else {
- table->m_status = 0;
+ if (!rc) {
/* TODO(yzha) - row stats are gone in 8.0
- stats.rows_read++;
- stats.rows_index_first++; */
- update_row_stats(ROWS_READ);
+ stats.rows_index_first++;
+ stats.rows_index_next--; */
}
DBUG_RETURN(rc);
@@ -9433,88 +9221,6 @@ int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
DBUG_RETURN(index_read_intern(buf, key, keypart_map, find_flag));
}
-/*
- @brief
- Scan the secondary index until we find an index record that satisfies ICP
-
- @param move_forward true <=> move m_scan_it forward
- false <=> move m_scan_it backward
- @param buf Record buffer (must be the same buffer that
- pushed index condition points to, in practice
- it is table->record[0])
-
- @detail
- Move the current iterator m_scan_it until we get an index tuple that
- satisfies the pushed Index Condition.
- (if there is no pushed index condition, return right away)
-
- @return
- 0 - Index tuple satisfies ICP, can do index read.
- other - error code
-*/
-
-int ha_rocksdb::find_icp_matching_index_rec(const bool move_forward,
- uchar *const buf) {
- if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
- const Rdb_key_def &kd = *m_key_descr_arr[active_index_pos()];
- THD *thd = ha_thd();
-
- while (1) {
- int rc = rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);
- if (rc != HA_EXIT_SUCCESS) {
- return rc;
- }
-
- if (thd && thd->killed) {
- return HA_ERR_QUERY_INTERRUPTED;
- }
-
- if (!is_valid_iterator(m_scan_it)) {
- table->m_status = STATUS_NOT_FOUND;
- return HA_ERR_END_OF_FILE;
- }
- const rocksdb::Slice rkey = m_scan_it->key();
-
- if (!kd.covers_key(rkey)) {
- table->m_status = STATUS_NOT_FOUND;
- return HA_ERR_END_OF_FILE;
- }
-
- if (m_sk_match_prefix) {
- const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
- m_sk_match_length);
- if (!kd.value_matches_prefix(rkey, prefix)) {
- table->m_status = STATUS_NOT_FOUND;
- return HA_ERR_END_OF_FILE;
- }
- }
-
- const rocksdb::Slice value = m_scan_it->value();
- int err = kd.unpack_record(table, buf, &rkey, &value,
- m_converter->get_verify_row_debug_checksums());
- if (err != HA_EXIT_SUCCESS) {
- return err;
- }
-
- const enum icp_result icp_status = check_index_cond();
- if (icp_status == ICP_NO_MATCH) {
- rocksdb_smart_next(!move_forward, m_scan_it);
- continue; /* Get the next (or prev) index tuple */
- } else if (icp_status == ICP_OUT_OF_RANGE) {
- /* We have walked out of range we are scanning */
- table->m_status = STATUS_NOT_FOUND;
- return HA_ERR_END_OF_FILE;
- } else /* icp_status == ICP_MATCH */
- {
- /* Index Condition is satisfied. We have rc==0, proceed to fetch the
- * row. */
- break;
- }
- }
- }
- return HA_EXIT_SUCCESS;
-}
-
/**
@return
HA_EXIT_SUCCESS OK
@@ -9878,7 +9584,7 @@ int ha_rocksdb::index_next(uchar *const buf) {
DBUG_ENTER_FUNC();
check_build_decoder();
ha_statistic_increment(&System_status_var::ha_read_next_count);
- DBUG_RETURN(index_next_intern(buf));
+ DBUG_RETURN(index_next_with_direction_intern(buf, true, false));
}
/**
@@ -9890,39 +9596,131 @@ int ha_rocksdb::index_prev(uchar *const buf) {
DBUG_ENTER_FUNC();
check_build_decoder();
ha_statistic_increment(&System_status_var::ha_read_prev_count);
- DBUG_RETURN(index_prev_intern(buf));
+ DBUG_RETURN(index_next_with_direction_intern(buf, false, false));
}
-int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
+int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
+ bool move_forward,
+ bool skip_next) {
DBUG_ENTER_FUNC();
- int rc;
+ THD *thd = ha_thd();
+ int rc = 0;
+ const Rdb_key_def &kd = *m_key_descr_arr[active_index_pos()];
+ Rdb_transaction *const tx = get_or_create_tx(thd);
+ rocksdb::Slice prefix_tuple(reinterpret_cast<char *>(m_sk_match_prefix),
+ m_sk_match_length);
- if (active_index == table->s->primary_key) {
- rc = rnd_next_with_direction(buf, move_forward);
- } else {
- THD *thd = ha_thd();
- for (;;) {
- DEBUG_SYNC(thd, "rocksdb.check_flags_inwd");
- if (thd && thd->killed) {
- rc = HA_ERR_QUERY_INTERRUPTED;
- break;
- }
+ table->m_status = STATUS_NOT_FOUND;
+ /* TODO(yzha) - row stats are gone in 8.0
+ stats.rows_requested++; */
+
+ for (;;) {
+ DEBUG_SYNC(thd, "rocksdb.check_flags_inwdi");
+ if (thd && thd->killed) {
+ rc = HA_ERR_QUERY_INTERRUPTED;
+ break;
+ }
+
+ if (skip_next) {
+ skip_next = false;
+ } else {
if (move_forward) {
- m_scan_it->Next(); /* this call cannot fail */
+ rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
} else {
- m_scan_it->Prev();
+ rocksdb_smart_prev(kd.m_is_reverse_cf, m_scan_it);
}
- rc = rocksdb_skip_expired_records(*m_key_descr_arr[active_index_pos()],
- m_scan_it, !move_forward);
+ }
+
+ if (!m_scan_it || !is_valid_iterator(m_scan_it)) {
+ /*
+ We can get here when SQL layer has called
+
+ h->index_init(PRIMARY);
+ h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
+
+ In this case, we should return EOF.
+ */
+ rc = HA_ERR_END_OF_FILE;
+ break;
+ }
+
+ const rocksdb::Slice &key = m_scan_it->key();
+ const rocksdb::Slice &value = m_scan_it->value();
+
+ // Outside our range, return EOF.
+ if (!kd.value_matches_prefix(key, prefix_tuple)) {
+ rc = HA_ERR_END_OF_FILE;
+ break;
+ }
+
+ // Record is not visible due to TTL, move to next record.
+ if (m_pk_descr->has_ttl() &&
+ should_hide_ttl_rec(kd, value, tx->m_snapshot_timestamp)) {
+ continue;
+ }
+
+ if (active_index == table->s->primary_key) {
+ if (m_lock_rows != RDB_LOCK_NONE) {
+ DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
+ /* We need to put a lock and re-read */
+ rc = get_row_by_rowid(buf, key.data(), key.size());
+ } else {
+ /* Unpack from the row we've read */
+ m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
+ rc = convert_record_from_storage_format(&key, &value, buf);
+ }
+ } else {
+ rc = kd.unpack_record(table, buf, &key, &value,
+ m_converter->get_verify_row_debug_checksums());
if (rc != HA_EXIT_SUCCESS) {
break;
}
- rc = find_icp_matching_index_rec(move_forward, buf);
- if (!rc) rc = secondary_index_read(active_index, buf);
- if (!should_skip_invalidated_record(rc)) {
+
+ // Record did not satisfy ICP, move to next record
+ if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
+ const enum icp_result icp_status = check_index_cond();
+ if (icp_status == ICP_NO_MATCH) {
+ continue;
+ } else if (icp_status == ICP_OUT_OF_RANGE) {
+ rc = HA_ERR_END_OF_FILE;
+ break;
+ }
+ DBUG_ASSERT(icp_status == ICP_MATCH);
+ }
+
+ const uint size =
+ kd.get_primary_key_tuple(table, *m_pk_descr, &key, m_pk_packed_tuple);
+ if (size == RDB_INVALID_KEY_LEN) {
+ rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
break;
}
+
+ m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
+ &my_charset_bin);
+
+ rc = secondary_index_read(active_index, buf);
+ }
+
+ if (!should_skip_invalidated_record(rc)) {
+ break;
+ }
+ }
+
+ if (!rc) {
+ /* TODO(yzha) - row stats are gone in 8.0
+ stats.rows_read++;
+ stats.rows_index_next++; */
+ update_row_stats(ROWS_READ);
+ table->m_status = 0;
+ }
+
+ // skip_next is false when called from functions that are trying to iterate
+ // through keys such as index_next/rnd_next/etc. and these functions
+ // typically expect HA_ERR_END_OF_FILE if no next key is found.
+ if (!skip_next) {
+ if (rc == HA_ERR_KEY_NOT_FOUND) {
+ rc = HA_ERR_END_OF_FILE;
}
}
@@ -9940,10 +9738,7 @@ int ha_rocksdb::index_first(uchar *const buf) {
check_build_decoder();
ha_statistic_increment(&System_status_var::ha_read_first_count);
- int rc = index_read_intern(buf, true /* first */);
- if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
-
- DBUG_RETURN(rc);
+ DBUG_RETURN(index_read_intern(buf, true /* first */));
}
/**
@@ -9957,10 +9752,7 @@ int ha_rocksdb::index_last(uchar *const buf) {
check_build_decoder();
ha_statistic_increment(&System_status_var::ha_read_last_count);
- int rc = index_read_intern(buf, false /* first */);
- if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
-
- DBUG_RETURN(rc);
+ DBUG_RETURN(index_read_intern(buf, false /* first */));
}
/*
@@ -10027,8 +9819,12 @@ int ha_rocksdb::index_last(uchar *const buf) {
*/
int ha_rocksdb::index_read_intern(uchar *const buf, bool first) {
DBUG_ENTER_FUNC();
- DBUG_RETURN(index_read_intern(
- buf, nullptr, 0, first ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST));
+ int rc = index_read_intern(buf, nullptr, 0,
+ first ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST);
+
+ if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
+
+ DBUG_RETURN(rc);
}
void ha_rocksdb::unlock_row() {
@@ -10298,7 +10094,7 @@ void ha_rocksdb::dec_table_n_rows() {
Constructing m_last_rowkey (MyRocks key expression) from
before_update|delete image (MySQL row expression).
m_last_rowkey is normally set during lookup phase, such as
- rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
+ rnd_next() and rnd_pos(). With Read Free Replication,
these read functions are skipped and update_rows(), delete_rows() are
called without setting m_last_rowkey. This function sets m_last_rowkey
for Read Free Replication.
@@ -11296,129 +11092,11 @@ int ha_rocksdb::rnd_next(uchar *const buf) {
m_rnd_scan_started = true;
} else {
if (is_reverse_cf) {
- rc = index_prev_intern(buf);
+ rc = index_next_with_direction_intern(buf, false, false);
} else {
- rc = index_next_intern(buf);
+ rc = index_next_with_direction_intern(buf, true, false);
}
}
- if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
-
- DBUG_RETURN(rc);
-}
-
-/*
- See also secondary_index_read().
-*/
-int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
- DBUG_ENTER_FUNC();
-
- int rc = 0;
- THD *thd = ha_thd();
-
- table->m_status = STATUS_NOT_FOUND;
- /* TODO(yzha) - row stats are gone in 8.0
- stats.rows_requested++; */
-
- if (!m_scan_it || !is_valid_iterator(m_scan_it)) {
- /*
- We can get here when SQL layer has called
-
- h->index_init(PRIMARY);
- h->index_read_map(full index tuple, HA_READ_KEY_EXACT);
-
- In this case, we should return EOF.
- */
- DBUG_RETURN(HA_ERR_END_OF_FILE);
- }
-
- for (;;) {
- DEBUG_SYNC(thd, "rocksdb.check_flags_rnwd");
- if (thd && thd->killed) {
- rc = HA_ERR_QUERY_INTERRUPTED;
- break;
- }
-
- if (move_forward) {
- m_scan_it->Next(); /* this call cannot fail */
- } else {
- m_scan_it->Prev(); /* this call cannot fail */
- }
-
- if (!is_valid_iterator(m_scan_it)) {
- rc = HA_ERR_END_OF_FILE;
- break;
- }
-
- /* check if we're out of this table */
- const rocksdb::Slice key = m_scan_it->key();
- if (!m_pk_descr->covers_key(key)) {
- rc = HA_ERR_END_OF_FILE;
- break;
- }
-
- if (m_iteration_only) {
- table->m_status = 0;
- break;
- }
-
- if (m_lock_rows != RDB_LOCK_NONE) {
- /*
- Lock the row we've just read.
-
- Now we call get_for_update which will 1) Take a lock and 2) Will fail
- if the row was deleted since the snapshot was taken.
- */
- Rdb_transaction *const tx = get_or_create_tx(table->in_use);
- DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
-
- if (m_pk_descr->has_ttl() &&
- should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
- tx->m_snapshot_timestamp)) {
- continue;
- }
-
- const rocksdb::Status s =
- get_for_update(tx, *m_pk_descr, key, &m_retrieved_record);
- if (s.IsNotFound() &&
- should_skip_invalidated_record(HA_ERR_KEY_NOT_FOUND)) {
- continue;
- }
-
- if (!s.ok()) {
- DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
- m_tbl_def, m_table_handler));
- }
-
- // If we called get_for_update() use the value from that call not from
- // the iterator as it may be stale since we don't have a snapshot
- // when m_lock_rows is not RDB_LOCK_NONE.
- m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
- rc = convert_record_from_storage_format(&key, buf);
- } else {
- // Use the value from the iterator
- rocksdb::Slice value = m_scan_it->value();
-
- if (m_pk_descr->has_ttl() &&
- should_hide_ttl_rec(
- *m_pk_descr, value,
- get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
- continue;
- }
-
- m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
- rc = convert_record_from_storage_format(&key, &value, buf);
- }
-
- table->m_status = 0;
- break;
- }
-
- if (!rc) {
- /* TODO(yzha) - row stats are gone in 8.0
- stats.rows_read++;
- stats.rows_index_next++; */
- update_row_stats(ROWS_READ);
- }
DBUG_RETURN(rc);
}
@@ -11457,7 +11135,7 @@ int ha_rocksdb::index_init(uint idx, bool sorted MY_ATTRIBUTE((__unused__))) {
DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
}
- Rdb_transaction *const tx = get_or_create_tx(table->in_use);
+ Rdb_transaction *const tx = get_or_create_tx(thd);
DBUG_ASSERT(tx != nullptr);
// If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
@@ -11489,34 +11167,6 @@ int ha_rocksdb::index_end() {
DBUG_RETURN(HA_EXIT_SUCCESS);
}
-int ha_rocksdb::index_next_intern(uchar *const buf) {
- DBUG_ENTER_FUNC();
-
- bool moves_forward = true;
- if (m_key_descr_arr[active_index_pos()]->m_is_reverse_cf) {
- moves_forward = false;
- }
-
- int rc = index_next_with_direction(buf, moves_forward);
- if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
-
- DBUG_RETURN(rc);
-}
-
-int ha_rocksdb::index_prev_intern(uchar *const buf) {
- DBUG_ENTER_FUNC();
-
- bool moves_forward = false;
- if (m_key_descr_arr[active_index_pos()]->m_is_reverse_cf) {
- moves_forward = true;
- }
-
- int rc = index_next_with_direction(buf, moves_forward);
- if (rc == HA_ERR_KEY_NOT_FOUND) rc = HA_ERR_END_OF_FILE;
-
- DBUG_RETURN(rc);
-}
-
/**
Called by the partition manager for truncating tables.
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 0d366b92c67..11913a3aec6 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -204,9 +204,6 @@ class ha_rocksdb : public my_core::handler {
uchar *m_sk_match_prefix;
uint m_sk_match_length;
- /* Buffer space for the above */
- uchar *m_sk_match_prefix_buf;
-
/* Second buffers, used by UPDATE. */
uchar *m_sk_packed_tuple_old;
Rdb_string_writer m_sk_tails_old;
@@ -631,8 +628,6 @@ class ha_rocksdb : public my_core::handler {
int index_next(uchar *const buf) override
MY_ATTRIBUTE((__warn_unused_result__));
- int index_next_with_direction(uchar *const buf, bool move_forward)
- MY_ATTRIBUTE((__warn_unused_result__));
int index_prev(uchar *const buf) override
MY_ATTRIBUTE((__warn_unused_result__));
@@ -641,11 +636,6 @@ class ha_rocksdb : public my_core::handler {
int index_last(uchar *const buf) override
MY_ATTRIBUTE((__warn_unused_result__));
- int index_next_intern(uchar *const buf)
- MY_ATTRIBUTE((__warn_unused_result__));
- int index_prev_intern(uchar *const buf)
- MY_ATTRIBUTE((__warn_unused_result__));
-
class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override;
/*
Default implementation from cancel_pushed_idx_cond() suits us
@@ -788,9 +778,6 @@ class ha_rocksdb : public my_core::handler {
const rocksdb::Slice &ttl_rec_val,
const int64_t curr_ts)
MY_ATTRIBUTE((__warn_unused_result__));
- int rocksdb_skip_expired_records(const Rdb_key_def &kd,
- rocksdb::Iterator *const iter,
- bool seek_backward);
int index_read_intern(uchar *const buf, const uchar *const key,
key_part_map keypart_map,
@@ -798,10 +785,11 @@ class ha_rocksdb : public my_core::handler {
MY_ATTRIBUTE((__warn_unused_result__));
int index_read_intern(uchar *buf, bool first)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
+ int index_next_with_direction_intern(uchar *const buf, bool forward,
+ bool skip_next)
+ MY_ATTRIBUTE((__warn_unused_result__));
enum icp_result check_index_cond() const;
- int find_icp_matching_index_rec(const bool move_forward, uchar *const buf)
- MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
void calc_updated_indexes();
int update_write_row(const uchar *const old_data, const uchar *const new_data,
@@ -847,26 +835,17 @@ class ha_rocksdb : public my_core::handler {
const int64_t ttl_filter_ts)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
int read_before_key(const Rdb_key_def &kd, const bool using_full_key,
- const rocksdb::Slice &key_slice,
- const int64_t ttl_filter_ts)
+ const rocksdb::Slice &key_slice)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice,
- const int64_t ttl_filter_ts)
+ int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice)
MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
int position_to_correct_key(const Rdb_key_def &kd,
const enum ha_rkey_function &find_flag,
const bool full_key_match,
const rocksdb::Slice &key_slice,
- bool *const move_forward,
- const int64_t ttl_filter_ts)
+ bool *const move_forward)
MY_ATTRIBUTE((__warn_unused_result__));
- int read_row_from_primary_key(uchar *const buf)
- MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
- int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd,
- bool move_forward)
- MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
-
int calc_eq_cond_len(const Rdb_key_def &kd,
const enum ha_rkey_function &find_flag,
const rocksdb::Slice &slice,
@@ -914,11 +893,8 @@ class ha_rocksdb : public my_core::handler {
*/
int rnd_init(bool scan) override MY_ATTRIBUTE((__warn_unused_result__));
int rnd_end() override MY_ATTRIBUTE((__warn_unused_result__));
-
int rnd_next(uchar *const buf) override
MY_ATTRIBUTE((__warn_unused_result__));
- int rnd_next_with_direction(uchar *const buf, bool move_forward)
- MY_ATTRIBUTE((__warn_unused_result__));
int rnd_pos(uchar *const buf, uchar *const pos) override
MY_ATTRIBUTE((__warn_unused_result__));
@@ -1250,6 +1226,15 @@ inline void rocksdb_smart_next(bool seek_backward,
}
}
+inline void rocksdb_smart_prev(bool seek_backward,
+ rocksdb::Iterator *const iter) {
+ if (seek_backward) {
+ iter->Next();
+ } else {
+ iter->Prev();
+ }
+}
+
// If the iterator is not valid it might be because of EOF but might be due
// to IOError or corruption. The good practice is always check it.
// https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
1
0
17 May '21
revision-id: b9b6ca04c17cde7d866e1dd21ad926cefadcecee (percona-202102-48-gb9b6ca04c17)
parent(s): b345ab69f4c44dec6e94865ec6d43178159a4fa6
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 17:15:44 +0300
message:
Apply patch: Merge index scan with index reads
Summary:
This merges index scans with index lookups, so that the code paths that perform a retry with a snapshot refresh are consolidated into one location.
Also, in `position_to_correct_key`, I avoid a redundant call to `pack_index_tuple`, since the tuple was already packed earlier. To do this, the packed tuple is saved in `m_sk_match_prefix_buf`.
Test Plan: mtr
Reviewers: luqun, herman, yzha
Subscribers: pgl, vinaybhat
Differential Revision: https://phabricator.intern.facebook.com/D23358423
---
storage/rocksdb/ha_rocksdb.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index b5f5ad23d90..0d366b92c67 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -279,7 +279,6 @@ class ha_rocksdb : public my_core::handler {
/* We only iterate but don't need to decode anything */
bool m_iteration_only;
-
bool m_rnd_scan_started;
/*
1
0