revision-id: af2c310e95343e4455c1cc3dcf893651d27e6c06 (percona-202103-65-gaf2c310e953) parent(s): cd90913f526a8166a9230442375bdaa862f133f3 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-07-19 12:45:16 +0300 message: This adds a global my.cnf parameter, rocksdb_use_range_locking. (Cherry-picked on top of the pushed iterator patches) When it is ON, MyRocks will: - initialize RocksDB to use the range-locking lock manager - for all DML operations (including SELECT .. FOR UPDATE), lock the scanned range before reading/modifying rows. - In range locking mode, there is no snapshot checking (cannot do that for ranges). Instead, MyRocks will read and modify the latest committed data, just like InnoDB does (in the code, grep for (start|end)_ignore_snapshot) - Queries that do not have a finite range to scan, like UPDATE t1 .... ORDER BY t1.key LIMIT n will use a "Locking iterator" which will read rows, lock the range, and re-read the rows. See class LockingIterator. --- mysql-test/suite/rocksdb/combinations | 3 + .../suite/rocksdb/include/have_range_locking.inc | 3 + .../suite/rocksdb/include/not_range_locking.inc | 5 + .../rocksdb/include/select_from_is_rowlocks.inc | 66 +++ .../suite/rocksdb/r/hermitage-range_locking.result | 652 +++++++++++++++++++++ ...issue243_transactionStatus-range_locking.result | 182 ++++++ .../r/level_repeatable_read-range_locking.result | 106 ++++ mysql-test/suite/rocksdb/r/range_locking.result | 522 +++++++++++++++++ .../r/range_locking_deadlock_tracking.result | 453 ++++++++++++++ .../rocksdb/r/range_locking_escalation.result | 27 + .../rocksdb/r/range_locking_refresh_iter.result | 50 ++ .../suite/rocksdb/r/range_locking_rev_cf.result | 482 +++++++++++++++ .../rocksdb/r/range_locking_seek_for_update.result | 279 +++++++++ .../rocksdb/r/range_locking_shared_locks.result | 251 ++++++++ mysql-test/suite/rocksdb/r/rocksdb.result | 3 + .../suite/rocksdb/r/rocksdb_read_free_rpl.result | 2 +- 
.../rocksdb/r/rocksdb_timeout_rollback.result | 3 + mysql-test/suite/rocksdb/r/unique_sec.result | 4 + .../suite/rocksdb/r/unique_sec_rev_cf.result | 4 + mysql-test/suite/rocksdb/t/deadlock_tracking.test | 7 +- .../t/drop_cf_before_show_deadlock_info.test | 4 + .../suite/rocksdb/t/hermitage-range_locking.test | 15 + mysql-test/suite/rocksdb/t/hermitage.inc | 14 +- mysql-test/suite/rocksdb/t/hermitage.test | 3 + mysql-test/suite/rocksdb/t/i_s_deadlock.test | 4 + mysql-test/suite/rocksdb/t/issue111.test | 4 + .../issue243_transactionStatus-range_locking.test | 10 + .../rocksdb/t/issue243_transactionStatus.test | 4 + .../t/level_repeatable_read-range_locking.test | 9 + .../suite/rocksdb/t/level_repeatable_read.test | 3 + mysql-test/suite/rocksdb/t/lock_info.test | 3 + mysql-test/suite/rocksdb/t/locking_issues.test | 3 + mysql-test/suite/rocksdb/t/max_row_locks.test | 1 + mysql-test/suite/rocksdb/t/range_locking.inc | 544 +++++++++++++++++ mysql-test/suite/rocksdb/t/range_locking.test | 6 + .../rocksdb/t/range_locking_deadlock_tracking.test | 196 +++++++ .../rocksdb/t/range_locking_escalation-master.opt | 1 + .../suite/rocksdb/t/range_locking_escalation.test | 39 ++ .../rocksdb/t/range_locking_refresh_iter.test | 70 +++ .../suite/rocksdb/t/range_locking_rev_cf.test | 12 + .../rocksdb/t/range_locking_seek_for_update.test | 288 +++++++++ .../rocksdb/t/range_locking_shared_locks.test | 202 +++++++ mysql-test/suite/rocksdb/t/rocksdb.test | 3 + .../suite/rocksdb/t/rocksdb_concurrent_delete.test | 4 + mysql-test/suite/rocksdb/t/rocksdb_locks.test | 3 + .../suite/rocksdb/t/rocksdb_read_free_rpl.test | 2 +- .../suite/rocksdb/t/rocksdb_timeout_rollback.test | 2 + mysql-test/suite/rocksdb/t/rpl_row_not_found.inc | 2 + .../suite/rocksdb/t/select_lock_in_share_mode.test | 3 + mysql-test/suite/rocksdb/t/unique_check.test | 5 + mysql-test/suite/rocksdb/t/unique_sec.inc | 10 +- mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test | 1 + mysql-test/suite/rocksdb/t/varbinary_format.test | 
4 + mysql-test/suite/rocksdb/t/varchar_format.test | 2 + .../r/rocksdb_max_lock_memory_basic.result | 7 + .../r/rocksdb_use_range_locking_basic.result | 7 + .../t/rocksdb_max_lock_memory_basic.test | 5 + .../t/rocksdb_use_range_locking_basic.test | 5 + storage/rocksdb/CMakeLists.txt | 1 + storage/rocksdb/get_rocksdb_files.sh | 2 +- storage/rocksdb/ha_rocksdb.cc | 637 ++++++++++++++++++-- storage/rocksdb/ha_rocksdb.h | 18 +- storage/rocksdb/nosql_access.cc | 6 +- storage/rocksdb/rdb_i_s.cc | 82 ++- storage/rocksdb/rdb_iterator.cc | 33 +- storage/rocksdb/rdb_iterator.h | 9 + storage/rocksdb/rdb_locking_iter.cc | 108 ++++ storage/rocksdb/rdb_locking_iter.h | 190 ++++++ storage/rocksdb/rdb_utils.cc | 27 + storage/rocksdb/rdb_utils.h | 3 + 70 files changed, 5635 insertions(+), 85 deletions(-) diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations index acf2f49a0c3..5e3b56932c6 100644 --- a/mysql-test/suite/rocksdb/combinations +++ b/mysql-test/suite/rocksdb/combinations @@ -7,3 +7,6 @@ rocksdb_write_policy=write_prepared [write_unprepared] rocksdb_write_policy=write_unprepared rocksdb_write_batch_flush_threshold=1 + +[range_locking] +rocksdb_use_range_locking=1 diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc new file mode 100644 index 00000000000..a8600daea77 --- /dev/null +++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc @@ -0,0 +1,3 @@ +if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) { + --skip Test requires range locking +} diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc new file mode 100644 index 00000000000..62c26b134bc --- /dev/null +++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc @@ -0,0 +1,5 @@ +--let $_use_range_locking= `select 
@@rocksdb_use_range_locking` +if ($_use_range_locking == 1) +{ + --skip Test doesn't support range locking +} diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc new file mode 100644 index 00000000000..e5f54d68914 --- /dev/null +++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc @@ -0,0 +1,66 @@ +--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +# +# An include to print contents of I_S.ROCKSB_LOCKS +# +# Implicit "parameters" +# - Currently it prints locks on t1.PRIMARY +# +# Explicit "parameter" variables: +# - $TRX1_ID - print this transaction as "TRX1" +# - $TRX2_ID - print this transaction as "TRX2" +# +# - $select_from_is_rowlocks_current_trx_only +# - $order_by_rowkey + +--disable_query_log +set @cf_id=(select column_family from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); +set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx + where thread_id=connection_id()); +set @indexnr= (select lower(lpad(hex(index_number),8,'0')) from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); + +set @indexnr_next= (select lower(lpad(hex(index_number+1),8,'0')) + from information_schema.rocksdb_ddl + where table_name='t1' and index_name='PRIMARY'); + +let $extra_where = where 1; + +if ($select_from_is_rowlocks_current_trx_only) +{ + let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id); +} + +# If TRX1_ID is not specified, get the current transaction: +let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id"); +if ($TRX1_ID) +{ + let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID"); +} + +if ($TRX2_ID) +{ + let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID"); +} + +if ($order_by_rowkey) +{ + let 
$extra_order_by = ORDER BY 3,2; +} + +if (!$order_by_rowkey) +{ + --sorted_result +} + +eval select + replace(column_family_id, @cf_id, "\$cf_id") as COLUMN_FAMILY_ID, + $transaction_col as TRANSACTION_ID, + replace( + replace(`key`, @indexnr, '\${indexnr}'), + @indexnr_next, '\${indexnr+1}' + ) as `KEY`, + mode +from information_schema.rocksdb_locks $extra_where $extra_order_by; + +--enable_query_log diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result new file mode 100644 index 00000000000..3938fa38b6c --- /dev/null +++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result @@ -0,0 +1,652 @@ +DROP TABLE IF EXISTS test; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connect con3,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +connection con1; +create table test (id int primary key, value int) engine=rocksdb; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test; +id value +1 10 +2 20 +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +rollback; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +commit; +connection con2; +select * from test; +id value +1 11 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); 
+begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 22 where id = 2; +connection con1; +select * from test where id = 2; +id value +2 20 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +update test set value = 19 where id = 2; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +update test set value = 18 where id = 2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +commit; +connection con3; +select * from test; +id value +1 12 +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value = 30; +id value +connection con2; +insert into test (id, value) values(3, 30); +commit; +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = value + 10; +connection con2; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors'; +select * from test; +id value +1 10 +2 20 +delete from test where value = 20; +connection con1; +commit; +connection con2; +select * from test; +id value +2 30 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), 
(2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +select * from test; +id value +1 12 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +select * from test where id = 2; +id value +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +select * from test where id = 2; +id value +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 5 = 0; +id value +1 10 +2 20 +connection con2; +update test set value = 12 where value = 10; +commit; +connection con1; +select * from test where value % 3 = 0; +id value +1 12 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test; +id value +1 10 +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +delete from test where value = 20; +select * from test where id = 2; +id value +2 18 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * 
from test where id in (1,2); +id value +1 10 +2 20 +connection con2; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 21 where id = 2; +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 3 = 0; +id value +connection con2; +select * from test where value % 3 = 0; +id value +connection con1; +insert into test (id, value) values(3, 30); +connection con2; +insert into test (id, value) values(4, 42); +connection con1; +commit; +connection con2; +commit; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection default; +drop table test; +disconnect con1; +disconnect con2; +disconnect con3; +DROP TABLE IF EXISTS test; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con3,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +create table test (id int primary key, value int) engine=rocksdb; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test; +id value +1 10 +2 20 +update test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +rollback; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update 
test set value = 101 where id = 1; +connection con2; +select * from test; +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +commit; +connection con2; +select * from test; +id value +1 10 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 22 where id = 2; +connection con1; +select * from test where id = 2; +id value +2 20 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = 11 where id = 1; +update test set value = 19 where id = 2; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +update test set value = 18 where id = 2; +connection con3; +select * from test; +id value +1 11 +2 19 +connection con2; +commit; +connection con3; +select * from test; +id value +1 11 +2 19 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value = 30; +id value +connection con2; +insert into test (id, value) values(3, 30); +commit; +connection con1; +select * from test where value % 3 = 0; +id value +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +update test set value = value + 10; +connection con2; +select variable_value into @a from 
performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors'; +select * from test; +id value +1 10 +2 20 +delete from test where value = 20; +connection con1; +commit; +connection con2; +select * from test; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 12 where id = 1; +connection con1; +commit; +connection con2; +select * from test; +id value +1 12 +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test where id = 1; +id value +1 10 +select * from test where id = 2; +id value +2 20 +update test set value = 12 where id = 1; +update test set value = 18 where id = 2; +commit; +connection con1; +select * from test where id = 2; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 5 = 0; +id value +1 10 +2 20 +connection con2; +update test set value = 12 where value = 10; +commit; +connection con1; +select * from test where value % 3 = 0; +id value +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id = 1; +id value +1 10 +connection con2; +select * from test; +id value +1 10 +2 20 +update test set value = 12 where id = 1; +update 
test set value = 18 where id = 2; +commit; +connection con1; +delete from test where value = 20; +select * from test where id = 2; +id value +2 20 +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con2; +select * from test where id in (1,2); +id value +1 10 +2 20 +connection con1; +update test set value = 11 where id = 1; +connection con2; +update test set value = 21 where id = 2; +connection con1; +commit; +connection con2; +commit; +connection con1; +truncate table test; +insert into test (id, value) values (1, 10), (2, 20); +begin; +connection con2; +begin; +connection con3; +begin; +connection con1; +select * from test where value % 3 = 0; +id value +connection con2; +select * from test where value % 3 = 0; +id value +connection con1; +insert into test (id, value) values(3, 30); +connection con2; +insert into test (id, value) values(4, 42); +connection con1; +commit; +connection con2; +commit; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection con1; +select * from test where value % 3 = 0; +id value +3 30 +4 42 +connection default; +drop table test; +disconnect con1; +disconnect con2; +disconnect con3; diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result new file mode 100644 index 00000000000..b48535c5ee6 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result @@ -0,0 +1,182 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 ( +id INT, +val1 INT, +val2 INT, +PRIMARY KEY (id) +) ENGINE=rocksdb; +INSERT INTO t1 VALUES(1,1,1),(2,1,2); +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 1 2 +UPDATE t1 SET val1=2 WHERE id=2; +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +SHOW ENGINE rocksdb TRANSACTION 
STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t1 VALUES(20,1,1),(30,30,30); +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +20 1 1 +30 30 30 +UPDATE t1 SET val1=20, val2=20 WHERE id=20; +SELECT * FROM t1; +id val1 val2 +1 1 1 +2 2 2 +20 20 20 +30 30 30 +DELETE FROM t1 WHERE id=30; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +---SNAPSHOT, ACTIVE NUM sec +MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION +SHOW ENGINE rocksdb TRANSACTION STATUS +lock count 4, write count 4 +insert count 2, update count 1, delete count 1 +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +START TRANSACTION; +INSERT INTO t1 
VALUES(40,40,40); +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +COMMIT; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +SET AUTOCOMMIT=1; +DROP TABLE t1; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 ( +id1 INT, +id2 INT, +value INT, +PRIMARY KEY (id1), +KEY (id2) +) ENGINE=rocksdb; +SET AUTOCOMMIT=0; +START TRANSACTION; +INSERT INTO t2 VALUES(1,2,0),(10,20,30); +UPDATE t2 SET value=3 WHERE id2=2; +DELETE FROM t2 WHERE id1=10; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SET AUTOCOMMIT=1; +DROP TABLE t2; +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 ( +id1 INT, +id2 INT, +value INT, +PRIMARY KEY (id1), +UNIQUE KEY (id2) +) ENGINE=rocksdb; +SET AUTOCOMMIT=0; +START 
TRANSACTION; +INSERT INTO t2 VALUES(1,2,0),(10,20,30); +UPDATE t2 SET value=3 WHERE id2=2; +DELETE FROM t2 WHERE id1=10; +SHOW ENGINE rocksdb TRANSACTION STATUS; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +ROLLBACK; +SET AUTOCOMMIT=1; +DROP TABLE t2; diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result new file mode 100644 index 00000000000..0592b099238 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result @@ -0,0 +1,106 @@ +DROP TABLE IF EXISTS t1; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb; +START TRANSACTION; +SELECT a FROM t1; +a +connection con2; +BEGIN; +INSERT INTO t1 (a) VALUES(1); +connection con1; +SELECT a FROM t1; +a +connection con2; +INSERT INTO t1 (a) VALUES (2); +connection con1; +SELECT a FROM t1; +a +INSERT INTO t1 (a) SELECT a+100 FROM t1; +SELECT a FROM t1; +a +connection con2; +SELECT a FROM t1; +a +1 +2 +COMMIT; +SELECT a FROM t1; +a +1 +2 +connection con1; +SELECT a FROM t1; +a +INSERT INTO t1 (a) SELECT a+200 FROM t1; +SELECT a FROM t1; +a +201 +202 +COMMIT; +SELECT a FROM t1; +a +1 +2 +201 +202 +connection con2; +SELECT a FROM t1; +a +1 +2 +201 +202 +connection default; +CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb; +INSERT INTO t2 (a) VALUES (1); +COMMIT; +connection con1; 
+BEGIN; +SELECT a from t2; +a +1 +INSERT INTO t2 (a) VALUES (1), (3); +ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY' +connection con2; +INSERT INTO t2 (a) VALUES (2); +COMMIT; +connection con1; +SELECT a from t2; +a +1 +COMMIT; +connection default; +disconnect con1; +disconnect con2; +DROP TABLE t1; +DROP TABLE t2; +CREATE TABLE t3 ( +pk int unsigned PRIMARY KEY, +count int unsigned DEFAULT '0' +) ENGINE=ROCKSDB; +connect con1,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connect con2,localhost,root,,; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +connection con1; +BEGIN; +SELECT * FROM t3; +pk count +connection con2; +BEGIN; +INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1; +COMMIT; +connection con1; +INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1; +COMMIT; +SELECT count FROM t3; +count +1 +connection default; +disconnect con1; +disconnect con2; +DROP TABLE t3; diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result new file mode 100644 index 00000000000..603c0b99f09 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking.result @@ -0,0 +1,522 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +insert into t1 values (15,15); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for 
update; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +## Test: check that regular read does not get a range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +pk a +20 20 +30 30 +rollback; +connection con1; +rollback; +## Test that locks are not released when a statement inside +## a transaction is rolled back +create table t2 ( +pk int, +a int, +primary key (pk) comment 'default', +unique key(a) comment 'default' +) engine=rocksdb; +insert into t2 values (1,1),(2,2); +begin; +insert into t2 values (3,3); +insert into t2 values (10,2); +ERROR 23000: Duplicate entry '2' for key 't2.a' +connection con2; +begin; +select * from t2 where pk=3 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY +rollback; +connection con1; +rollback; +drop table t2; +connection default; +disconnect con1; +disconnect con2; +drop table t1; +# +# Test INFORMATION_SCHEMA.lock_info in range-locking mode +# +connect con1,localhost,root,,; +connection con1; +create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +delete from t1 where pk between 25 and 40; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY 
mode +$cf_id $trx_id ${indexnr}8000000a X +$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X +rollback; +begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. (after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +pk a +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X +rollback; +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) comment 'default' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; +# +# Test that locks on ranges on non-unique secondary keys inhibit +# modifications of the contents of these ranges +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +key(kp1, kp2) comment 'default' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 values 
(2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +explain +select * from t1 where kp1=2 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2) +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 3 1234 +2 5 1234 +2 7 1234 +connection default; +begin; +insert into t1 values (2, 9, 9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +delete from t1 where kp1=2 and kp2=5; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=333 where kp1=2 and kp2=3; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=2 where kp1=1 and kp2=8; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# Transaction isolation test +# +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; +# Examine the result: +# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +# 
(and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; +pk a +1 1 +2 2223 +3 3 +disconnect con1; +connection default; +drop table t1; +# +# Same test as above, but check the range scan +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; +# Examine the result: +# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; +pk a +1 1 +2 2 +3 2223 +4 2223 +5 2223 +6 6 +disconnect con1; +connection default; +drop table t1; +# +# Same as above, but test SELECT FOR UPDATE. 
+# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; +# TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +pk a +2 2 +3 3 +select * from t1 where pk=2 for update; +pk a +2 222 +select * from t1 where pk=2; +pk a +2 2 +commit; +disconnect con1; +connection default; +drop table t1; +# +# Another no-snapshot-checking test, this time for single-statement +# transaction +# +create table t1 ( +pk int, +a int, +name varchar(16), +primary key(pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1, 'row1'), (2,2,'row2'); +connect con1,localhost,root,,; +connection con1; +select get_lock('row1', 100); +get_lock('row1', 100) +1 +connection default; +# The following will read the first row (1,1,'row1'), and stop. +update t1 set a=a+100 where get_lock(name, 1000)=1; +connection con1; +update t1 set a=5 where pk=2; +select release_lock('row1'); +release_lock('row1') +1 +connection default; +# Look at the row with pk=2: +# 2, 105, row2 - means the UPDATE was reading current data (Correct) +# 2, 102, row - means the UPDATE read the snapshot (incorrect) +select * from t1; +pk a name +1 101 row1 +2 105 row2 +# Try releasing both locks (in 5.6, we will be holding only the second one) +select release_lock(name) from t1; +release_lock(name) +1 +1 +disconnect con1; +connection default; +drop table t1; +# +# Check that I_S.processlist.state is set correctly now. 
+# +create table t1( +pk int, +a int, +primary key(pk) comment 'default' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +begin; +select * from t1 where pk=2 for update; +pk a +2 2 +connect con1,localhost,root,,; +begin; +set rocksdb_lock_wait_timeout=300; +select * from t1 where pk=2 for update; +connection default; +# Now, will wait until we see con1 have state="Waiting for row lock" +rollback; +connection con1; +pk a +2 2 +rollback; +disconnect con1; +connection default; +drop table t1; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST +# +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int, +primary key(pk1, pk2) comment 'default' +) engine=rocksdb; +insert into t1 +select +A.a, B.a, A.a*10+B.a +from +t0 A, t0 B; +connect con1,localhost,root,,; +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); +connection default; +begin; +# Should use ref access w/o filesort: +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +3 9 39 +3 8 38 +3 7 37 +3 6 36 +3 5 35 +3 4 34 +3 3 33 +3 2 32 +3 1 31 +3 0 30 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X +rollback; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +# +begin; +# Should use range access with 2 keyparts and w/o 
filesort: +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +4 8 48 +4 7 47 +4 6 46 +4 5 45 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X +rollback; +connection con1; +rollback; +connection default; +drop table t0, t1; +# +# A bug: range locking was not used when scan started at table start or end +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1 select a*2,a*2 from t10; +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +begin; +select * from t1 where pk<10 order by pk limit 10 for update; +pk a +0 0 +2 2 +4 4 +6 6 +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}8000000a X +rollback; +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +pk a +1998 1998 +1996 1996 +1994 1994 +1992 1992 +# select * from information_schema.rocksdb_locks; # With replacements 
by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t10,t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result new file mode 100644 index 00000000000..00fd1788dfd --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result @@ -0,0 +1,453 @@ +set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set @prior_deadlock_detect = @@rocksdb_deadlock_detect; +set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; +set global rocksdb_deadlock_detect = on; +set global rocksdb_lock_wait_timeout = 10000; +# Clears deadlock buffer of any prior deadlocks. +set global rocksdb_max_latest_deadlocks = 0; +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +create table t (i int primary key) engine=rocksdb; +insert into t values (1), (2), (3); +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #1 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB 
TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #2 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY 
NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 10; +Deadlock #3 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +select * from t where i=2 for update; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +rollback; +i +2 +rollback; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE 
+INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 1; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set rocksdb_deadlock_detect_depth = 2; +# Range locking code will report deadlocks, because it doesn't honor +# rocksdb_deadlock_detect_depth: +Deadlock #4 +begin; +select * from t where i=1 for update; +i +1 +begin; +select * from t where i=2 for update; +i +2 +begin; +select * from t where i=3 for update; +i +3 +select * from t where i=2 for update; +select * from t where i=3 for update; +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +select * from t where i=1 for update; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +deadlocks +true +rollback; +i +3 +rollback; +i +2 +rollback; +set global 
rocksdb_max_latest_deadlocks = 5; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: PRIMARY +TABLE NAME: test.t + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +Deadlock #6 +create table t1 (id int primary key, value int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5); +begin; +update t1 set value=value+100 where id=1; +update t1 set value=value+100 where id=2; +begin; +update t1 set value=value+200 where id=3; +update t1 set value=value+100 where id=3; +update t1 set value=value+200 where id=1; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +select * from t1; +id value +1 101 +2 102 +3 103 +4 4 +5 5 +drop table t1; +set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout; +set global rocksdb_deadlock_detect = @prior_deadlock_detect; +drop table t; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF 
SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID + +--------TXN_ID GOT DEADLOCK--------- + +*** DEADLOCK PATH +========================================= +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID +---------------WAITING FOR--------------- +TSTAMP +TXN_ID +COLUMN FAMILY NAME: default +KEY +LOCK TYPE: EXCLUSIVE +INDEX NAME: NOT FOUND; IDX_ID +TABLE NAME: NOT FOUND; IDX_ID + +--------TXN_ID GOT DEADLOCK--------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + +set global rocksdb_max_latest_deadlocks = 0; +# Clears deadlock buffer of any existent deadlocks. 
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +show engine rocksdb transaction status; +Type Name Status +rocksdb +============================================================ +TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT +============================================================ +--------- +SNAPSHOTS +--------- +LIST OF SNAPSHOTS FOR EACH SESSION: +----------LATEST DETECTED DEADLOCKS---------- +----------------------------------------- +END OF ROCKSDB TRANSACTION MONITOR OUTPUT +========================================= + diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result new file mode 100644 index 00000000000..dd19d728ef2 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result @@ -0,0 +1,27 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +show variables like 'rocksdb_max_lock_memory'; +Variable_name Value +rocksdb_max_lock_memory 1024 +show status like 'rocksdb_locktree_escalation_count'; +Variable_name Value +rocksdb_locktree_escalation_count 0 +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 +select +A.a + B.a*10 + C.a*100 + D.a*1000, +12345 +from t0 A, t0 B, t0 C, t0 D; +select count(*) from t1; +count(*) +10000 +show status like 'rocksdb_locktree_escalation_count'; +Variable_name Value +rocksdb_locktree_escalation_count 127 +drop table t0,t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result new file mode 100644 index 00000000000..1067087e816 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result @@ -0,0 +1,50 @@ +select @@rocksdb_use_range_locking; +@@rocksdb_use_range_locking +1 +set debug_sync='RESET'; +create table ten(a int primary key); 
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table one_k(a int primary key); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C; +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 select a,a from ten; +insert into t1 select a+40, a+40 from ten; +insert into t1 select a+100, a+100 from one_k; +delete from t1 where pk=44; +set global rocksdb_force_flush_memtable_and_lzero_now=1; +begin; +set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont'; +update t1 set a=a+100 where pk < 3 or pk between 10 and 50; +set debug_sync='now WAIT_FOR con1_stopped'; +insert into t1 values (44,5000); +delete from t1 where pk= 42; +update t1 set a=5000 where pk between 40 and 45; +set global rocksdb_force_flush_memtable_and_lzero_now=1; +set debug_sync='now SIGNAL con1_cont'; +select * from t1 where pk<100; +pk a +0 100 +1 101 +2 102 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +40 5100 +41 5100 +43 5100 +44 5100 +45 5100 +46 146 +47 147 +48 148 +49 149 +commit; +set debug_sync='RESET'; +drop table t1, ten, one_k; diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result new file mode 100644 index 00000000000..5e1c2cf98a5 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result @@ -0,0 +1,482 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +insert into t1 values (15,15); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY 
+connection con1; +rollback; +## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +## Test: check that regular read does not get a range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25; +pk a +10 10 +20 20 +connection con2; +begin; +select * from t1 where pk between 15 and 35 for update; +pk a +20 20 +30 30 +rollback; +connection con1; +rollback; +## Test that locks are not released when a statement inside +## a transaction is rolled back +create table t2 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1', +unique key(a) comment '' +) engine=rocksdb; +insert into t2 values (1,1),(2,2); +begin; +insert into t2 values (3,3); +insert into t2 values (10,2); +ERROR 23000: Duplicate entry '2' for key 't2.a' +connection con2; +begin; +select * from t2 where pk=3 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY +rollback; +connection con1; +rollback; +drop table t2; +connection default; +disconnect con1; +disconnect con2; +drop table t1; +# +# Test INFORMATION_SCHEMA.lock_info in range-locking mode +# +connect con1,localhost,root,,; +connection con1; +create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values +(10,10),(20,20),(30,30); +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +delete from t1 where pk 
between 25 and 40; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000a X +$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X +rollback; +begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. (after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +pk a +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X +rollback; +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; +# +# Test that locks on ranges on non-unique secondary keys inhibit +# modifications of the contents of these ranges +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int 
not null, +kp2 int not null, +a int, +key(kp1, kp2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +explain +select * from t1 where kp1=2 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2) +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 3 1234 +2 5 1234 +2 7 1234 +connection default; +begin; +insert into t1 values (2, 9, 9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +delete from t1 where kp1=2 and kp2=5; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=333 where kp1=2 and kp2=3; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +update t1 set kp1=2 where kp1=1 and kp2=8; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1 +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# Transaction isolation test +# +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +1 1 +2 2 +3 3 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; +# Examine 
the result: +# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +# (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; +pk a +1 1 +2 2223 +3 3 +disconnect con1; +connection default; +drop table t1; +# +# Same test as above, but check the range scan +# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +6 6 +5 5 +4 4 +3 3 +2 2 +1 1 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; +# TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; +# Examine the result: +# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; +pk a +6 6 +5 2223 +4 2223 +3 2223 +2 2 +1 1 +disconnect con1; +connection default; +drop table t1; +# +# Same as above, but test SELECT FOR UPDATE. 
+# +create table t1 ( +pk int, +a int, +primary key (pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); +connect con1,localhost,root,,; +# TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; +pk a +6 6 +5 5 +4 4 +3 3 +2 2 +1 1 +# TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; +# TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +pk a +2 2 +3 3 +select * from t1 where pk=2 for update; +pk a +2 222 +select * from t1 where pk=2; +pk a +2 2 +commit; +disconnect con1; +connection default; +drop table t1; +# +# Check that I_S.processlist.state is set correctly now. +# +create table t1( +pk int, +a int, +primary key(pk) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); +begin; +select * from t1 where pk=2 for update; +pk a +2 2 +connect con1,localhost,root,,; +begin; +set rocksdb_lock_wait_timeout=300; +select * from t1 where pk=2 for update; +connection default; +# Now, will wait until we see con1 have state="Waiting for row lock" +rollback; +connection con1; +pk a +2 2 +rollback; +disconnect con1; +connection default; +drop table t1; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST +# +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk1 int, +pk2 int, +a int, +primary key(pk1, pk2) comment 'rev:cf1' +) engine=rocksdb; +insert into t1 +select +A.a, B.a, A.a*10+B.a +from +t0 A, t0 B; +connect con1,localhost,root,,; +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); +connection default; +begin; +# Should use ref access w/o filesort: +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ref PRIMARY 
PRIMARY 4 const # 100.00 Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +3 9 39 +3 8 38 +3 7 37 +3 6 36 +3 5 35 +3 4 34 +3 3 33 +3 2 32 +3 1 31 +3 0 30 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X +rollback; +# +# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +# +begin; +# Should use range access with 2 keyparts and w/o filesort: +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; +pk1 pk2 a +4 8 48 +4 7 47 +4 6 46 +4 5 45 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X +rollback; +connection con1; +rollback; +connection default; +drop table t0, t1; +# +# A bug: range locking was not used when scan started at table start or end +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a 
+ B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; +create table t1 ( +pk int not null, +a int, +primary key(pk) +) engine=rocksdb; +insert into t1 select a*2,a*2 from t10; +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +begin; +select * from t1 where pk<10 order by pk limit 10 for update; +pk a +0 0 +2 2 +4 4 +6 6 +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}8000000a X +rollback; +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +pk a +1998 1998 +1996 1996 +1994 1994 +1992 1992 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t10,t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result new file mode 100644 index 00000000000..514916eaa22 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result @@ -0,0 +1,279 @@ +show variables like 'rocksdb_use_range_locking'; +Variable_name Value +rocksdb_use_range_locking ON +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) +) engine=rocksdb; +insert into t1 select +A.a + B.a*10 + C.a*100, +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +# Make another connection to get the lock tree out of the STO-mode +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where pk=10 for update; +pk a +10 10 +connection default; +begin; +select * from t1 where pk=11 for update; +pk a +11 11 +# Now, we will just see locks on 10=0xA and 11=0xB: +# select * 
from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000b X +# +# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT +# +explain +select * from t1 where pk>=500 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3 +select * from t1 where pk>=500 order by pk limit 3 for update; +pk a +500 500 +501 501 +502 502 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}8000000b X +$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X +rollback; +begin; +select * from t1 where pk=11 for update; +pk a +11 11 +explain +select * from t1 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3 +select * from t1 order by pk limit 3 for update; +pk a +0 0 +1 1 +2 2 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}-${indexnr}80000002 X +$cf_id $trx_id ${indexnr}8000000b X +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0, t1; +# +# Concurrent tests: let one thread do SeekForUpdate and the other +# interfere by committing modifications +# +create table t0(a int primary key); +insert into t0 values 
(0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int, +a int, +primary key (pk) +) engine=rocksdb; +insert into t1 select +A.a + B.a*10 + C.a*100, +A.a + B.a*10 + C.a*100 +from +t0 A, t0 B, t0 C; +select * from t1 where pk<10; +pk a +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +delete from t1 where pk<10; +select * from t1 where pk<10; +pk a +# Test what happens when another transaction commits a row +# right before the range we are about to lock (nothing) +explain +select * from t1 where pk >=5 order by pk limit 3 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3 +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 3 for update; +connect con1,localhost,root,,; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (3,3); +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +10 10 +11 11 +12 12 +rollback; +delete from t1 where pk=3; +# +# Now, repeat the test but let the other transaction insert the row into +# the range we are locking +explain +select * from t1 where pk >=5 order by pk limit 1 for update; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1 +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; 
+connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (8,8); +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +8 8 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X +rollback; +delete from t1 where pk=8; +# +# Repeat the third time, this time deleting the row that SeekForUpdate saw +# +insert into t1 values (7,7); +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +pk a +10 10 +rollback; +# +# Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT +# error. MyRocks code should now be prepared that data reads cause this +# error +# +insert into t1 values (7,7); +begin; +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +select * from t1 where pk >=5 order by pk limit 1 for update; +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +begin; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; +connection default; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +# +# Backward scan test +# +connection con1; +begin; +select * from t1 where pk=500 for update; +pk a +500 500 +connection default; +insert into t1 values +(1001, 1001), +(1005, 1005), +(1007, 1007), +(1010, 1010); +begin; +select * from t1 order by pk desc limit 2 for update; +pk a +1010 1010 +1007 1007 +# The below will lock from pk=1007 (0x3ef) till the end of the table: +# select * from 
information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X +rollback; +begin; +select * from t1 where pk <1007 order by pk desc limit 2 for update; +pk a +1005 1005 +1001 1001 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X +connection con1; +rollback; +connection default; +rollback; +# +# Backward scan test 2: error condition +# +connection con1; +begin; +select * from t1 where pk=1010 for update; +pk a +1010 1010 +connection default; +begin; +select * from t1 order by pk desc limit 2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +begin; +select * from t1 where pk=1007 for update; +pk a +1007 1007 +connection default; +begin; +select * from t1 order by pk desc limit 2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; +# +# A test: full table scan doesn't lock gaps +# +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 values (10,10),(20,20),(30,30); +connect con1,localhost,root,,; +connect con2,localhost,root,,; +connection con1; +begin; +select * from t1 for update; +pk a +10 10 +20 20 +30 30 +connection con2; +insert into t1 values (5,5); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +connection con1; +rollback; +disconnect con1; +disconnect con2; +connection default; +drop table t1; diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result new file mode 100644 
index 00000000000..580108de6f6 --- /dev/null +++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result @@ -0,0 +1,251 @@ +select @@rocksdb_use_range_locking; +@@rocksdb_use_range_locking +1 +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 select a,a from t0; +# A basic test for shared locks +begin; +select * from t1 where pk=3 for update; +pk a +3 3 +select * from t1 where pk=5 lock in share mode; +pk a +5 5 +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where pk=5 lock in share mode; +pk a +5 5 +# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +$cf_id $TRX2_ID ${indexnr}80000005 S +rollback; +# Now, TRX2_ID should be gone: +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +connection default; +# Get a read lock on pk=3 (where we have a write lock). +# The result should be that we will still have a write lock +select * from t1 where pk=3 for update; +pk a +3 3 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 S +# Get a write lock on pk=5 (where we have a read lock). +# The result should be that we will have a write lock. 
+select * from t1 where pk=5 for update; +pk a +5 5 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr}80000003 X +$cf_id $TRX1_ID ${indexnr}80000005 X +connection default; +rollback; +# +# Test if a read lock inhibits write locks +# +begin; +select * from t1 where pk=2 lock in share mode; +pk a +2 2 +select * from t1 where pk=8 for update; +pk a +8 8 +connection con1; +begin; +select * from t1 where pk=2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +select * from t1 where pk between 0 and 4 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +delete from t1 where pk=2; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +# Get a shared lock +select * from t1 where pk=2 lock in share mode; +pk a +2 2 +# But this should still prevent us from acquiring a write lock on that value: +select * from t1 where pk=2 for update; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection default; +rollback; +drop table t1; +create table t1 ( +pk int not null primary key, +a int not null, +key(a) +) engine=rocksdb; +insert into t1 +select +A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a +from +t0 A, t0 B, t0 C, t0 D; +set global rocksdb_force_flush_memtable_now=1; +connection con1; +begin; +select * from t1 where pk=900 for update; +pk a +900 900 +connection default; +begin; +explain +select * from t1 where a between 2 and 5 lock in share mode; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from 
`test`.`t1` where (`test`.`t1`.`a` between 2 and 5) +select * from t1 where a between 2 and 5 lock in share mode; +pk a +2 2 +3 3 +4 4 +5 5 +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X +$cf_id $TRX1_ID ${indexnr}80000002 S +$cf_id $TRX1_ID ${indexnr}80000003 S +$cf_id $TRX1_ID ${indexnr}80000004 S +$cf_id $TRX1_ID ${indexnr}80000005 S +$cf_id $TRX1_ID ${indexnr}80000006 S +$cf_id $TRX2_ID ${indexnr}80000384 X +rollback; +disconnect con1; +drop table t0,t1; +# +# Test shared point locks and lock escalation +# +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +pk int primary key, +a int +) engine=rocksdb; +insert into t1 +select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C; +show status like 'rocksdb_locktree_current_lock_memory'; +Variable_name Value +rocksdb_locktree_current_lock_memory 0 +connect con1,localhost,root,,; +connection con1; +begin; +# CON1: get some shared locks +select * from t1 where pk=1001 lock in share mode; +pk a +1001 12345 +select * from t1 where pk=1100 lock in share mode; +pk a +1100 12345 +select * from t1 where pk=1200 lock in share mode; +pk a +1200 12345 +select * from t1 where pk=2500 lock in share mode; +pk a +connection default; +begin; +# DEFAULT: get the same locks so we have locks with multiple owners +select * from t1 where pk=1001 lock in share mode; +pk a +1001 12345 +select * from t1 where pk=1100 lock in share mode; +pk a +1100 12345 +select * from t1 where pk=1200 lock in share mode; +pk a +1200 12345 +# DEFAULT: get shared locks with one owner: +select * from t1 where pk=2510 lock in share mode; +pk a +# DEFAULT: exclusive locks on 0-10: +insert into t1 select A.a, 0 from t0 A; +connection con1; +# CON1: exclusive locks on 2000-2010: +insert into t1 select 2000+A.a, 0 from t0 A; +# select * 
from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX2_ID ${indexnr}80000000 X +$cf_id $TRX2_ID ${indexnr}80000001 X +$cf_id $TRX2_ID ${indexnr}80000002 X +$cf_id $TRX2_ID ${indexnr}80000003 X +$cf_id $TRX2_ID ${indexnr}80000004 X +$cf_id $TRX2_ID ${indexnr}80000005 X +$cf_id $TRX2_ID ${indexnr}80000006 X +$cf_id $TRX2_ID ${indexnr}80000007 X +$cf_id $TRX2_ID ${indexnr}80000008 X +$cf_id $TRX2_ID ${indexnr}80000009 X +$cf_id $TRX1_ID ${indexnr}800003e9 S +$cf_id $TRX2_ID ${indexnr}800003e9 S +$cf_id $TRX1_ID ${indexnr}8000044c S +$cf_id $TRX2_ID ${indexnr}8000044c S +$cf_id $TRX1_ID ${indexnr}800004b0 S +$cf_id $TRX2_ID ${indexnr}800004b0 S +$cf_id $TRX1_ID ${indexnr}800007d0 X +$cf_id $TRX1_ID ${indexnr}800007d1 X +$cf_id $TRX1_ID ${indexnr}800007d2 X +$cf_id $TRX1_ID ${indexnr}800007d3 X +$cf_id $TRX1_ID ${indexnr}800007d4 X +$cf_id $TRX1_ID ${indexnr}800007d5 X +$cf_id $TRX1_ID ${indexnr}800007d6 X +$cf_id $TRX1_ID ${indexnr}800007d7 X +$cf_id $TRX1_ID ${indexnr}800007d8 X +$cf_id $TRX1_ID ${indexnr}800007d9 X +$cf_id $TRX1_ID ${indexnr}800009c4 S +$cf_id $TRX2_ID ${indexnr}800009ce S +connection default; +show status like 'rocksdb_locktree_current_lock_memory'; +Variable_name Value +rocksdb_locktree_current_lock_memory 8792 +set @save_mlm= @@rocksdb_max_lock_memory; +# Set the limit to cause lock escalation: +set @cur_mem_usage= (select +variable_value +from +performance_schema.global_status +where +variable_name='rocksdb_locktree_current_lock_memory'); +set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED); +connection con1; +insert into t1 select 3000+A.a, 0 from t0 A; +# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc +COLUMN_FAMILY_ID TRANSACTION_ID KEY mode +$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X +$cf_id $TRX1_ID ${indexnr}800003e9 S +$cf_id $TRX2_ID ${indexnr}800003e9 S +$cf_id 
$TRX1_ID ${indexnr}8000044c S +$cf_id $TRX2_ID ${indexnr}8000044c S +$cf_id $TRX1_ID ${indexnr}800004b0 S +$cf_id $TRX2_ID ${indexnr}800004b0 S +$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X +$cf_id $TRX1_ID ${indexnr}800009c4 S +$cf_id $TRX2_ID ${indexnr}800009ce S +$cf_id $TRX1_ID ${indexnr}80000bb8 X +$cf_id $TRX1_ID ${indexnr}80000bb9 X +$cf_id $TRX1_ID ${indexnr}80000bba X +$cf_id $TRX1_ID ${indexnr}80000bbb X +$cf_id $TRX1_ID ${indexnr}80000bbc X +$cf_id $TRX1_ID ${indexnr}80000bbd X +$cf_id $TRX1_ID ${indexnr}80000bbe X +$cf_id $TRX1_ID ${indexnr}80000bbf X +$cf_id $TRX1_ID ${indexnr}80000bc0 X +$cf_id $TRX1_ID ${indexnr}80000bc1 X +connection con1; +rollback; +connection default; +rollback; +disconnect con1; +set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED); +drop table t0, t1; diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result index c343f66c97c..b1e6878e2d9 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb.result +++ b/mysql-test/suite/rocksdb/r/rocksdb.result @@ -985,6 +985,7 @@ rocksdb_max_background_jobs 2 rocksdb_max_bottom_pri_background_compactions 0 rocksdb_max_compaction_history 64 rocksdb_max_latest_deadlocks 5 +rocksdb_max_lock_memory 1073741824 rocksdb_max_log_file_size 0 rocksdb_max_manifest_file_size 1073741824 rocksdb_max_manual_compactions 10 @@ -1054,6 +1055,8 @@ rocksdb_use_default_sk_cf OFF rocksdb_use_direct_io_for_flush_and_compaction OFF rocksdb_use_direct_reads OFF rocksdb_use_fsync OFF +rocksdb_use_range_lock_manager_as_point OFF +rocksdb_use_range_locking OFF rocksdb_validate_tables 1 rocksdb_verify_row_debug_checksums OFF rocksdb_wal_bytes_per_sync 0 diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result index 46e846afda0..b418bfa9336 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result @@ -72,7 +72,7 @@ update t1 
set c2=100 where c1=3; delete from t1 where c1 <= 2; include/sync_slave_sql_with_master.inc [connection slave] -select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; +select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; read_free false select * from t1; diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result index 1e253a9974b..08a0a2f5942 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result @@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF begin work; insert into t1 values (9); insert into t1 values (10); +# Fix for Range Locking: force a snapshot to be taken: +select * from t1 where a=100; +a update t1 set a = a + 1 where a = 2; begin work; insert into t1 values (11); diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result index 1da78db24b1..d4ef2e0ff2e 100644 --- a/mysql-test/suite/rocksdb/r/unique_sec.result +++ b/mysql-test/suite/rocksdb/r/unique_sec.result @@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5' UPDATE t1 SET id5=34 WHERE id1=38; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5 # NULL values are unique +# (Note: the following UPDATE reads through the whole table without +# finding anything to update. 
With point locking, this is fine, +# but with range locking it will time out while waiting on a row lock +# that the other transaction is holding) UPDATE t1 SET id5=NULL WHERE value1 > 37; COMMIT; COMMIT; diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result index d6d06f6ece5..0e71e6481aa 100644 --- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result +++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result @@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5' UPDATE t1 SET id5=34 WHERE id1=38; ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5 # NULL values are unique +# (Note: the following UPDATE reads through the whole table without +# finding anything to update. With point locking, this is fine, +# but with range locking it will time out while waiting on a row lock +# that the other transaction is holding) UPDATE t1 SET id5=NULL WHERE value1 > 37; COMMIT; COMMIT; diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test index 42e46bb0f28..55e6502c079 100644 --- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test +++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test @@ -1,3 +1,9 @@ +# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE; +# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks. 
+# A part of this test that works with range locking is in +# range_locking_deadlock_tracking.test +--source suite/rocksdb/include/not_range_locking.inc + set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; set @prior_deadlock_detect = @@rocksdb_deadlock_detect; set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; @@ -137,7 +143,6 @@ rollback; connection default; --replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ show engine rocksdb transaction status; - echo Deadlock #6; connection con1; create table t1 (id int primary key, value int) engine=rocksdb; diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test index f7eb8151f40..05ae30f2ddd 100644 --- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test +++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test @@ -3,6 +3,10 @@ --source include/have_rocksdb.inc --source include/count_sessions.inc +# Doesn't work with range locking because range locking +# does not provide info in rocksdb_deadlock. +--source suite/rocksdb/include/not_range_locking.inc + --disable_query_log call mtr.add_suppression("Column family '[a-z_]+' not found"); --enable_query_log diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test new file mode 100644 index 00000000000..55203af9cf8 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test @@ -0,0 +1,15 @@ +--source include/have_rocksdb.inc + +# Range locking uses InnoDB-like transaction isolation, which +# means the results differ from "true" Repeatable Read. +--source suite/rocksdb/include/have_range_locking.inc + + +# Hermitage is an attempt to test transaction isolation levels. 
+# https://github.com/ept/hermitage + +let $trx_isolation = READ COMMITTED; +--source hermitage.inc + +let $trx_isolation = REPEATABLE READ; +--source hermitage.inc diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc index 90f7d482533..83815a70459 100644 --- a/mysql-test/suite/rocksdb/t/hermitage.inc +++ b/mysql-test/suite/rocksdb/t/hermitage.inc @@ -108,6 +108,8 @@ select * from test where value % 3 = 0; commit; --source hermitage_init.inc +let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`; +let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`; connection con1; update test set value = value + 10; connection con2; @@ -117,13 +119,13 @@ send delete from test where value = 20; connection con1; commit; connection con2; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { reap; # RC: Returns 2 => 30 select * from test; } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { --error ER_LOCK_DEADLOCK reap; @@ -147,13 +149,13 @@ send update test set value = 12 where id = 1; connection con1; commit; connection con2; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { reap; # RC: Returns 1 => 12 select * from test; } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { --error ER_LOCK_DEADLOCK reap; @@ -200,12 +202,12 @@ update test set value = 12 where id = 1; update test set value = 18 where id = 2; commit; connection con1; -if ($trx_isolation == "READ COMMITTED") +if ($RC_OR_RANGE_LOCKING) { delete from test where value = 20; # doesn't delete anything select * from test where id = 2; # shows 2 => 18 } -if ($trx_isolation == "REPEATABLE READ") +if ($RR_AND_NOT_RANGE_LOCKING) { --error ER_LOCK_DEADLOCK delete from test where value = 20; diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test index 
e4138e8d89f..51f3f286a0e 100644 --- a/mysql-test/suite/rocksdb/t/hermitage.test +++ b/mysql-test/suite/rocksdb/t/hermitage.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# See hermitage-range_locking variant +--source suite/rocksdb/include/not_range_locking.inc + # Hermitage is an attempt to test transaction isolation levels. # https://github.com/ept/hermitage diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test index e0479d6a337..82fa9fc6bbd 100644 --- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test +++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that +# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test +--source suite/rocksdb/include/not_range_locking.inc + set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; set @prior_deadlock_detect = @@rocksdb_deadlock_detect; set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test index 671ea4708d6..3657e977a70 100644 --- a/mysql-test/suite/rocksdb/t/issue111.test +++ b/mysql-test/suite/rocksdb/t/issue111.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +# The testcase here assumes key tracking is present +# (and range locking uses InnoDB-like approach, "DMLs use Read Commited") +--source suite/rocksdb/include/not_range_locking.inc + connect (con2,localhost,root,,); connection default; diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test new file mode 100644 index 00000000000..465fb9099da --- /dev/null +++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test @@ -0,0 +1,10 @@ +# +# A range-locking variant of issue243_transactionStatus.test + +--source include/have_rocksdb.inc +--source 
suite/rocksdb/include/have_range_locking.inc + +let $forced_range_locking=1; +--source issue243_transactionStatus.test + + diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test index 1e2f0b41226..5c1948ebe81 100644 --- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test +++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test @@ -1,5 +1,9 @@ --source include/have_rocksdb.inc +if (!$forced_range_locking) { +--source suite/rocksdb/include/not_range_locking.inc +} + --disable_warnings DROP TABLE IF EXISTS t1; --enable_warnings diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test new file mode 100644 index 00000000000..6c42c7be12c --- /dev/null +++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test @@ -0,0 +1,9 @@ +--source include/have_rocksdb.inc + +# Range locking uses InnoDB-like transaction isolation, which +# means the results differ from "true" Repeatable Read. 
+--source suite/rocksdb/include/have_range_locking.inc + +let $trx_isolation = REPEATABLE READ; +--source transaction_isolation.inc + diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test index cf29073f69e..b81dcf31ab1 100644 --- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test +++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# See level_repeatable_read-range_locking variant +--source suite/rocksdb/include/not_range_locking.inc + let $trx_isolation = REPEATABLE READ; --source transaction_isolation.inc diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test index 1b624cf38c0..a277c1b8d8d 100644 --- a/mysql-test/suite/rocksdb/t/lock_info.test +++ b/mysql-test/suite/rocksdb/t/lock_info.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test) +--source suite/rocksdb/include/not_range_locking.inc + --disable_warnings DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test index 035046ae368..95a6676f78a 100644 --- a/mysql-test/suite/rocksdb/t/locking_issues.test +++ b/mysql-test/suite/rocksdb/t/locking_issues.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# A lot of tests below assume point locking, not range. 
+--source suite/rocksdb/include/not_range_locking.inc + let $isolation_level = REPEATABLE READ; --source suite/rocksdb/include/locking_issues_case1_1.inc diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test index 4b07f3d8492..d4b2604f1e3 100644 --- a/mysql-test/suite/rocksdb/t/max_row_locks.test +++ b/mysql-test/suite/rocksdb/t/max_row_locks.test @@ -1,4 +1,5 @@ --source include/have_rocksdb.inc +--source suite/rocksdb/include/not_range_locking.inc create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2)); --disable_query_log diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc new file mode 100644 index 00000000000..4f1db4399cb --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking.inc @@ -0,0 +1,544 @@ +# +# Range locking tests. +# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +--enable_connect_log + + +show variables like 'rocksdb_use_range_locking'; + +eval create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 values +(10,10),(20,20),(30,30); + +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +--echo ### Test: check that range lock inhibits a point lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; + +connection con2; +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (15,15); + +connection con1; +rollback; + +--echo ## Test: check that range lock inhibits another range lock +connection con1; +begin; +select * from t1 where pk between 5 and 25 for update; + +connection con2; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk between 15 and 35 for update; +rollback; + +connection con1; +rollback; + +--echo ## Test: check that regular read does not get a range lock +connection 
con1; +begin; +select * from t1 where pk between 5 and 25; + +connection con2; +begin; +# This must not block +select * from t1 where pk between 15 and 35 for update; +rollback; + +connection con1; +rollback; + +--echo ## Test that locks are not released when a statement inside +--echo ## a transaction is rolled back +eval +create table t2 ( + pk int, + a int, + primary key (pk) comment '$pk_cf', + unique key(a) comment '$sk_cf' +) engine=rocksdb; + +insert into t2 values (1,1),(2,2); + +begin; +insert into t2 values (3,3); +--error ER_DUP_ENTRY +insert into t2 values (10,2); + +connection con2; +begin; +# This must time out: +--error ER_LOCK_WAIT_TIMEOUT +select * from t2 where pk=3 for update; + +rollback; +connection con1; +rollback; +drop table t2; + +# cleanup +connection default; +disconnect con1; +disconnect con2; +drop table t1; + +--echo # +--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode +--echo # + +connect (con1,localhost,root,,); +connection con1; +eval create table t0 (a int primary key); +begin; +insert into t0 values (1); +connection default; + + +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 values +(10,10),(20,20),(30,30); + +begin; +select * from t1 where pk=10 for update; + +#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +delete from t1 where pk between 25 and 40; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +begin; +--echo # The following will show a range lock on 2-9 and also a point lock on 10. +--echo # This is how things currently work. 
(after MDEV-21314, not anymore) +select * from t1 where pk between 2 and 9 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +drop table t1; +connection con1; +rollback; +drop table t0; +connection default; +disconnect con1; + +--echo # +--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +eval +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + primary key(kp1, kp2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; + +begin; +select * from t1 where kp1=2 for update; + +connection default; +--echo # The lock on kp1=2 should inhibit the following INSERT: +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values ( 2,5,9999); +rollback; + +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; + +--echo # +--echo # Test that locks on ranges on non-unique secondary keys inhibit +--echo # modifications of the contents of these ranges +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +eval +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + key(kp1, kp2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 values (2, 3, 1234); +insert into t1 values (2, 5, 1234); +insert into t1 values (2, 7, 1234); +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; +begin; +--replace_column 10 # +explain +select * from t1 where kp1=2 for update; +select * from t1 where kp1=2 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (2, 9, 9999); + +--error ER_LOCK_WAIT_TIMEOUT +delete from t1 where kp1=2 and 
kp2=5; + +# Update that "moves a row away" from the locked range +--error ER_LOCK_WAIT_TIMEOUT +update t1 set kp1=333 where kp1=2 and kp2=3; + +# Update that "moves a row into" the locked range +--error ER_LOCK_WAIT_TIMEOUT +update t1 set kp1=2 where kp1=1 and kp2=8; + +rollback; + +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0,t1; + +--echo # +--echo # Transaction isolation test +--echo # + +create table t1 (pk int primary key, a int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk=2; + +--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk=2; +commit; + +--echo # Examine the result: +--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation) +--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2 +--echo # (and with key tracking, one would get an error on the second UPDATE) +connection default; +select * from t1; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Same test as above, but check the range scan +--echo # + +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=2222 where pk between 3 and 5; + +--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit +connection con1; +update t1 set a=a+1 where pk between 3 and 5; +commit; + +--echo # Examine the result: +--echo # pk={3,4,5} a=2223 means UPDATE in 
TRX1 used "read committed" (InnoDB-like isolation) +connection default; +select * from t1; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Same as above, but test SELECT FOR UPDATE. +--echo # +eval +create table t1 ( + pk int, + a int, + primary key (pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6); + +connect (con1,localhost,root,,); + +--echo # TRX1: Start, Allocate a snapshot +connection con1; +begin; +select * from t1; + +--echo # TRX2: Make a change that TRX1 will not see +connection default; +update t1 set a=222 where pk=2; +update t1 set a=333 where pk=3; + +--echo # TRX1: Check what select [FOR UPDATE] sees +connection con1; +select * from t1 where pk in (2,3); +select * from t1 where pk=2 for update; +select * from t1 where pk=2; + +commit; + +disconnect con1; +connection default; +drop table t1; + +if (!$PK_USES_REVERSE_CF) { +--echo # +--echo # Another no-snapshot-checking test, this time for single-statement +--echo # transaction +--echo # +eval +create table t1 ( + pk int, + a int, + name varchar(16), + primary key(pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1, 'row1'), (2,2,'row2'); + +connect (con1,localhost,root,,); +connection con1; +select get_lock('row1', 100); + +connection default; + +--echo # The following will read the first row (1,1,'row1'), and stop. 
+ +send update t1 set a=a+100 where get_lock(name, 1000)=1; + +# Wait till the default connection has stopped: +connection con1; + +let $wait_condition= + SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock" + AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1"; +--source include/wait_condition.inc + +# Update the second row +update t1 set a=5 where pk=2; + +select release_lock('row1'); + +connection default; +reap; + +--echo # Look at the row with pk=2: +--echo # 2, 105, row2 - means the UPDATE was reading current data (Correct) +--echo # 2, 102, row - means the UPDATE read the snapshot (incorrect) +select * from t1; + +--echo # Try releasing both locks (in 5.6, we will be holding only the second one) +select release_lock(name) from t1; + +disconnect con1; +connection default; +drop table t1; +} + +--echo # +--echo # Check that I_S.processlist.state is set correctly now. +--echo # +eval +create table t1( + pk int, + a int, + primary key(pk) comment '$pk_cf' +) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3); + +begin; +select * from t1 where pk=2 for update; + +--connect (con1,localhost,root,,) +begin; +set rocksdb_lock_wait_timeout=300; +send select * from t1 where pk=2 for update; + +connection default; +--echo # Now, will wait until we see con1 have state="Waiting for row lock" +let $wait_condition= + SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock" + AND INFO = "select * from t1 where pk=2 for update"; +--source include/wait_condition.inc + +rollback; +connection con1; +--reap +rollback; + +disconnect con1; +connection default; +drop table t1; + +--echo # +--echo # Test range locking for ranges with HA_READ_PREFIX_LAST +--echo # +create table t0(a int) engine=rocksdb; +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +eval +create table t1 ( + pk1 int, + pk2 int, + a int, + primary key(pk1, pk2) comment '$pk_cf' +) engine=rocksdb; + +insert into t1 
+select + A.a, B.a, A.a*10+B.a +from + t0 A, t0 B; + + +# Get a lock in another connection so that the primary transaction is not using +# STO optimization, and its locks can be seen in I_S.rocksdb_locks +--connect (con1,localhost,root,,) +connection con1; +begin; +insert into t1 values (0x1112222,0x1112222,0); + +connection default; +begin; +--echo # Should use ref access w/o filesort: +--replace_column 10 # +explain +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; + +select * from t1 +where pk1=3 +order by pk1 desc, pk2 desc +for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +--echo # +--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV +--echo # + +begin; +--echo # Should use range access with 2 keyparts and w/o filesort: +--replace_column 10 # +explain +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; + +select * from t1 +where pk1=4 and pk2 between 5 and 8 +order by pk1 desc, pk2 desc +for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +connection con1; +rollback; + +connection default; +drop table t0, t1; + +--echo # +--echo # A bug: range locking was not used when scan started at table start or end +--echo # +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t10(a int); +insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C; + +create table t1 ( + pk int not null, + a int, + primary key(pk) +) engine=rocksdb; + +insert into t1 select a*2,a*2 from t10; + +connection con1; +begin; +select * from t1 where pk=500 for update; +connection default; + +begin; +select * from t1 where pk<10 order by pk limit 10 for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc 
+rollback; + +begin; +select * from t1 where pk>1990 order by pk desc limit 10 for update; +let $select_from_is_rowlocks_current_trx_only=1; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + +connection con1; +rollback; +disconnect con1; + +connection default; +drop table t0,t10,t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test new file mode 100644 index 00000000000..5c599238a0a --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking.test @@ -0,0 +1,6 @@ + +--let pk_cf=default +--let sk_cf=default + +--source range_locking.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test new file mode 100644 index 00000000000..2a5966b65c3 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test @@ -0,0 +1,196 @@ +--source suite/rocksdb/include/have_range_locking.inc + +# +# This is deadlock_tracking.test, variant for running with Range Locking: +# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests +# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print +# deadlock information. +# +set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout; +set @prior_deadlock_detect = @@rocksdb_deadlock_detect; +set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks; +set global rocksdb_deadlock_detect = on; +set global rocksdb_lock_wait_timeout = 10000; +--echo # Clears deadlock buffer of any prior deadlocks. 
+set global rocksdb_max_latest_deadlocks = 0; +set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +let $engine = rocksdb; + +--source include/count_sessions.inc +connect (con1,localhost,root,,); +let $con1= `SELECT CONNECTION_ID()`; + +connect (con2,localhost,root,,); +let $con2= `SELECT CONNECTION_ID()`; + +connect (con3,localhost,root,,); +let $con3= `SELECT CONNECTION_ID()`; + +connection default; +eval create table t (i int primary key) engine=$engine; +insert into t values (1), (2), (3); +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +echo Deadlock #1; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +echo Deadlock #2; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 10; + +echo Deadlock #3; +--source include/simple_deadlock.inc +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 1; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +connection con3; +set rocksdb_deadlock_detect_depth = 2; + +--echo # 
Range locking code will report deadlocks, because it doesn't honor +--echo # rocksdb_deadlock_detect_depth: +echo Deadlock #4; +connection con1; +begin; +select * from t where i=1 for update; + +connection con2; +begin; +select * from t where i=2 for update; + +connection con3; +begin; +select * from t where i=3 for update; + +connection con1; +send select * from t where i=2 for update; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc + +send select * from t where i=3 for update; + +connection con3; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con2 and waiting_key != ""; +--source include/wait_condition.inc + +select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +--error ER_LOCK_DEADLOCK +select * from t where i=1 for update; +select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks'; +rollback; + +connection con2; +reap; +rollback; + +connection con1; +reap; +rollback; + +connection default; +set global rocksdb_max_latest_deadlocks = 5; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; + +--disable_testcase BUG#0000 +echo Deadlock #5; +connection con1; +begin; +select * from t where i=1 for update; + +connection con2; +begin; +select * from t where i=2 for update; + +connection con3; +begin; +select * from t where i=3 lock in share mode; + +connection con1; +select * from t where i=100 for update; +select * from t where i=101 for update; +send select * from t where i=2 for update; + +connection con2; +let $wait_condition = select count(*) = 1 
from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc + +select * from t where i=3 lock in share mode; +select * from t where i=200 for update; +select * from t where i=201 for update; + +--error ER_LOCK_DEADLOCK +select * from t where i=1 lock in share mode; +rollback; + +connection con1; +reap; +rollback; + +connection con3; +rollback; + +connection default; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +--enable_testcase +echo Deadlock #6; +connection con1; +create table t1 (id int primary key, value int) engine=rocksdb; +insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5); +begin; +update t1 set value=value+100 where id=1; +update t1 set value=value+100 where id=2; + +connection con2; +begin; +update t1 set value=value+200 where id=3; + +connection con1; +send update t1 set value=value+100 where id=3; + +connection con2; +let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx +where thread_id = $con1 and waiting_key != ""; +--source include/wait_condition.inc +--error ER_LOCK_DEADLOCK +update t1 set value=value+200 where id=1; + +# con2 tx is automatically rolled back +connection con1; +reap; +select * from t1; +drop table t1; + +connection default; + +disconnect con1; +disconnect con2; +disconnect con3; + +set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout; +set global rocksdb_deadlock_detect = @prior_deadlock_detect; +drop table t; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +set global rocksdb_max_latest_deadlocks = 0; +--echo # Clears deadlock buffer of any existent deadlocks. 
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks; +--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/ +show engine rocksdb transaction status; +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt new file mode 100644 index 00000000000..d0087e2a77b --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt @@ -0,0 +1 @@ +--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024 diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test new file mode 100644 index 00000000000..5a6e9fa6616 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test @@ -0,0 +1,39 @@ +# +# Range Locking - Lock Escalation Tests. 
+# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log + + +show variables like 'rocksdb_use_range_locking'; +show variables like 'rocksdb_max_lock_memory'; +show status like 'rocksdb_locktree_escalation_count'; +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +#begin; +#insert into t1 values (1000111,100011); +#connect (con1,localhost,root,,); +#connection con1; + +insert into t1 +select + A.a + B.a*10 + C.a*100 + D.a*1000, + 12345 +from t0 A, t0 B, t0 C, t0 D; + +select count(*) from t1; + +#connection default; +#disconnect con1; +show status like 'rocksdb_locktree_escalation_count'; + +drop table t0,t1; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test new file mode 100644 index 00000000000..9bbb1b9b392 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test @@ -0,0 +1,70 @@ +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--source include/have_debug_sync.inc + +select @@rocksdb_use_range_locking; + +--disable_warnings +set debug_sync='RESET'; +--enable_warnings +# +# Testcase for iterator snapshot refresh +# +create table ten(a int primary key); +insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table one_k(a int primary key); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C; + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 select a,a from ten; +insert into t1 select a+40, a+40 from ten; +insert into t1 select a+100, a+100 from one_k; +delete from t1 where pk=44; +set global rocksdb_force_flush_memtable_and_lzero_now=1; + +# Ok, now the table has these PK ranges: +# 0..9 40..49 100...1000 +# and all rows have pk=a +connect (con1,localhost,root,,); 
+connect (con2,localhost,root,,); + +connection con1; +begin; +set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont'; +send +update t1 set a=a+100 where pk < 3 or pk between 10 and 50; + +# The query is now stuck at the start of the second range. + + +## con2> +connection con2; +set debug_sync='now WAIT_FOR con1_stopped'; + +# Make some changes to check if the iterator is reading current data or +# snapshot +insert into t1 values (44,5000); +delete from t1 where pk= 42; +update t1 set a=5000 where pk between 40 and 45; +set global rocksdb_force_flush_memtable_and_lzero_now=1; + +set debug_sync='now SIGNAL con1_cont'; + +connection con1; +#--error ER_GET_ERRMSG +reap; +select * from t1 where pk<100; + +commit; +disconnect con1; +disconnect con2; +connection default; +set debug_sync='RESET'; + +drop table t1, ten, one_k; + diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test new file mode 100644 index 00000000000..8b993764235 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test @@ -0,0 +1,12 @@ +# +# Range locking tests.
+# + +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc + +--let pk_cf=rev:cf1 +--let PK_USES_REVERSE_CF=1 + +--source range_locking.inc + diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test new file mode 100644 index 00000000000..c1f0fe312e0 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test @@ -0,0 +1,288 @@ +# +# Range Locking : tests for SeekForUpdate feature +# + +--source include/have_rocksdb.inc +--source include/have_debug_sync.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log +show variables like 'rocksdb_use_range_locking'; + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int, + a int, + primary key (pk) +) engine=rocksdb; + +insert into t1 select + A.a + B.a*10 + C.a*100, + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; + +--echo # Make another connection to get the lock tree out of the STO-mode +connect (con1,localhost,root,,); +connection con1; +begin; +select * from t1 where pk=10 for update; + +connection default; +begin; +select * from t1 where pk=11 for update; + +let $select_from_is_rowlocks_current_trx_only=1; +--echo # Now, we will just see locks on 10=0xA and 11=0xB: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +--echo # +--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT +--echo # +--replace_column 10 # +explain +select * from t1 where pk>=500 order by pk limit 3 for update; +select * from t1 where pk>=500 order by pk limit 3 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc +rollback; + + +begin; +select * from t1 where pk=11 for update; +explain +select * from t1 order by pk limit 3 for update; +select * from t1 order by pk limit 3 for update; + +--source 
suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +connection con1; +rollback; +disconnect con1; +connection default; +drop table t0, t1; + + +--echo # +--echo # Concurrent tests: let one thread do SeekForUpdate and the other +--echo # interfere by committing modifications +--echo # + +create table t0(a int primary key); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int, + a int, + primary key (pk) +) engine=rocksdb; + +insert into t1 select + A.a + B.a*10 + C.a*100, + A.a + B.a*10 + C.a*100 +from + t0 A, t0 B, t0 C; + +select * from t1 where pk<10; +delete from t1 where pk<10; +select * from t1 where pk<10; + + +--echo # Test what happens when another transaction commits a row +--echo # right before the range we are about to lock (nothing) + +--replace_column 10 # +explain +select * from t1 where pk >=5 order by pk limit 3 for update; + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send select * from t1 where pk >=5 order by pk limit 3 for update; + +connect (con1,localhost,root,,); +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (3,3); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; +rollback; + +delete from t1 where pk=3; + +--echo # +--echo # Now, repeat the test but let the other transaction insert the row into +--echo # the range we are locking + +--replace_column 10 # +explain +select * from t1 where pk >=5 order by pk limit 1 for update; + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +insert into t1 values (8,8); +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; 
+delete from t1 where pk=8; + +--echo # +--echo # Repeat the third time, this time deleting the row that SeekForUpdate saw +--echo # +insert into t1 values (7,7); + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +reap; + +rollback; + +--echo # +--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT +--echo # error. MyRocks code should now be prepared that data reads cause this +--echo # error +--echo # +insert into t1 values (7,7); + +begin; + +set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted'; +send +select * from t1 where pk >=5 order by pk limit 1 for update; + +connection con1; +set debug_sync='now WAIT_FOR about_to_lock_range'; +begin; +delete from t1 where pk=7; +set debug_sync='now SIGNAL spoiler_inserted'; + +connection default; +--error ER_LOCK_WAIT_TIMEOUT +reap; + +rollback; + +connection con1; +rollback; +connection default; + +--echo # +--echo # Backward scan test +--echo # +connection con1; +begin; +select * from t1 where pk=500 for update; +connection default; + +insert into t1 values + (1001, 1001), + (1005, 1005), + (1007, 1007), + (1010, 1010); + +begin; +select * from t1 order by pk desc limit 2 for update; + +let $select_from_is_rowlocks_current_trx_only=1; + +--echo # The below will lock from pk=1007 (0x3ef) till the end of the table: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +begin; +select * from t1 where pk <1007 order by pk desc limit 2 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection con1; +rollback; + +connection default; +rollback; + +--echo # +--echo # Backward scan test 2: error condition +--echo # 
+connection con1; +begin; +select * from t1 where pk=1010 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 order by pk desc limit 2 for update; +rollback; + +connection con1; +rollback; +begin; +select * from t1 where pk=1007 for update; + +connection default; +begin; +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 order by pk desc limit 2 for update; +rollback; + +connection con1; +rollback; + +disconnect con1; +connection default; +drop table t0,t1; + +--echo # +--echo # A test: full table scan doesn't lock gaps +--echo # + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 values (10,10),(20,20),(30,30); + +connect (con1,localhost,root,,); +connect (con2,localhost,root,,); + +connection con1; +begin; + +select * from t1 for update; + +connection con2; + +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values (5,5); + +connection con1; +rollback; + +disconnect con1; +disconnect con2; +connection default; +drop table t1; diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test new file mode 100644 index 00000000000..c6e4e457897 --- /dev/null +++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test @@ -0,0 +1,202 @@ +# +# Test for shared lock support for range locking +# +--source include/have_rocksdb.inc +--source suite/rocksdb/include/have_range_locking.inc +--enable_connect_log + +select @@rocksdb_use_range_locking; + +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + + +insert into t1 select a,a from t0; + +--echo # A basic test for shared locks + +begin; +select * from t1 where pk=3 for update; +select * from t1 where pk=5 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connect (con1,localhost,root,,); +connection 
con1; +begin; +select * from t1 where pk=5 lock in share mode; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; +--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; +--echo # Now, TRX2_ID should be gone: +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; + +--echo # Get a read lock on pk=3 (where we have a write lock). +--echo # The result should be that we will still have a write lock +select * from t1 where pk=3 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +--echo # Get a write lock on pk=5 (where we have a read lock). +--echo # The result should be that we will have a write lock. +select * from t1 where pk=5 for update; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; +rollback; + +--echo # +--echo # Test if a read lock inhibits write locks +--echo # + +begin; +select * from t1 where pk=2 lock in share mode; +select * from t1 where pk=8 for update; + +connection con1; +begin; + +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk=2 for update; + +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk between 0 and 4 for update; + +--error ER_LOCK_WAIT_TIMEOUT +delete from t1 where pk=2; + +--echo # Get a shared lock +select * from t1 where pk=2 lock in share mode; + +--echo # But this should still prevent us from acquiring a write lock on that value: +--error ER_LOCK_WAIT_TIMEOUT +select * from t1 where pk=2 for update; + +rollback; +connection default; +rollback; + +drop table t1; +create table t1 ( + pk int not null primary key, + a int not null, + key(a) +) engine=rocksdb; + +insert into t1 +select + A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a +from + t0 A, t0 B, t0 C, t0 D; +set global rocksdb_force_flush_memtable_now=1; + +connection con1; +begin; +select * from t1 where pk=900 for update; +let 
$TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connection default; +begin; +--replace_column 10 # +explain +select * from t1 where a between 2 and 5 lock in share mode; +select * from t1 where a between 2 and 5 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +rollback; + +disconnect con1; + +drop table t0,t1; + +--echo # +--echo # Test shared point locks and lock escalation +--echo # +create table t0 (a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); + +create table t1 ( + pk int primary key, + a int +) engine=rocksdb; + +insert into t1 +select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C; + +show status like 'rocksdb_locktree_current_lock_memory'; + +connect (con1,localhost,root,,); +connection con1; + +begin; +--echo # CON1: get some shared locks +select * from t1 where pk=1001 lock in share mode; +select * from t1 where pk=1100 lock in share mode; +select * from t1 where pk=1200 lock in share mode; + +select * from t1 where pk=2500 lock in share mode; +let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + +connection default; +begin; +--echo # DEFAULT: get the same locks so we have locks with multiple owners +select * from t1 where pk=1001 lock in share mode; +select * from t1 where pk=1100 lock in share mode; +select * from t1 where pk=1200 lock in share mode; + +--echo # DEFAULT: get shared locks with one owner: +select * from t1 where pk=2510 lock in share mode; +let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`; + + +--echo # DEFAULT: exclusive locks on 0-10: +insert into t1 select A.a, 0 from t0 A; + +connection con1; +--echo # CON1: exclusive locks on 2000-2010: +insert into t1 select 2000+A.a, 0 from t0 A; + +let 
$order_by_rowkey=1; +#select * from information_schema.rocksdb_locks; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection default; +show status like 'rocksdb_locktree_current_lock_memory'; +set @save_mlm= @@rocksdb_max_lock_memory; + +--echo # Set the limit to cause lock escalation: +set @cur_mem_usage= (select + variable_value + from + performance_schema.global_status + where + variable_name='rocksdb_locktree_current_lock_memory'); + +set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED); + +connection con1; +insert into t1 select 3000+A.a, 0 from t0 A; + +#select * from information_schema.rocksdb_locks; +--source suite/rocksdb/include/select_from_is_rowlocks.inc + +connection con1; +rollback; +connection default; +rollback; + +disconnect con1; +set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED); + +drop table t0, t1; + + diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test index c063d8c7ccb..0544214b8c9 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb.test +++ b/mysql-test/suite/rocksdb/t/rocksdb.test @@ -2,6 +2,9 @@ --source suite/rocksdb/include/have_write_committed.inc --source include/count_sessions.inc +# Does SHOW WARNINGS and SHOW STATUS which change in Range Locking mode +--source suite/rocksdb/include/not_range_locking.inc + # # RocksDB Storage Engine tests # diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test index 47818bfdbe1..3aa51b7be80 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test @@ -27,6 +27,10 @@ # In all cases, RR gets snapshot conflict errors if non-first rows get # deleted by another transaction after scanning. +# The tests do not work with range locking, as it first locks the range it +# is about to read. 
+--source suite/rocksdb/include/not_range_locking.inc + --source include/have_rocksdb.inc --source include/have_debug_sync.inc diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test index ff092773737..8b3975723df 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test @@ -5,6 +5,9 @@ # --source include/have_debug.inc +# Range locking requests locks before doing snapshot checking. +--source suite/rocksdb/include/not_range_locking.inc + --enable_connect_log create table t1 (pk int not null primary key) engine=rocksdb; diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test index 92981e19a43..cc5c1a90436 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test @@ -62,7 +62,7 @@ update t1 set c2=100 where c1=3; delete from t1 where c1 <= 2; --source include/sync_slave_sql_with_master.inc --source include/rpl_connection_slave.inc -select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; +select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls'; select * from t1; --echo diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test index 694594efd70..1273a2b6f70 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test @@ -46,6 +46,8 @@ begin work; insert into t1 values (9); insert into t1 values (10); +--echo # Fix for Range Locking: force a snapshot to be taken: +select * from t1 where a=100; update t1 set a = a + 1 where a 
= 2; connection con1; diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc index 5a78979f048..63b72ce5c5a 100644 --- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc +++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc @@ -3,6 +3,8 @@ --source include/have_debug.inc --source include/have_debug_sync.inc +--source suite/rocksdb/include/not_range_locking.inc + connection master; --disable_warnings drop table if exists t1; diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test index 23ce6d45234..cf9d53ff88a 100644 --- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test +++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test @@ -1,5 +1,8 @@ --source include/have_rocksdb.inc +# Range locking only supports exclusive locks currently. +--source suite/rocksdb/include/not_range_locking.inc + # # SELECT .. LOCK IN SHARE MODE # diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test index 47ca74d0e5e..9814d89448d 100644 --- a/mysql-test/suite/rocksdb/t/unique_check.test +++ b/mysql-test/suite/rocksdb/t/unique_check.test @@ -2,6 +2,11 @@ --source include/have_debug_sync.inc --source include/count_sessions.inc +# Doesn't work with range locking because lock tree waits do not set +# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for +# details. 
+--source suite/rocksdb/include/not_range_locking.inc + # For GitHub issue#167 -- Unique key check doesn't work connect (con1, localhost, root,,); diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc index ce0bb1e39a9..508816e6ace 100644 --- a/mysql-test/suite/rocksdb/t/unique_sec.inc +++ b/mysql-test/suite/rocksdb/t/unique_sec.inc @@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38; UPDATE t1 SET id5=34 WHERE id1=38; --echo # NULL values are unique +--echo # (Note: the following UPDATE reads through the whole table without +--echo # finding anything to update. With point locking, this is fine, +--echo # but with range locking it will time out while waiting on a row lock +--echo # that the other transaction is holding) +if (`select @@rocksdb_use_range_locking=0`) { UPDATE t1 SET id5=NULL WHERE value1 > 37; - +} +if (`select @@rocksdb_use_range_locking=1`) { +-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37; +} connection con1; COMMIT; diff --git a/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test b/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test index d6a8e3d5a1b..8d2e64e5890 100644 --- a/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test +++ b/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test @@ -3,3 +3,4 @@ let ddl= $MYSQL_TMP_DIR/unique_sec_rev_cf.sql; --exec sed s/##CF##/" COMMENT 'rev:cf'"/g suite/rocksdb/t/unique_sec.inc > $ddl --source $ddl +--remove_file $ddl diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test index fbebfeac85a..0d8a35a1321 100644 --- a/mysql-test/suite/rocksdb/t/varbinary_format.test +++ b/mysql-test/suite/rocksdb/t/varbinary_format.test @@ -1,6 +1,10 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +# The test uses SELECT .. 
FOR UPDATE and examines which locks it acquires +# Range Locking will use different locks from point locking +--source suite/rocksdb/include/not_range_locking.inc + # Create a table with a varbinary key with the current format and validate # that it sorts correctly CREATE TABLE t1( diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test index 3ea1a1a60b3..985b2c0c8e7 100644 --- a/mysql-test/suite/rocksdb/t/varchar_format.test +++ b/mysql-test/suite/rocksdb/t/varchar_format.test @@ -1,6 +1,8 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +--source suite/rocksdb/include/not_range_locking.inc + #################### # Create a table with a varchar key with the current format and validate # that it sorts correctly diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result new file mode 100644 index 00000000000..614737fcfbc --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly." +SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444; +ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result new file mode 100644 index 00000000000..614737fcfbc --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. 
It should fail because it is readonly." +SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444; +ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test new file mode 100644 index 00000000000..ee185aba660 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCKING +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test new file mode 100644 index 00000000000..ee185aba660 --- /dev/null +++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test @@ -0,0 +1,5 @@ +--source include/have_rocksdb.inc +--let $sys_var=ROCKSDB_USE_RANGE_LOCKING +--let $read_only=1 +--let $session=0 +--source ../include/rocksdb_sys_var.inc diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt index 135a6af62df..5c25f89b6e9 100644 --- a/storage/rocksdb/CMakeLists.txt +++ b/storage/rocksdb/CMakeLists.txt @@ -126,6 +126,7 @@ SET(ROCKSDB_SOURCES logger.h rdb_datadic.cc rdb_datadic.h rdb_iterator.cc rdb_iterator.h + rdb_locking_iter.cc rdb_locking_iter.h rdb_cf_options.cc rdb_cf_options.h rdb_cf_manager.cc rdb_cf_manager.h rdb_converter.cc rdb_converter.h diff --git a/storage/rocksdb/get_rocksdb_files.sh b/storage/rocksdb/get_rocksdb_files.sh index e0fa66b54fa..07fee252a1e 100755 --- a/storage/rocksdb/get_rocksdb_files.sh +++ b/storage/rocksdb/get_rocksdb_files.sh @@ -4,7 +4,7 @@ MKFILE=`mktemp` # include rocksdb make file relative to the path of this script echo "include rocksdb/src.mk all: - @echo \$(LIB_SOURCES)" > $MKFILE + @echo \$(LIB_SOURCES) \$(RANGE_TREE_SOURCES)" > $MKFILE for f in 
`make --makefile $MKFILE` do echo ../../rocksdb/$f diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index d97a7c237f3..b30f694f3d3 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -104,6 +104,9 @@ #include "./ObjectFactory.h" #endif +#include "./rdb_locking_iter.h" + + // Internal MySQL APIs not exposed in any header. extern "C" { /** @@ -634,6 +637,10 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var, static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var, void *var_ptr, const void *save); +static void rocksdb_set_max_lock_memory(THD *thd, + struct SYS_VAR *var, + void *var_ptr, const void *save); + static void rdb_set_collation_exception_list(const char *exception_list); static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var, void *var_ptr, @@ -786,6 +793,16 @@ static unsigned long long // NOLINT(runtime/int) static bool rocksdb_skip_locks_if_skip_unique_check = false; static bool rocksdb_alter_column_default_inplace = false; +// Range Locking: how much memory can be used for the lock data structure +// (which holds the locks acquired by all clients). 
+static ulonglong rocksdb_max_lock_memory; + +static bool rocksdb_use_range_locking = 0; +static bool rocksdb_use_range_lock_manager_as_point = 0; +std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr; + +std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr_used_as_point; + std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0); std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0); std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0); @@ -1484,6 +1501,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks, nullptr, rocksdb_set_max_latest_deadlocks, rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0); +static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, rocksdb_max_lock_memory, + PLUGIN_VAR_RQCMDARG, + "Range-locking mode: Maximum amount of memory " + "that locks from all transactions can use at a time", + nullptr, rocksdb_set_max_lock_memory, + /*initial*/1073741824, 0, UINT64_MAX, 0); + static MYSQL_SYSVAR_ENUM( info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG, "Filter level for info logs to be written mysqld error log. " @@ -2333,6 +2357,19 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan, rocksdb_update_table_stats_use_table_scan, rocksdb_table_stats_use_table_scan); +static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Use Range Locking", + nullptr, nullptr, + rocksdb_use_range_locking); + +static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point, + rocksdb_use_range_lock_manager_as_point, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Use Range Lock Manager as point", + nullptr, nullptr, + rocksdb_use_range_lock_manager_as_point); + static MYSQL_SYSVAR_BOOL( large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, "Support large index prefix length of 3072 bytes. 
If off, the maximum " @@ -2648,7 +2685,9 @@ static struct SYS_VAR *rocksdb_system_variables[] = { MYSQL_SYSVAR(manual_compaction_threads), MYSQL_SYSVAR(manual_compaction_bottommost_level), MYSQL_SYSVAR(rollback_on_timeout), - + MYSQL_SYSVAR(use_range_locking), + MYSQL_SYSVAR(use_range_lock_manager_as_point), + MYSQL_SYSVAR(max_lock_memory), MYSQL_SYSVAR(enable_insert_with_update_caching), MYSQL_SYSVAR(trace_block_cache_access), MYSQL_SYSVAR(trace_queries), @@ -2907,8 +2946,41 @@ class Rdb_transaction { virtual rocksdb::Status do_pop_savepoint() = 0; virtual void do_rollback_to_savepoint() = 0; + private: + /* + If true, the current statement should not use a snapshot for reading. + Note that in a multi-statement transaction, the snapshot may have been + allocated by another statement. + */ + bool m_stmt_ignores_snapshot = false; + + /* Snapshot-ignore mode will put away m_read_opts.snapshot here: */ + const rocksdb::Snapshot *m_saved_snapshot= nullptr; + public: + + void start_ignore_snapshot() { + // note: this may be called several times for the same statement + if (!m_stmt_ignores_snapshot) { + m_saved_snapshot = m_read_opts.snapshot; + m_read_opts.snapshot = nullptr; + m_stmt_ignores_snapshot= true; + if (!m_snapshot_timestamp) + rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); + } + } + + void end_ignore_snapshot_if_needed() { + if (m_stmt_ignores_snapshot) { + m_stmt_ignores_snapshot = false; + m_read_opts.snapshot = m_saved_snapshot; + m_saved_snapshot = nullptr; + } + } + bool in_snapshot_ignore_mode() const { return m_stmt_ignores_snapshot; } + rocksdb::ReadOptions m_read_opts; + const char *m_mysql_log_file_name; my_off_t m_mysql_log_offset; const char *m_mysql_gtid; @@ -3089,6 +3161,19 @@ class Rdb_transaction { virtual void release_lock(const Rdb_key_def &key_descr, const std::string &rowkey, bool force = false) = 0; + virtual + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Endpoint &start, + const 
rocksdb::Endpoint &end) = 0; + + rocksdb::Status lock_singlepoint_range(rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Slice &point) { + // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs. + // But here we are locking just one point so this is not necessary. + rocksdb::Endpoint endp(point, false); + return lock_range(cf, endp, endp); + } + virtual bool prepare() = 0; bool commit_or_rollback() { @@ -3147,10 +3232,17 @@ class Rdb_transaction { m_is_delayed_snapshot = false; } + void locking_iter_created() { + if (!m_snapshot_timestamp) + rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp); + } + virtual void acquire_snapshot(bool acquire_now) = 0; virtual void release_snapshot() = 0; - bool has_snapshot() const { return m_read_opts.snapshot != nullptr; } + bool has_snapshot() const { + return m_read_opts.snapshot != nullptr || m_saved_snapshot; + } private: // The Rdb_sst_info structures we are currently loading. In a partitioned @@ -3704,7 +3796,9 @@ class Rdb_transaction { virtual rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *column_family) = 0; + rocksdb::ColumnFamilyHandle *column_family, + bool is_rev_cf, + bool use_locking_iterator=false) = 0; virtual void multi_get(rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, @@ -3713,10 +3807,12 @@ class Rdb_transaction { const bool sorted_input) const = 0; rocksdb::Iterator *get_iterator( - rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter, + rocksdb::ColumnFamilyHandle *const column_family, bool is_rev_cf, + bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false, - bool create_snapshot = true) { + bool create_snapshot = true, + bool use_locking_iterator=false) { // Make sure we are not doing both read_current (which implies we don't // want a snapshot) and create_snapshot which 
makes sure we create // a snapshot @@ -3746,12 +3842,14 @@ class Rdb_transaction { if (read_current) { options.snapshot = nullptr; } - return get_iterator(options, column_family); + return get_iterator(options, column_family, is_rev_cf, + use_locking_iterator); } virtual bool is_tx_started() const = 0; virtual void start_tx() = 0; - virtual void start_stmt() = 0; + virtual void start_stmt(bool is_dml_statement) = 0; + virtual void start_autocommit_stmt(bool /*is_dml_statement*/){} virtual void set_name() = 0; protected: @@ -3949,6 +4047,13 @@ class Rdb_transaction_impl : public Rdb_transaction { virtual bool is_writebatch_trx() const override { return false; } + // Lock the range between two specified endpoints + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, + const rocksdb::Endpoint &start_endp, + const rocksdb::Endpoint &end_endp) override { + ++m_row_lock_count; + return m_rocksdb_tx->GetRangeLock(cf, start_endp, end_endp); + } private: void release_tx(void) { // We are done with the current active transaction object. 
Preserve it @@ -4055,7 +4160,7 @@ class Rdb_transaction_impl : public Rdb_transaction { } void acquire_snapshot(bool acquire_now) override { - if (m_read_opts.snapshot == nullptr) { + if (m_read_opts.snapshot == nullptr && !in_snapshot_ignore_mode()) { const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>( m_thd->get_explicit_snapshot()); if (thd_ss) { @@ -4211,9 +4316,17 @@ class Rdb_transaction_impl : public Rdb_transaction { rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *const column_family) override { + rocksdb::ColumnFamilyHandle *const column_family, + bool is_rev_cf, + bool use_locking_iterator) override { global_stats.queries[QUERIES_RANGE].inc(); - return m_rocksdb_tx->GetIterator(options, column_family); + if (use_locking_iterator) { + locking_iter_created(); + return GetLockingIterator(m_rocksdb_tx, options, column_family, + is_rev_cf, &m_row_lock_count); + } + else + return m_rocksdb_tx->GetIterator(options, column_family); } const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; } @@ -4287,17 +4400,35 @@ class Rdb_transaction_impl : public Rdb_transaction { /* Start a statement inside a multi-statement transaction. - @todo: are we sure this is called once (and not several times) per - statement start? + @note: If a statement uses N tables, this function will be called N times, + for each TABLE object that is used. For hooking to start of statement that is its own transaction, see ha_rocksdb::external_lock(). */ - void start_stmt() override { + void start_stmt(bool is_dml_statement) override { + + if (rocksdb_use_range_locking && is_dml_statement) { + /* + In Range Locking mode, RocksDB does not do "key tracking". + Use InnoDB-like concurrency mode: make the DML statements always read + the latest data (instead of using transaction's snapshot). 
+ This "downgrades" the transaction isolation to READ-COMMITTED on the + master, but in return the actions can be replayed on the slave. + */ + start_ignore_snapshot(); + } + // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation) acquire_snapshot(false); } + void start_autocommit_stmt(bool is_dml_statement) override { + if (rocksdb_use_range_locking && is_dml_statement) { + start_ignore_snapshot(); + } + } + /* This must be called when last statement is rolled back, but the transaction continues @@ -4426,6 +4557,12 @@ class Rdb_writebatch_impl : public Rdb_transaction { // Nothing to do here since we don't hold any row locks. } + rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const, + const rocksdb::Endpoint&, + const rocksdb::Endpoint&) override { + return rocksdb::Status::OK(); + } + void rollback() override { on_rollback(); m_write_count = 0; @@ -4525,7 +4662,9 @@ class Rdb_writebatch_impl : public Rdb_transaction { rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, - rocksdb::ColumnFamilyHandle *const /* column_family */) override { + rocksdb::ColumnFamilyHandle *const /* column_family */, + bool /*is_rev_cf*/, + bool /*use_locking_iterator*/) override { const auto it = rdb->NewIterator(options); return m_batch->NewIteratorWithBase(it); } @@ -4543,9 +4682,9 @@ class Rdb_writebatch_impl : public Rdb_transaction { set_initial_savepoint(); } + void start_stmt(bool /*is_dml_statement*/) override {} void set_name() override {} - void start_stmt() override {} void rollback_stmt() override { if (m_batch) rollback_to_stmt_savepoint(); @@ -4869,6 +5008,7 @@ static int rocksdb_prepare(handlerton *const hton MY_ATTRIBUTE((__unused__)), DEBUG_SYNC(thd, "rocksdb.prepared"); } else { tx->make_stmt_savepoint_permanent(); + tx->end_ignore_snapshot_if_needed(); } return HA_EXIT_SUCCESS; @@ -5096,6 +5236,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)), - For a COMMIT statement that finishes a 
multi-statement transaction - For a statement that has its own transaction */ + tx->end_ignore_snapshot_if_needed(); if (tx->commit()) { DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED); } @@ -5105,6 +5246,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)), */ tx->set_tx_failed(false); tx->make_stmt_savepoint_permanent(); + tx->end_ignore_snapshot_if_needed(); } if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { @@ -5142,6 +5284,7 @@ static int rocksdb_rollback(handlerton *const hton MY_ATTRIBUTE((__unused__)), - a statement inside a transaction is rolled back */ + tx->end_ignore_snapshot_if_needed(); tx->rollback_stmt(); tx->set_tx_failed(true); } @@ -5246,8 +5389,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { "=========================================\n"; } + template<class PathStruct> static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info( - const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) { + const PathStruct &txn, const GL_INDEX_ID &gl_index_id) { Rdb_deadlock_info::Rdb_dl_trx_info txn_data; txn_data.trx_id = txn.m_txn_id; @@ -5272,23 +5416,49 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { ? cfh->GetName() : "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id); - txn_data.waiting_key = - rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length()); + txn_data.waiting_key = format_wait_key(txn); txn_data.exclusive_lock = txn.m_exclusive; return txn_data; } + // Get the key to use to find the index number (and then, index name) + // Two functions with matching signatures so get_dl_path_trx_info() template + // can be used with both point and range locking. 
+ static const std::string& get_key_for_indexnr( + const rocksdb::DeadlockInfo& info) { + return info.m_waiting_key; + } + static const std::string& get_key_for_indexnr( + const rocksdb::RangeDeadlockInfo& info) { + // Range locks do not span across indexes, so take the left bound + return info.m_start.slice; + } + + // Print the locked key (or range) in hex + // Two functions with matching signatures so get_dl_path_trx_info() template + // can be used with both point and range locking. + static std::string format_wait_key(const rocksdb::DeadlockInfo& info) { + return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length()); + } + static std::string format_wait_key(const rocksdb::RangeDeadlockInfo& info) { + return rdb_hexdump_range(info.m_start, info.m_end); + } + + // Get deadlock path info. A templated function so one can use it with both + // point and range locking. + template<class PathStruct> static Rdb_deadlock_info get_dl_path_trx_info( - const rocksdb::DeadlockPath &path_entry) { + const PathStruct &path_entry) { Rdb_deadlock_info deadlock_info; for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) { const auto &txn = *it; + auto waiting_key = get_key_for_indexnr(txn); const GL_INDEX_ID gl_index_id = { txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>( - txn.m_waiting_key.c_str()))}; + waiting_key.c_str()))}; deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id)); } DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty()); @@ -5313,7 +5483,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { /* Calculate the duration the snapshot has existed */ int64_t snapshot_timestamp = tx->m_snapshot_timestamp; - if (snapshot_timestamp != 0) { + if (snapshot_timestamp != 0 && tx->has_snapshot()) { int64_t curr_time; rdb->GetEnv()->GetCurrentTime(&curr_time); @@ -5332,8 +5502,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { } } - void populate_deadlock_buffer() { - auto 
dlock_buffer = rdb->GetDeadlockInfoBuffer(); + template<class PathStruct> + void populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) { m_data += "----------LATEST DETECTED DEADLOCKS----------\n"; for (const auto &path_entry : dlock_buffer) { @@ -5373,12 +5543,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker { } } + void populate_deadlock_buffer() { + if (range_lock_mgr) { + auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer(); + populate_deadlock_buffer_tmpl(dlock_buffer); + } else { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + populate_deadlock_buffer_tmpl(dlock_buffer); + } + } + std::vector<Rdb_deadlock_info> get_deadlock_info() { std::vector<Rdb_deadlock_info> deadlock_info; - auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); - for (const auto &path_entry : dlock_buffer) { - if (!path_entry.limit_exceeded) { - deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + + if (range_lock_mgr) { + auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer(); + for (const auto &path_entry : dlock_buffer) { + if (!path_entry.limit_exceeded) { + deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + } + } + } else { + auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); + for (const auto &path_entry : dlock_buffer) { + if (!path_entry.limit_exceeded) { + deadlock_info.push_back(get_dl_path_trx_info(path_entry)); + } } } return deadlock_info; @@ -5794,9 +5984,13 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */, return ret_val; } +/* + @param is_dml_stmt If true, we are in a DML statement +*/ static inline void rocksdb_register_tx( handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd, - Rdb_transaction *const tx) { + Rdb_transaction *const tx, + bool is_dml_stmt) { DBUG_ASSERT(tx != nullptr); trans_register_ha(thd, false, rocksdb_hton, NULL); @@ -5811,8 +6005,10 @@ static inline void rocksdb_register_tx( } } if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) 
{ - tx->start_stmt(); + tx->start_stmt(is_dml_stmt); trans_register_ha(thd, true, rocksdb_hton, NULL); + } else { + tx->start_autocommit_stmt(is_dml_stmt); } } @@ -5897,7 +6093,7 @@ static int rocksdb_start_tx_and_assign_read_view( DBUG_ASSERT(!tx->has_snapshot()); tx->set_tx_read_only(true); - rocksdb_register_tx(hton, thd, tx); + rocksdb_register_tx(hton, thd, tx, false); tx->acquire_snapshot(true); return HA_EXIT_SUCCESS; @@ -5954,7 +6150,7 @@ static int rocksdb_start_tx_with_shared_read_view( DBUG_ASSERT(!tx->has_snapshot()); tx->set_tx_read_only(true); - rocksdb_register_tx(hton, thd, tx); + rocksdb_register_tx(hton, thd, tx, false); tx->acquire_snapshot(true); // case: an explicit snapshot was not assigned to this transaction @@ -6633,6 +6829,25 @@ static int rocksdb_init_internal(void *const p) { tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>(); tx_db_options.write_policy = static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy); + + if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) { + //rdb_log_status_error( + // status, "Can't have both range_locking and range_lock_manager_as_point"); + //DBUG_RETURN(HA_EXIT_FAILURE); + rocksdb_use_range_lock_manager_as_point= 0; + } + + + if (rocksdb_use_range_locking) { + range_lock_mgr.reset( + rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory)); + tx_db_options.lock_mgr_handle = range_lock_mgr; + } + if (rocksdb_use_range_lock_manager_as_point) { + range_lock_mgr_used_as_point.reset( + rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory)); + tx_db_options.lock_mgr_handle = range_lock_mgr_used_as_point; + } status = check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); @@ -6667,6 +6882,15 @@ static int rocksdb_init_internal(void *const p) { DBUG_RETURN(HA_EXIT_FAILURE); } + if (range_lock_mgr) + { + range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory); + sql_print_information("RocksDB: USING NEW RANGE LOCKING"); + 
sql_print_information("RocksDB: Max lock memory=%llu", rocksdb_max_lock_memory); + } + else + sql_print_information("RocksDB: USING POINT LOCKING"); + cf_manager.init(std::move(cf_options_map), &cf_handles); // NO_LINT_DEBUG @@ -9208,6 +9432,15 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key, } Rdb_transaction *const tx = get_or_create_tx(table->in_use); + + bool use_locking_iter= false; + + if ((rc = set_range_lock(tx, kd, find_flag, slice, end_range, + &use_locking_iter))) + DBUG_RETURN(rc); + if (use_locking_iter) + m_iterator->set_use_locking(); + const bool is_new_snapshot = !tx->has_snapshot(); // Loop as long as we get a deadlock error AND we end up creating the @@ -9256,6 +9489,234 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key, DBUG_RETURN(rc); } + +/* + @brief + Compute the range lock endpoints and set the range lock, if necessary + + @param use_locking_iter OUT If true, locks are not set and LockingIterator + should be used instead + + @detail + If the scanned range doesn't have the endpoint we're scanning towards, + don't set the lock, it will be too coarse. Indicate that LockingIterator + should be used, instead. + + @return + 0 Ok + Other Error acquiring the lock (wait timeout, deadlock, etc) +*/ + +int ha_rocksdb::set_range_lock(Rdb_transaction *tx, + const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const rocksdb::Slice &slice_arg, + const key_range *const end_key, + bool *use_locking_iterator) +{ + rocksdb::Slice end_slice; + uchar end_slice_buf[MAX_KEY_LENGTH]; + bool start_has_inf_suffix = false, end_has_inf_suffix = false; + rocksdb::Slice slice(slice_arg); + *use_locking_iterator= false; + + if (m_lock_rows == RDB_LOCK_NONE || !rocksdb_use_range_locking) { + return 0; + } + bool big_range= false; + + /* + The 'slice' has the left endpoint of the range to lock. + Figure out the right endpoint. 
+ */ + + if (find_flag == HA_READ_KEY_EXACT) { + if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) { + // This is a full table/index scan + start_has_inf_suffix= false; + big_range = true; + } else { + /* + This is "key_part= const" interval. We need to lock this range: + (lookup_value, -inf) < key < (lookup_value, +inf) + */ + start_has_inf_suffix= false; + end_has_inf_suffix= true; + end_slice= slice; + } + } + else if (find_flag == HA_READ_PREFIX_LAST) { + if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) { + /* Reverse-ordered full index scan */ + start_has_inf_suffix= true; + big_range = true; + } else { + /* + We get here for queries like: + + select * from t1 where pk1=const order by pk1 desc for update + + assuming this uses an index on (pk1, ...) + We get end_key=nullptr. + + The range to lock is the same as with HA_READ_KEY_EXACT above. + */ + end_slice= slice; + start_has_inf_suffix= false; + end_has_inf_suffix= true; + } + } + else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) { + /* + We get here for queries like: + + select * from t1 where pk1=const1 and pk2 between const2 and const3 + order by pk1 desc + for update + + assuming this uses an index on (pk1, pk2). + The slice has the right endpoint: {const1, const3} + the end_key has the left endpoint: {const1, const2}. 
+ */ + + // Move the right endpoint from slice to end_slice + end_slice= slice; + + // Pack the left endpoint and make "slice" point to it + uchar pack_buffer[MAX_KEY_LENGTH]; + uint end_slice_size= + kd.pack_index_tuple(table, pack_buffer, end_slice_buf, + end_key->key, end_key->keypart_map); + slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf), + end_slice_size); + start_has_inf_suffix= false; + end_has_inf_suffix= true; + } + else if (find_flag == HA_READ_BEFORE_KEY) { + /* + We get here for queries like + select * from t1 + where pk <1007 order by pk desc limit 2 for update + select * from t1 + where pk >=800 and pk <1007 order by pk desc limit 2 for update + */ + + // Move the right endpoint from slice to end_slice + end_slice= slice; + + if (end_key) { + uchar pack_buffer[MAX_KEY_LENGTH]; + uint end_slice_size= + kd.pack_index_tuple(table, pack_buffer, end_slice_buf, + end_key->key, end_key->keypart_map); + + slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf), + end_slice_size); + + end_has_inf_suffix= false; + big_range= false; + } else { + uint end_slice_size; + kd.get_infimum_key(end_slice_buf, &end_slice_size); + slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size); + + big_range= true; + } + } + else if (end_key) { + // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY + if (find_flag == HA_READ_KEY_OR_NEXT) + start_has_inf_suffix= false; + else if (find_flag == HA_READ_AFTER_KEY) + start_has_inf_suffix= true; + else + DBUG_ASSERT(0); + + // Known end range bounds: HA_READ_AFTER_KEY, HA_READ_BEFORE_KEY + if (end_key->flag == HA_READ_AFTER_KEY) { + // this is "key_part <= const". + end_has_inf_suffix= true; + } else if (end_key->flag == HA_READ_BEFORE_KEY) { + // this is "key_part < const", non-inclusive. 
+ end_has_inf_suffix= false; + } else + DBUG_ASSERT(0); + + uchar pack_buffer[MAX_KEY_LENGTH]; + uint end_slice_size= kd.pack_index_tuple(table, pack_buffer, end_slice_buf, + end_key->key, + end_key->keypart_map); + + end_slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf), + end_slice_size); + } + else + { + big_range= true; +#if 0 + // The below is code to handle this without LockingIterator: + // No end key + // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY + if (find_flag == HA_READ_KEY_OR_NEXT) + start_has_inf_suffix= false; + else if (find_flag == HA_READ_AFTER_KEY) + start_has_inf_suffix= true; + else + DBUG_ASSERT(0); + + uint end_slice_size; + kd.get_infimum_key(end_slice_buf, &end_slice_size); + end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size); + end_has_inf_suffix= true; +#endif + } + + if (big_range) + { + *use_locking_iterator= true; + return 0; + } + + rocksdb::Endpoint start_endp; + rocksdb::Endpoint end_endp; + + if (kd.m_is_reverse_cf) { + // Flip the endpoints + start_endp =rocksdb::Endpoint(end_slice, !end_has_inf_suffix); + end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix); + } else { + start_endp= rocksdb::Endpoint(slice, start_has_inf_suffix); + end_endp= rocksdb::Endpoint(end_slice, end_has_inf_suffix); + } + + /* + RocksDB's iterator is reading the snapshot of the data that was taken at + the time the iterator was created. + + After we've got a lock on the range, we'll need to refresh the iterator + to read the latest contents. (If we use the iterator created before the + lock_range() call, we may miss the changes that were made/committed after + the iterator was created but before the lock_range() call was made). + + RocksDB has Iterator::Refresh() method, but alas, it is not implemented for + the iterator returned by Transaction object (Transaction object returns + BaseDeltaIterator which allows one to see the transactions's own changes). 
+ + Our solution to this is to release the iterator and create the new one. + We release it here, it will be created as soon as there's a need to read + records. + */ + //release_scan_iterator(); + m_iterator->reset(); + + auto s= tx->lock_range(kd.get_cf(), start_endp, end_endp); + if (!s.ok()) { + return (tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + return 0; +} + /* See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL index navigation commands are converted into RocksDB lookup commands. @@ -9770,7 +10231,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf, } } - if (rc) { + if (rc != HA_EXIT_SUCCESS) { break; } @@ -9781,7 +10242,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf, table->m_status = 0; rc = 0; } else if (active_index == table->s->primary_key) { - if (m_lock_rows != RDB_LOCK_NONE) { + if (m_lock_rows != RDB_LOCK_NONE && !rocksdb_use_range_locking) { DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete"); /* We need to put a lock and re-read */ bool skip_row = false; @@ -10850,6 +11311,15 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg, old_key_slice = rocksdb::Slice( reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size); + /* Range locking: lock the index tuple being deleted */ + if (rocksdb_use_range_locking) { + auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice); + if (!s.ok()) { + return (row_info.tx->set_status_error(table->in_use, s, kd, + m_tbl_def, m_table_handler)); + } + } + row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), old_key_slice); @@ -10902,6 +11372,14 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg, if (bulk_load_sk && row_info.old_data == nullptr) { rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true); } else { + /* Range locking: lock the index tuple being inserted */ + if (rocksdb_use_range_locking) { + auto s= 
row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice); + if (!s.ok()) { + return (row_info.tx->set_status_error(table->in_use, s, kd, + m_tbl_def, m_table_handler)); + } + } row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice, new_value_slice); } @@ -11327,6 +11805,19 @@ int ha_rocksdb::delete_row(const uchar *const buf) { nullptr, false, hidden_pk_id); rocksdb::Slice secondary_key_slice( reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size); + + /* + For point locking, Deleting on secondary key doesn't need any locks. + Range locking must get a lock. + */ + if (rocksdb_use_range_locking) { + auto s= tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice); + if (!s.ok()) { + DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def, + m_table_handler)); + } + } + tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), secondary_key_slice); bytes_written += secondary_key_slice.size(); @@ -11900,7 +12391,7 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) { } } tx->m_n_mysql_tables_in_use++; - rocksdb_register_tx(rocksdb_hton, thd, tx); + rocksdb_register_tx(rocksdb_hton, thd, tx, (lock_type == F_WRLCK)); tx->io_perf_start(&m_io_perf); } @@ -11928,7 +12419,7 @@ int ha_rocksdb::start_stmt(THD *const thd, Rdb_transaction *const tx = get_or_create_tx(thd); read_thd_vars(thd); - rocksdb_register_tx(ht, thd, tx); + rocksdb_register_tx(ht, thd, tx, (lock_type == F_WRLCK)); tx->io_perf_start(&m_io_perf); DBUG_RETURN(HA_EXIT_SUCCESS); @@ -14324,6 +14815,36 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)), return 0; } +// +// Lock Tree Status variables +// +static longlong rocksdb_locktree_escalation_count=1234; +static longlong rocksdb_locktree_current_lock_memory=0; + +static SHOW_VAR rocksdb_locktree_status_variables[] = { + DEF_STATUS_VAR_FUNC("escalation_count", + &rocksdb_locktree_escalation_count, SHOW_LONGLONG), + DEF_STATUS_VAR_FUNC("current_lock_memory", + 
&rocksdb_locktree_current_lock_memory, SHOW_LONGLONG), + // end of the array marker + {NullS, NullS, SHOW_LONG}}; + +static SHOW_VAR rocksdb_empty_status_variables[] = { + {NullS, NullS, SHOW_LONG}}; + +static void show_rocksdb_locktree_vars(THD*, SHOW_VAR *var, char*) { + var->type = SHOW_ARRAY; + if (range_lock_mgr) + { + auto status = range_lock_mgr->GetStatus(); + rocksdb_locktree_escalation_count = status.escalation_count; + rocksdb_locktree_current_lock_memory = status.current_lock_memory; + var->value = reinterpret_cast<char *>(&rocksdb_locktree_status_variables); + } + else + var->value = reinterpret_cast<char *>(&rocksdb_empty_status_variables); +} + static SHOW_VAR rocksdb_status_vars[] = { DEF_STATUS_VAR(block_cache_miss), DEF_STATUS_VAR(block_cache_hit), @@ -14449,6 +14970,8 @@ static SHOW_VAR rocksdb_status_vars[] = { SHOW_SCOPE_GLOBAL}, {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars), SHOW_FUNC, SHOW_SCOPE_GLOBAL}, + {"rocksdb_locktree", reinterpret_cast<char *>(show_rocksdb_locktree_vars), + SHOW_FUNC, SHOW_SCOPE_GLOBAL}, {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}}; /* @@ -15326,6 +15849,23 @@ void rocksdb_set_delayed_write_rate(THD *thd MY_ATTRIBUTE((unused)), RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } +void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR*, + void* /*var_ptr*/, const void *save) { + const uint64_t new_val = *static_cast<const uint64_t *>(save); + if (rocksdb_max_lock_memory != new_val) { + if (range_lock_mgr->SetMaxLockMemory(new_val)) { + /* NO_LINT_DEBUG */ + sql_print_warning("MyRocks: failed to set max_lock_memory"); + push_warning_printf(thd, Sql_condition::SL_WARNING, + ER_ERROR_WHEN_EXECUTING_COMMAND, + "Cannot set max_lock_memory to size below currently used"); + } else { + // Succeeded + rocksdb_max_lock_memory = new_val; + } + } +} + void rocksdb_set_max_latest_deadlocks( THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)), void *var_ptr MY_ATTRIBUTE((unused)), const 
void *save) { @@ -15333,7 +15873,13 @@ void rocksdb_set_max_latest_deadlocks( const uint32_t new_val = *static_cast<const uint32_t *>(save); if (rocksdb_max_latest_deadlocks != new_val) { rocksdb_max_latest_deadlocks = new_val; - rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); + if (range_lock_mgr) { + auto n= rocksdb_max_latest_deadlocks; + range_lock_mgr->SetRangeDeadlockInfoBufferSize(n); + } + else + rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks); + } RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); } @@ -16046,11 +16592,22 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) { } rocksdb::Iterator *rdb_tx_get_iterator( - THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter, + Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, + bool is_rev_cf, + bool skip_bloom_filter, const rocksdb::Slice &lower_bound_slice, + const rocksdb::Slice &upper_bound_slice, bool read_current, + bool create_snapshot) { + return tx->get_iterator(column_family, is_rev_cf, skip_bloom_filter, lower_bound_slice, + upper_bound_slice, read_current, create_snapshot); +} + + +rocksdb::Iterator *rdb_tx_get_iterator( + THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool is_rev_cf, bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, const rocksdb::Snapshot **snapshot, bool read_current, - bool create_snapshot) { + bool create_snapshot, bool use_locking_iter) { if (commit_in_the_middle(thd)) { DBUG_ASSERT(snapshot && *snapshot == nullptr); if (snapshot) { @@ -16065,8 +16622,8 @@ rocksdb::Iterator *rdb_tx_get_iterator( } } else { Rdb_transaction *tx = get_tx_from_thd(thd); - return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound, - eq_cond_upper_bound, read_current, create_snapshot); + return tx->get_iterator(cf, is_rev_cf, skip_bloom_filter, eq_cond_lower_bound, + eq_cond_upper_bound, read_current, create_snapshot, use_locking_iter); } } diff 
--git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index b5d35b448c1..f4aa151ea26 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -313,6 +313,13 @@ class ha_rocksdb : public my_core::handler { const Rdb_key_def &kd, const rocksdb::Slice &key) const; + int set_range_lock(Rdb_transaction *tx, + const Rdb_key_def &kd, + const enum ha_rkey_function &find_flag, + const rocksdb::Slice &slice, + const key_range *const end_key, + bool *use_locking_iterator); + int get_row_by_rowid(uchar *const buf, const char *const rowid, const uint rowid_size, bool *skip_row = nullptr, const bool skip_lookup = false, @@ -958,6 +965,8 @@ class ha_rocksdb : public my_core::handler { /* Need to build decoder on next read operation */ bool m_need_build_decoder; + + int iter_status_to_retval(rocksdb::Iterator *it, const Rdb_key_def &kd, int not_found_code); }; /* @@ -1095,11 +1104,14 @@ Rdb_transaction *get_tx_from_thd(THD *const thd); const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx); rocksdb::Iterator *rdb_tx_get_iterator( - THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter, + THD *thd, rocksdb::ColumnFamilyHandle *const cf, + bool is_rev_cf, + bool skip_bloom_filter, const rocksdb::Slice &eq_cond_lower_bound, const rocksdb::Slice &eq_cond_upper_bound, const rocksdb::Snapshot **snapshot, bool read_current = false, - bool create_snapshot = true); + bool create_snapshot = true, + bool use_locking_iter= false); rocksdb::Status rdb_tx_get(Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family, @@ -1172,4 +1184,6 @@ extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized; extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted; extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized; +extern std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr; + } // namespace myrocks diff --git a/storage/rocksdb/nosql_access.cc 
b/storage/rocksdb/nosql_access.cc index eacae4b9315..0ad2349d8fb 100644 --- a/storage/rocksdb/nosql_access.cc +++ b/storage/rocksdb/nosql_access.cc @@ -676,10 +676,11 @@ class select_exec { } rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle *cf, + bool is_rev_cf, bool use_bloom, const rocksdb::Slice &lower_bound, const rocksdb::Slice &upper_bound) { - return rdb_tx_get_iterator(m_thd, cf, !use_bloom, lower_bound, + return rdb_tx_get_iterator(m_thd, cf, is_rev_cf, !use_bloom, lower_bound, //psergey-mergey-todo: or m_tx ? upper_bound, nullptr); } @@ -1523,7 +1524,8 @@ bool INLINE_ATTR select_exec::setup_iterator(txn_wrapper *txn, m_thd, *m_key_def, eq_slice, bound_len, m_lower_bound_buf.data(), m_upper_bound_buf.data(), &m_lower_bound_slice, &m_upper_bound_slice); rocksdb::Iterator *it = txn->get_iterator( - m_key_def->get_cf(), use_bloom, m_lower_bound_slice, m_upper_bound_slice); + m_key_def->get_cf(), m_key_def->m_is_reverse_cf, use_bloom, + m_lower_bound_slice, m_upper_bound_slice); if (it == nullptr) { return true; } diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc index cd015414fb1..45fb886f8ee 100644 --- a/storage/rocksdb/rdb_i_s.cc +++ b/storage/rocksdb/rdb_i_s.cc @@ -1771,31 +1771,63 @@ static int rdb_i_s_lock_info_fill_table( } /* cf id -> rocksdb::KeyLockInfo */ - std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info = - rdb->GetLockStatusData(); - - for (const auto &lock : lock_info) { - const uint32_t cf_id = lock.first; - const auto &key_lock_info = lock.second; - const auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(), - key_lock_info.key.length(), FN_REFLEN); - - for (const auto &id : key_lock_info.ids) { - tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, - true); - tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); - - tables->table->field[RDB_LOCKS_FIELD::KEY]->store( - key_hexstr.c_str(), key_hexstr.size(), system_charset_info); - 
tables->table->field[RDB_LOCKS_FIELD::MODE]->store( - key_lock_info.exclusive ? "X" : "S", 1, system_charset_info); - - /* Tell MySQL about this row in the virtual table */ - ret = static_cast<int>( - my_core::schema_table_store_record(thd, tables->table)); - - if (ret != 0) { - break; + if (range_lock_mgr) { + // Use Range Lock Manager's interface for obtaining more specific + // information about the acquired locks + auto lock_info = range_lock_mgr->GetRangeLockStatusData(); + + for (const auto &lock : lock_info) { + const uint32_t cf_id = lock.first; + const auto &range_lock_info = lock.second; + + std::string key_hexstr = rdb_hexdump_range(range_lock_info.start, + range_lock_info.end); + + for (const auto &id : range_lock_info.ids) { + tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, + true); + tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); + + tables->table->field[RDB_LOCKS_FIELD::KEY]->store( + key_hexstr.c_str(), key_hexstr.size(), system_charset_info); + tables->table->field[RDB_LOCKS_FIELD::MODE]->store( + range_lock_info.exclusive ? 
"X" : "S", 1, system_charset_info); + + /* Tell MySQL about this row in the virtual table */ + ret = static_cast<int>( + my_core::schema_table_store_record(thd, tables->table)); + + if (ret != 0) { + break; + } + } + } + } else { + std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info = + rdb->GetLockStatusData(); + + for (const auto &lock : lock_info) { + const uint32_t cf_id = lock.first; + const auto &key_lock_info = lock.second; + auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(), + key_lock_info.key.length(), FN_REFLEN); + for (const auto &id : key_lock_info.ids) { + tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, + true); + tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true); + + tables->table->field[RDB_LOCKS_FIELD::KEY]->store( + key_hexstr.c_str(), key_hexstr.size(), system_charset_info); + tables->table->field[RDB_LOCKS_FIELD::MODE]->store( + key_lock_info.exclusive ? "X" : "S", 1, system_charset_info); + + /* Tell MySQL about this row in the virtual table */ + ret = static_cast<int>( + my_core::schema_table_store_record(thd, tables->table)); + + if (ret != 0) { + break; + } } } } diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc index b66d539e2af..761bccfa966 100644 --- a/storage/rocksdb/rdb_iterator.cc +++ b/storage/rocksdb/rdb_iterator.cc @@ -36,6 +36,7 @@ Rdb_iterator_base::Rdb_iterator_base(THD *thd, m_tbl_def(tbl_def), m_thd(thd), m_scan_it(nullptr), + m_use_locking_iter(false), m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr), m_scan_it_lower_bound(nullptr), @@ -83,7 +84,7 @@ int Rdb_iterator_base::read_before_key(const bool full_key_match, return HA_EXIT_SUCCESS; } - return HA_ERR_END_OF_FILE; + return iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE); } int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) { @@ -98,12 +99,15 @@ int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) { */ 
rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice); - return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE; + return is_valid_iterator(m_scan_it) ? + HA_EXIT_SUCCESS : + iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE); } void Rdb_iterator_base::release_scan_iterator() { delete m_scan_it; m_scan_it = nullptr; + m_use_locking_iter = false; if (m_scan_it_snapshot) { auto rdb = rdb_get_rocksdb_db(); @@ -137,6 +141,10 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice, skip_bloom = false; } + // Save the value of m_use_locking_iter because release_scan_iterator() + // will set it to false. + bool use_locking_iter= m_use_locking_iter; + /* In some cases, setup_scan_iterator() is called multiple times from the same query but bloom filter can not always be used. @@ -164,9 +172,10 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice, */ if (!m_scan_it) { m_scan_it = rdb_tx_get_iterator( - m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice, + m_thd, m_kd->get_cf(), m_kd->m_is_reverse_cf, skip_bloom, + m_scan_it_lower_bound_slice, m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current, - !read_current); + !read_current, use_locking_iter); m_scan_it_skips_bloom = skip_bloom; } } @@ -208,6 +217,20 @@ int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag, return Rdb_key_def::INDEX_NUMBER_SIZE; } +int Rdb_iterator_base::iter_status_to_retval(rocksdb::Iterator *it, + const std::shared_ptr<Rdb_key_def> kd, + int not_found_code) { + if (it->Valid()) + return HA_EXIT_SUCCESS; + + rocksdb::Status s= it->status(); + if (s.ok() || s.IsNotFound()) + return not_found_code; + + Rdb_transaction *tx = get_tx_from_thd(m_thd); + return rdb_tx_set_status_error(tx, s, *kd, m_tbl_def); +} + int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) { int rc = 0; const auto &kd = *m_kd; @@ -237,7 +260,7 @@ int 
Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) { } if (!is_valid_iterator(m_scan_it)) { - rc = HA_ERR_END_OF_FILE; + rc = iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE); break; } diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h index 13f53b62a6f..34d2b53f35d 100644 --- a/storage/rocksdb/rdb_iterator.h +++ b/storage/rocksdb/rdb_iterator.h @@ -53,6 +53,8 @@ class Rdb_iterator { virtual rocksdb::Slice key() = 0; virtual rocksdb::Slice value() = 0; virtual void reset() = 0; + + virtual void set_use_locking()=0; }; class Rdb_iterator_base : public Rdb_iterator { @@ -93,6 +95,7 @@ class Rdb_iterator_base : public Rdb_iterator { void reset() override { release_scan_iterator(); } + void set_use_locking() override { m_use_locking_iter = true; } protected: friend class Rdb_iterator; const std::shared_ptr<Rdb_key_def> m_kd; @@ -107,6 +110,8 @@ class Rdb_iterator_base : public Rdb_iterator { /* Iterator used for range scans and for full table/index scans */ rocksdb::Iterator *m_scan_it; + bool m_use_locking_iter; + /* Whether m_scan_it was created with skip_bloom=true */ bool m_scan_it_skips_bloom; @@ -120,6 +125,10 @@ class Rdb_iterator_base : public Rdb_iterator { uchar *m_prefix_buf; rocksdb::Slice m_prefix_tuple; + + int iter_status_to_retval(rocksdb::Iterator *it, + const std::shared_ptr<Rdb_key_def> kd, + int not_found_code); }; class Rdb_iterator_partial : public Rdb_iterator_base { diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc new file mode 100644 index 00000000000..739f383a816 --- /dev/null +++ b/storage/rocksdb/rdb_locking_iter.cc @@ -0,0 +1,108 @@ + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#define MYSQL_SERVER 1 + +/* This C++ file's header file */ +#include "./rdb_locking_iter.h" + +namespace myrocks { + +rocksdb::Iterator* GetLockingIterator( + rocksdb::Transaction *trx, + const rocksdb::ReadOptions& 
read_options, + rocksdb::ColumnFamilyHandle* column_family, + bool is_rev_cf, + ulonglong *counter) { + return new LockingIterator(trx, column_family, is_rev_cf, read_options, + counter); +} + +/* + @brief + Seek to the first key K that is equal or greater than target, + locking the range [target; K]. +*/ + +void LockingIterator::Seek(const rocksdb::Slice& target) { + iter_ = txn_->GetIterator(read_opts_, cfh_); + iter_->Seek(target); + ScanForward(target, false); +} + +void LockingIterator::SeekForPrev(const rocksdb::Slice& target) { + iter_ = txn_->GetIterator(read_opts_, cfh_); + iter_->SeekForPrev(target); + ScanBackward(target, false); +} + +/* + @brief + Move the iterator to the next key, locking the range between the current + and the next key. + + @detail + Implementation is similar to Seek(next_key). Since we don't know what the + next_key is, we reach it by calling { Seek(current_key); Next(); } +*/ +void LockingIterator::Next() { + DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next"); + assert(Valid()); + // Save the current key value. We need it as the left endpoint + // of the range lock we're going to acquire + std::string current_key = iter_->key().ToString(); + + iter_->Next(); + ScanForward(rocksdb::Slice(current_key), true); +} + +/* + @brief + Move the iterator to the previous key, locking the range between the current + and the previous key. +*/ + +void LockingIterator::Prev() { + assert(Valid()); + + std::string current_key = iter_->key().ToString(); + iter_->Prev(); + ScanBackward(rocksdb::Slice(current_key), true); +} + + +/* + @detail + Ideally, this function should + - find the first key $first_key + - lock the range [-inf; $first_key] + - return, the iterator is positioned on $first_key + + The problem here is that we cannot have "-infinity" bound. 
+ + Note: we don't have a practical use for this function - MyRocks always + searches within one index_name.table_name, which means we are only looking + at the keys with index_number as the prefix. +*/ + +void LockingIterator::SeekToFirst() { + DBUG_ASSERT(0); + status_ = rocksdb::Status::NotSupported("Not implemented"); + valid_ = false; +} + +/* + @detail + See SeekToFirst. +*/ + +void LockingIterator::SeekToLast() { + DBUG_ASSERT(0); + status_ = rocksdb::Status::NotSupported("Not implemented"); + valid_ = false; +} + +} // namespace myrocks + diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h new file mode 100644 index 00000000000..5a9ed6c275d --- /dev/null +++ b/storage/rocksdb/rdb_locking_iter.h @@ -0,0 +1,190 @@ + +/* MySQL header files */ +#include "sql/handler.h" /* handler */ +#include "sql/debug_sync.h" +#include "./rdb_threads.h" /* for thd_get_current_thd */ + +/* MyRocks header files */ +#include "./ha_rocksdb.h" + +namespace myrocks { + +////////////////////////////////////////////////////////////////////////////// +// Locking iterator +////////////////////////////////////////////////////////////////////////////// + +// +// LockingIterator is an iterator that locks the rows before returning, as well +// as scanned gaps between the rows. +// +// Example: +// lock_iter= trx->GetLockingIterator(); +// lock_iter->Seek('abc'); +// lock_iter->Valid()==true && lock_iter->key() == 'bcd'; +// +// After the above, the returned record 'bcd' is locked by transaction trx. +// Also, the range between ['abc'..'bcd'] is empty and locked by trx. +// +// lock_iter->Next(); +// lock_iter->Valid()==true && lock_iter->key() == 'efg' +// +// Now, the range ['bcd'.. 'efg'] (bounds inclusive) is also locked, and there are no +// records between 'bcd' and 'efg'.
+// +class LockingIterator : public rocksdb::Iterator { + + rocksdb::Transaction *txn_; + rocksdb::ColumnFamilyHandle* cfh_; + bool m_is_rev_cf; + rocksdb::ReadOptions read_opts_; + rocksdb::Iterator *iter_; + rocksdb::Status status_; + + // note: an iterator that has reached EOF has status()==OK && valid_==false + bool valid_; + + ulonglong *lock_count_; + public: + LockingIterator(rocksdb::Transaction *txn, + rocksdb::ColumnFamilyHandle *cfh, + bool is_rev_cf, + const rocksdb::ReadOptions& opts, + ulonglong *lock_count=nullptr + ) : + txn_(txn), cfh_(cfh), m_is_rev_cf(is_rev_cf), read_opts_(opts), iter_(nullptr), + status_(rocksdb::Status::InvalidArgument()), valid_(false), + lock_count_(lock_count) {} + + ~LockingIterator() { + delete iter_; + } + + virtual bool Valid() const override { return valid_; } + + // Note: MyRocks doesn't ever call these: + virtual void SeekToFirst() override; + virtual void SeekToLast() override; + + virtual void Seek(const rocksdb::Slice& target) override; + + // Position at the last key in the source that at or before target. + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. 
+ virtual void SeekForPrev(const rocksdb::Slice& target) override; + + virtual void Next() override; + virtual void Prev() override; + + virtual rocksdb::Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + virtual rocksdb::Slice value() const override { + assert(Valid()); + return iter_->value(); + } + + virtual rocksdb::Status status() const override { + return status_; + } + + private: + template <bool forward> void Scan(const rocksdb::Slice& target, + bool call_next) { + if (!iter_->Valid()) { + status_ = iter_->status(); + valid_ = false; + return; + } + + while (1) { + /* + note: the underlying iterator checks iterator bounds, so we don't need + to check them here + */ + DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.locking_iter_scan"); + auto end_key = iter_->key(); + bool endp_arg= m_is_rev_cf; + if (forward) { + status_ = txn_->GetRangeLock(cfh_, + rocksdb::Endpoint(target, endp_arg), + rocksdb::Endpoint(end_key, endp_arg)); + } else { + status_ = txn_->GetRangeLock(cfh_, + rocksdb::Endpoint(end_key, endp_arg), + rocksdb::Endpoint(target, endp_arg)); + } + + if (!status_.ok()) { + // Failed to get a lock (most likely lock wait timeout) + valid_ = false; + return; + } + if (lock_count_) (*lock_count_)++; + std::string end_key_copy= end_key.ToString(); + + //Ok, now we have a lock which is inhibiting modifications in the range + // Somebody might have done external modifications, though: + // - removed the key we've found + // - added a key before that key. + + // First, refresh the iterator: + delete iter_; + iter_ = txn_->GetIterator(read_opts_, cfh_); + + // Then, try seeking to the same row + if (forward) + iter_->Seek(target); + else + iter_->SeekForPrev(target); + + auto cmp= cfh_->GetComparator(); + + if (call_next && iter_->Valid() && !cmp->Compare(iter_->key(), target)) { + if (forward) + iter_->Next(); + else + iter_->Prev(); + } + + if (iter_->Valid()) { + int inv = forward ? 
1 : -1; + if (cmp->Compare(iter_->key(), rocksdb::Slice(end_key_copy))*inv <= 0) { + // Ok, the found key is within the range. + status_ = rocksdb::Status::OK(); + valid_= true; + break; + } else { + // We've got a key but it is outside the range we've locked. + // Re-try the lock-and-read step. + continue; + } + } else { + // There's no row (within the iterator bounds perhaps). Exit now. + // (we might already have locked a range in this function but there's + // nothing we can do about it) + valid_ = false; + status_ = iter_->status(); + break; + } + } + } + + inline void ScanForward(const rocksdb::Slice& target, bool call_next) { + Scan<true>(target, call_next); + } + + inline void ScanBackward(const rocksdb::Slice& target, bool call_next) { + Scan<false>(target, call_next); + } +}; + +rocksdb::Iterator* +GetLockingIterator(rocksdb::Transaction *trx, + const rocksdb::ReadOptions& read_options, + rocksdb::ColumnFamilyHandle* column_family, + bool is_rev_cf, + ulonglong *counter); + +} // namespace myrocks diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc index a723ac9e806..88695aa0539 100644 --- a/storage/rocksdb/rdb_utils.cc +++ b/storage/rocksdb/rdb_utils.cc @@ -259,6 +259,33 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len, return str; } +/* + Print the range in hex, in "start_endpoint-end_endpoint" form +*/ + +std::string rdb_hexdump_range(const rocksdb::EndpointWithString& start, + const rocksdb::EndpointWithString& end) { + std::string res; + // For keys: :0 keys should look like point keys + if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) { + // This is a single-point range, show it like a key + res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN); + } else { + res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN); + if (start.inf_suffix) + res.append(":1"); + + res.append("-"); + + std::string key2 = rdb_hexdump(end.slice.c_str(), end.slice.length(), 
+ FN_REFLEN); + if (end.inf_suffix) + key2.append(":1"); + res.append(key2); + } + return res; +} + /* Attempt to access the database subdirectory to see if it exists */ diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h index f49f102a08c..39f2096dd35 100644 --- a/storage/rocksdb/rdb_utils.h +++ b/storage/rocksdb/rdb_utils.h @@ -29,6 +29,7 @@ /* RocksDB header files */ #include "rocksdb/slice.h" #include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" /* MyRocks header files */ #include "./rdb_global.h" @@ -290,6 +291,8 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len, const std::size_t maxsize = 0) MY_ATTRIBUTE((__nonnull__)); +std::string rdb_hexdump_range(const rocksdb::EndpointWithString& left, + const rocksdb::EndpointWithString& right); /* Helper function to see if a database exists */