[Commits] 9c0e1c912bf: MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
revision-id: 9c0e1c912bf37cec8867806e0a7e46033668a99a (fb-prod201801-180-g9c0e1c912bf) parent(s): bb04a774561548d159576b3bcaa2f638ffd76829 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2019-01-04 18:39:03 +0300 message: MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys In order to work correctly with multi-component indexes, Range Locking should use "Range endpoints" not lookup keys of range bounds. --- mysql-test/suite/rocksdb/r/range_locking.result | 50 ++++++- mysql-test/suite/rocksdb/r/varbinary_format.result | 17 ++- mysql-test/suite/rocksdb/t/delete_before_lock.test | 5 + mysql-test/suite/rocksdb/t/range_locking.test | 36 +++++ mysql-test/suite/rocksdb/t/varbinary_format.test | 14 +- rocksdb | 2 +- storage/rocksdb/ha_rocksdb.cc | 166 ++++++++++++++++++--- storage/rocksdb/range_locking/ft/comparator.h | 16 +- 8 files changed, 262 insertions(+), 44 deletions(-) diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result index 964458b783f..c3271db2730 100644 --- a/mysql-test/suite/rocksdb/r/range_locking.result +++ b/mysql-test/suite/rocksdb/r/range_locking.result @@ -90,18 +90,60 @@ pk a 10 10 select * from information_schema.rocksdb_locks; COLUMN_FAMILY_ID TRANSACTION_ID KEY MODE -0 15 000001078000000a X +0 15 00000001078000000a X delete from t1 where pk between 25 and 40; select * from information_schema.rocksdb_locks; COLUMN_FAMILY_ID TRANSACTION_ID KEY MODE -0 15 0000010780000019 - 0000010780000028 X -0 15 000001078000000a X +0 15 000000010780000019 - 010000010780000028 X +0 15 00000001078000000a X rollback; begin; +# The following will show a range lock on 2-9 and also a point lock on 10. +# This is how things currently work. select * from t1 where pk between 2 and 9 for update; pk a select * from information_schema.rocksdb_locks; COLUMN_FAMILY_ID TRANSACTION_ID KEY MODE -0 16 0000010780000002 - 0000010780000009 X +0 16 00000001078000000a X +0 16 000000010780000002 - 010000010780000009 X rollback; drop table t1; +# +# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +# +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( +kp1 int not null, +kp2 int not null, +a int, +primary key(kp1, kp2) +) engine=rocksdb; +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; +connect con1,localhost,root,,; +connection con1; +begin; +select * from t1 where kp1=2 for update; +kp1 kp2 a +2 0 1234 +2 1 1234 +2 2 1234 +2 3 1234 +2 4 1234 +2 5 1234 +2 6 1234 +2 7 1234 +2 8 1234 +2 9 1234 +connection default; +# The lock on kp1=2 should inhibit the following INSERT: +insert into t1 values ( 2,5,9999); +ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY +rollback; +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; diff --git a/mysql-test/suite/rocksdb/r/varbinary_format.result b/mysql-test/suite/rocksdb/r/varbinary_format.result index 9362d42515c..e3dde737f9f 100644 --- a/mysql-test/suite/rocksdb/r/varbinary_format.result +++ b/mysql-test/suite/rocksdb/r/varbinary_format.result @@ -1,3 +1,4 @@ +set @skip_head=9 + if(@@rocksdb_use_range_locking, 2, 0); CREATE TABLE t1( vb VARBINARY(64) primary key ) ENGINE=rocksdb; @@ -45,8 +46,8 @@ hex(vb) 00000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000 -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; -SUBSTRING(a.key,9) +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SUBSTRING(a.key,@skip_head) 000000000000000001 000000000000000002 000000000000000003 @@ -111,8 +112,8 @@ hex(vb) 00000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 000000000000000000000000000000000000000000000000000000000000000000 -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; -SUBSTRING(a.key,9) +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SUBSTRING(a.key,@skip_head) 0000000000000000f8 0000000000000000f9 0000000000000000fa @@ -175,8 +176,8 @@ aaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; -SUBSTRING(a.key,9) +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SUBSTRING(a.key,@skip_head) 610000000000000001 616100000000000002 616161000000000003 @@ -241,8 +242,8 @@ aaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; -SUBSTRING(a.key,9) +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SUBSTRING(a.key,@skip_head) 6100000000000000f8 6161000000000000f9 6161610000000000fa diff --git a/mysql-test/suite/rocksdb/t/delete_before_lock.test b/mysql-test/suite/rocksdb/t/delete_before_lock.test index 93a9d1adaf9..52d8cff9cb8 100644 --- a/mysql-test/suite/rocksdb/t/delete_before_lock.test +++ b/mysql-test/suite/rocksdb/t/delete_before_lock.test @@ -1,6 +1,11 @@ --source include/have_rocksdb.inc --source include/have_debug_sync.inc +# The testcase doesn't work with range locking: +# UPDATE where id1=1 sets a lock on [(1;-inf), (1,+inf)] +# then DELETE will try to get a lock on (1,1) and cause lock wait timeout +--source suite/rocksdb/include/not_range_locking.inc + # This is a test case to reproduce https://github.com/facebook/mysql-5.6/issues/162 # Expected output of the last select for update was (1,2,100) and (1,3,100), but # currently it returns (1,2,1) and (1,3,1), which must be fixed. diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test index 4cf6f079ebe..b0f5157f304 100644 --- a/mysql-test/suite/rocksdb/t/range_locking.test +++ b/mysql-test/suite/rocksdb/t/range_locking.test @@ -114,10 +114,46 @@ select * from information_schema.rocksdb_locks; rollback; begin; +--echo # The following will show a range lock on 2-9 and also a point lock on 10. +--echo # This is how things currently work. select * from t1 where pk between 2 and 9 for update; select * from information_schema.rocksdb_locks; rollback; drop table t1; +--echo # +--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys +--echo # + +create table t0(a int); +insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table t1 ( + kp1 int not null, + kp2 int not null, + a int, + primary key(kp1, kp2) +) engine=rocksdb; + +insert into t1 select 1, a, 1234 from t0; +insert into t1 select 2, a, 1234 from t0; +insert into t1 select 3, a, 1234 from t0; + +connect (con1,localhost,root,,); +connection con1; + +begin; +select * from t1 where kp1=2 for update; + +connection default; +--echo # The lock on kp1=2 should inhibit the following INSERT: +--error ER_LOCK_WAIT_TIMEOUT +insert into t1 values ( 2,5,9999); +rollback; + +connection con1; +rollback; +connection default; +disconnect con1; +drop table t0,t1; diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test index d00a8b7afbe..cca230614fd 100644 --- a/mysql-test/suite/rocksdb/t/varbinary_format.test +++ b/mysql-test/suite/rocksdb/t/varbinary_format.test @@ -1,6 +1,12 @@ --source include/have_debug.inc --source include/have_rocksdb.inc +# +# In point-locking mode, skip first 4 bytes, as they indexnr. +# In range-locking mode, there is one more range control byte at the front +# +set @skip_head=9 + if(@@rocksdb_use_range_locking, 2, 0); + # Create a table with a varbinary key with the current format and validate # that it sorts correctly CREATE TABLE t1( @@ -27,7 +33,7 @@ SELECT hex(vb) FROM t1; # validate that the keys were encoded as expected BEGIN; SELECT hex(vb) FROM t1 FOR UPDATE; -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; ROLLBACK; DROP TABLE t1; @@ -60,7 +66,7 @@ SELECT hex(vb) FROM t1; # validate that the keys were encoded as expected BEGIN; SELECT hex(vb) FROM t1 FOR UPDATE; -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; ROLLBACK; DROP TABLE t1; @@ -91,7 +97,7 @@ SELECT * FROM t1; # validate that the keys were encoded as expected BEGIN; SELECT * FROM t1 FOR UPDATE; -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; ROLLBACK; DROP TABLE t1; @@ -124,7 +130,7 @@ SELECT * FROM t1; # validate that the keys were encoded as expected BEGIN; SELECT * FROM t1 FOR UPDATE; -SELECT SUBSTRING(a.key,9) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; +SELECT SUBSTRING(a.key,@skip_head) FROM information_schema.rocksdb_locks AS a ORDER BY a.key; ROLLBACK; DROP TABLE t1; diff --git a/rocksdb b/rocksdb index 756b3e44a8b..dd648b42fde 160000 --- a/rocksdb +++ b/rocksdb @@ -1 +1 @@ -Subproject commit 756b3e44a8b7173993c0076a76c56c6a31d52a8b +Subproject commit dd648b42fde54a15675002981c305cf786db8594 diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 3ec6b189f8b..72dd31bbb85 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -2275,7 +2275,9 @@ protected: virtual rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, const rocksdb::Slice &start, - const rocksdb::Slice &end) = 0; + bool start_inf_suffix, + const rocksdb::Slice &end, + bool end_inf_suffix) = 0; virtual bool prepare(const rocksdb::TransactionName &name) = 0; @@ -2729,11 +2731,33 @@ public: virtual bool is_writebatch_trx() const override { return false; } + /* + Both start and end endpoint may may be prefixes. + Both bounds are inclusive. + */ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, const rocksdb::Slice &start, - const rocksdb::Slice &end) override + bool start_inf_suffix, + const rocksdb::Slice &end, + bool end_inf_suffix + ) override { - return m_rocksdb_tx->GetRangeLock(cf, start, end); + const char SUFFIX_INF= 0x0; + const char SUFFIX_SUP= 0x1; + // GetRangeLock accepts range endpoints, not keys. + // Convert keys to range endpoints here. + // See also range_endpoint_convert() below. + StringBuffer<64> left; + left.append(start_inf_suffix? SUFFIX_SUP :SUFFIX_INF); + left.append(start.data(), start.size()); + + StringBuffer<64> right; + right.append(end_inf_suffix? SUFFIX_SUP :SUFFIX_INF); + right.append(end.data(), end.size()); + + return m_rocksdb_tx->GetRangeLock(cf, + rocksdb::Slice(left.ptr(),left.length()), + rocksdb::Slice(right.ptr(),right.length())); } private: void release_tx(void) { @@ -3135,7 +3159,9 @@ public: rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf, const rocksdb::Slice &start, - const rocksdb::Slice &end) override + bool start_inf_suffix, + const rocksdb::Slice &end, + bool end_inf_suffix) override { return rocksdb::Status::OK(); } @@ -4333,6 +4359,63 @@ void rocksdb_kill_connection(handlerton* hton, THD* thd) } +void range_endpoint_convert(const rocksdb::Slice &key, + std::string *res) +{ + const char SUFFIX_INF= 0x0; + res->clear(); + res->append(&SUFFIX_INF, 1); + res->append(key.data(), key.size()); +} + +int range_endpoints_compare(const char *a, size_t a_len, + const char *b, size_t b_len) +{ + size_t min_len= std::min(a_len, b_len); + + //compare the values. Skip the first byte as it is the endpoint signifier + int res= memcmp(a+1, b+1, min_len-1); + if (!res) + { + if (b_len > min_len) + { + // a is shorter; + if (a[0] == 0) + return -1; //"a is smaller" + else + { + // a is considered padded with 0xFF:FF:FF:FF... + return 1; // "a" is bigger + } + } + else if (a_len > min_len) + { + // the opposite of the above: b is shorter. + if (b[0] == 0) + return 1; //"b is smaller" + else + { + // b is considered padded with 0xFF:FF:FF:FF... + return -1; // "b" is bigger + } + } + else + { + // the lengths are equal (and the key values, too) + if (a[0] < b[0]) + return -1; + else if (a[0] > b[0]) + return 1; + else + return 0; + } + } + else + return res; +} + + + void Rdb_transaction::kill_lock_wait_by_thd(THD *thd) { /* @@ -4969,8 +5052,11 @@ static int rocksdb_init_func(void *const p) { if (rocksdb_use_range_locking) { - //psergey-todo: this is ugly: - rdb->get_range_lock_manager()->set_max_lock_memory(rocksdb_max_lock_memory); + rocksdb::RangeLockMgrControl *mgr= rdb->get_range_lock_manager(); + + mgr->set_endpoint_cmp_functions(range_endpoint_convert, + range_endpoints_compare); + mgr->set_max_lock_memory(rocksdb_max_lock_memory); sql_print_information("RocksDB: USING NEW RANGE LOCKING"); sql_print_information("RocksDB: Max lock memory=%lu", rocksdb_max_lock_memory); } @@ -8129,19 +8215,31 @@ void ha_rocksdb::set_range_lock(Rdb_transaction *tx, const rocksdb::Slice &slice, const key_range *const end_key) { - // We have 'slice' with start_slice. rocksdb::Slice end_slice; uchar end_slice_buf[MAX_KEY_LENGTH]; + bool start_has_inf_suffix, end_has_inf_suffix; if (m_lock_rows != RDB_LOCK_WRITE || !rocksdb_use_range_locking) { return; } + /* + The 'slice' parameter has the left endpoint of the range to lock. + Figure out the right endpoint + */ + if (find_flag == HA_READ_KEY_EXACT) { + /* + This is "key_part= const" interval + */ + start_has_inf_suffix= false; + end_has_inf_suffix= true; end_slice= slice; } - else - if (find_flag == HA_READ_PREFIX_LAST) { + else if (find_flag == HA_READ_PREFIX_LAST) { + //psergey-todo: swap the bounds? so that the start bounds is the end? + //Testscase! + /* We have made the kd.successor(m_sk_packed_tuple) call above. @@ -8151,8 +8249,29 @@ void ha_rocksdb::set_range_lock(Rdb_transaction *tx, kd.successor(end_slice_buf, slice.size()); end_slice= rocksdb::Slice((const char*)end_slice_buf, slice.size()); } - else - if (end_key) { + else if (end_key) { + // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY + if (find_flag == HA_READ_KEY_OR_NEXT) + start_has_inf_suffix= false; + else if (find_flag == HA_READ_AFTER_KEY) + start_has_inf_suffix= true; + else + DBUG_ASSERT(0); + + // Known end range bounds: HA_READ_AFTER_KEY, HA_READ_BEFORE_KEY + if (end_key->flag == HA_READ_AFTER_KEY) + { + // this is "key_part <= const". + end_has_inf_suffix= true; + } + else if (end_key->flag == HA_READ_BEFORE_KEY) + { + // this is "key_part < const", non-inclusive. + end_has_inf_suffix= false; + } + else + DBUG_ASSERT(0); + uchar pack_buffer[MAX_KEY_LENGTH]; uint end_slice_size= kd.pack_index_tuple(table, pack_buffer, end_slice_buf, @@ -8163,18 +8282,22 @@ void ha_rocksdb::set_range_lock(Rdb_transaction *tx, } else { - /* - On range scan without any end key condition, there is no - eq cond, and eq cond length is the same as index_id size (4 bytes). - Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3) - WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4 - */ + // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY + if (find_flag == HA_READ_KEY_OR_NEXT) + start_has_inf_suffix= false; + else if (find_flag == HA_READ_AFTER_KEY) + start_has_inf_suffix= true; + else + DBUG_ASSERT(0); + uint end_slice_size; - // TODO: or this should be kd.get_index_number() ? kd.get_infimum_key(end_slice_buf, &end_slice_size); end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size); + end_has_inf_suffix= true; } - tx->lock_range(kd.get_cf(), slice, end_slice); + tx->lock_range(kd.get_cf(), + slice, start_has_inf_suffix, + end_slice, end_has_inf_suffix); } /* @@ -8296,11 +8419,12 @@ int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key, Rdb_transaction *const tx = get_or_create_tx(table->in_use); const bool is_new_snapshot = !tx->has_snapshot(); + + set_range_lock(tx, kd, find_flag, slice, end_key); + // Loop as long as we get a deadlock error AND we end up creating the // snapshot here (i.e. it did not exist prior to this) for (;;) { - //psergey: - set_range_lock(tx, kd, find_flag, slice, end_key); /* This will open the iterator and position it at a record that's equal or diff --git a/storage/rocksdb/range_locking/ft/comparator.h b/storage/rocksdb/range_locking/ft/comparator.h index 9775cc13387..8d7fe6a0f90 100644 --- a/storage/rocksdb/range_locking/ft/comparator.h +++ b/storage/rocksdb/range_locking/ft/comparator.h @@ -44,7 +44,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include "util/dbt.h" -typedef int (*ft_compare_func)(DB *db, const DBT *a, const DBT *b); +typedef int (*ft_compare_func)(DB *db, void *arg, const DBT *a, const DBT *b); int toku_keycompare(const void *key1, uint32_t key1len, const void *key2, uint32_t key2len); @@ -57,8 +57,9 @@ namespace toku { // that points may be positive or negative infinity. class comparator { - void init(ft_compare_func cmp, DESCRIPTOR desc, uint8_t memcmp_magic) { + void init(ft_compare_func cmp, void *cmp_arg, DESCRIPTOR desc, uint8_t memcmp_magic) { _cmp = cmp; + _cmp_arg= cmp_arg; _fake_db->cmp_descriptor = desc; _memcmp_magic = memcmp_magic; } @@ -67,9 +68,9 @@ namespace toku { // This magic value is reserved to mean that the magic has not been set. static const uint8_t MEMCMP_MAGIC_NONE = 0; - void create(ft_compare_func cmp, DESCRIPTOR desc, uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) { + void create(ft_compare_func cmp, void *cmp_arg, DESCRIPTOR desc, uint8_t memcmp_magic = MEMCMP_MAGIC_NONE) { XCALLOC(_fake_db); - init(cmp, desc, memcmp_magic); + init(cmp, cmp_arg, desc, memcmp_magic); } // inherit the attributes of another comparator, but keep our own @@ -78,7 +79,7 @@ namespace toku { invariant_notnull(_fake_db); invariant_notnull(cmp._cmp); invariant_notnull(cmp._fake_db); - init(cmp._cmp, cmp._fake_db->cmp_descriptor, cmp._memcmp_magic); + init(cmp._cmp, cmp._cmp_arg, cmp._fake_db->cmp_descriptor, cmp._memcmp_magic); } // like inherit, but doesn't require that the this comparator @@ -120,16 +121,19 @@ namespace toku { && dbt_has_memcmp_magic(a) // ..then we expect `b' to also have the memcmp magic && __builtin_expect(dbt_has_memcmp_magic(b), 1)) { + assert(0); // psergey: this branch should not be taken. return toku_builtin_compare_fun(nullptr, a, b); } else { // yikes, const sadness here - return _cmp(const_cast<DB *>(_fake_db), a, b); + return _cmp(const_cast<DB *>(_fake_db), _cmp_arg, a, b); } } private: DB *_fake_db; ft_compare_func _cmp; + void *_cmp_arg; + uint8_t _memcmp_magic; };
participants (1)
-
Sergei Petrunia