[Commits] 40182db11b5: Issue #790, MyRocks/MRR: address review input and cleanup
revision-id: 40182db11b506f97183a8c41af747fb34eb239ec (fb-prod201903-169-g40182db11b5) parent(s): 8e230333ad43098b8460240386b82e5d67a974c0 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2019-10-11 22:25:13 +0300 message: Issue #790, MyRocks/MRR: address review input and cleanup - Join two multi_get() functions together (the other implementation that is used by bypass code now uses RocksDB's MultiGet with the same signature, so there's no difference) - Add more DBUG_ASSERTs as requested - Call RANGE_SEQ_IF::skip_index_tuple when appropriate (+testcase) - Call RANGE_SEQ_IF::skip_record when appropriate (+testcase) --- mysql-test/suite/rocksdb/r/rocksdb_mrr.result | 59 ++++++++++++++++++++ mysql-test/suite/rocksdb/t/rocksdb_mrr.test | 35 ++++++++++++ storage/rocksdb/ha_rocksdb.cc | 78 ++++++++++++++------------- 3 files changed, 136 insertions(+), 36 deletions(-) diff --git a/mysql-test/suite/rocksdb/r/rocksdb_mrr.result b/mysql-test/suite/rocksdb/r/rocksdb_mrr.result index 7ece91c1fa4..6aa2ff4dc23 100644 --- a/mysql-test/suite/rocksdb/r/rocksdb_mrr.result +++ b/mysql-test/suite/rocksdb/r/rocksdb_mrr.result @@ -284,6 +284,65 @@ pk1 pk2 col1 filler mod(t3.col1,2) 20 20 20 20 0 26 26 26 26 0 28 28 28 28 0 +# +# Test for BKA's variant of Index Condition Pushdown. With BKA, +# pushed index conditions that refer to preceding tables are +# handled in a special way because there's no clear concept of +# "current row" for the preceding table(s) +# +explain +select * from t0,t3 where t3.col1=t0.a and mod(t3.pk2,2)=t0.a; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t0 ALL NULL NULL NULL NULL 10 Using where +1 SIMPLE t3 ref col1 col1 5 test.t0.a 4 Using index condition; Using join buffer (Batched Key Access) +select * from t0,t3 where t3.col1=t0.a and mod(t3.pk2,2)=t0.a; +a pk1 pk2 col1 filler +0 0 0 0 0 +1 1 1 1 1 +set optimizer_switch='mrr=off'; +select * from t0,t3 where t3.col1=t0.a and mod(t3.pk2,2)=t0.a; +a pk1 pk2 col1 filler +0 0 0 0 0 +1 1 1 1 1 +set optimizer_switch='mrr=on'; +# +# A query which has RANGE_SEQ_IF::skip_record != nullptr. +# +# MultiGet/MRR does not invoke skip_record() as it would not produce +# much speedup. +# +insert into t3 select 10000+a, 10000+a, a, 'duplicate-match' from t1; +delete from t3 where col1 in (3,5); +explain +select * from t0 left join t3 on t3.col1=t0.a where t3.pk1 is null; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t0 ALL NULL NULL NULL NULL 10 NULL +1 SIMPLE t3 ref col1 col1 5 test.t0.a 4 Using where; Not exists; Using join buffer (Batched Key Access) +select * from t0 left join t3 on t3.col1=t0.a where t3.pk1 is null; +a pk1 pk2 col1 filler +3 NULL NULL NULL NULL +5 NULL NULL NULL NULL +set optimizer_switch='mrr=off'; +select * from t0 left join t3 on t3.col1=t0.a where t3.pk1 is null; +a pk1 pk2 col1 filler +3 NULL NULL NULL NULL +5 NULL NULL NULL NULL +set optimizer_switch='mrr=on'; +explain +select * from t0 where t0.a in (select t3.col1 from t3 where char_length(t3.filler)<30); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t0 ALL NULL NULL NULL NULL 10 Using where +1 SIMPLE t3 ref col1 col1 5 test.t0.a 4 Using where; FirstMatch(t0); Using join buffer (Batched Key Access) +select * from t0 where t0.a in (select t3.col1 from t3 where char_length(t3.filler)<30); +a +0 +1 +2 +4 +6 +7 +8 +9 drop table t0,t1,t2,t3,t4; # # Multi-keypart testcase diff --git a/mysql-test/suite/rocksdb/t/rocksdb_mrr.test b/mysql-test/suite/rocksdb/t/rocksdb_mrr.test index dc8c260f754..74675b1cb5b 100644 --- a/mysql-test/suite/rocksdb/t/rocksdb_mrr.test +++ b/mysql-test/suite/rocksdb/t/rocksdb_mrr.test @@ -195,6 +195,41 @@ where (t3.col1=20 or t3.col1 between 25 and 28) and mod(t3.col1,2)=0; select pk1,pk2,col1, filler,mod(t3.col1,2) from t3 where (t3.col1=20 or t3.col1 between 25 and 28) and mod(t3.col1,2)=0; +--echo # +--echo # Test for BKA's variant of Index Condition Pushdown. With BKA, +--echo # pushed index conditions that refer to preceding tables are +--echo # handled in a special way because there's no clear concept of +--echo # "current row" for the preceding table(s) +--echo # + +explain +select * from t0,t3 where t3.col1=t0.a and mod(t3.pk2,2)=t0.a; +select * from t0,t3 where t3.col1=t0.a and mod(t3.pk2,2)=t0.a; + +set optimizer_switch='mrr=off'; +select * from t0,t3 where t3.col1=t0.a and mod(t3.pk2,2)=t0.a; +set optimizer_switch='mrr=on'; + +--echo # +--echo # A query which has RANGE_SEQ_IF::skip_record != nullptr. +--echo # +--echo # MultiGet/MRR does not invoke skip_record() as it would not produce +--echo # much speedup. +--echo # +insert into t3 select 10000+a, 10000+a, a, 'duplicate-match' from t1; +delete from t3 where col1 in (3,5); + +explain +select * from t0 left join t3 on t3.col1=t0.a where t3.pk1 is null; +select * from t0 left join t3 on t3.col1=t0.a where t3.pk1 is null; +set optimizer_switch='mrr=off'; +select * from t0 left join t3 on t3.col1=t0.a where t3.pk1 is null; +set optimizer_switch='mrr=on'; + +explain +select * from t0 where t0.a in (select t3.col1 from t3 where char_length(t3.filler)<30); +select * from t0 where t0.a in (select t3.col1 from t3 where char_length(t3.filler)<30); + drop table t0,t1,t2,t3,t4; --echo # diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 15601fd3f82..8019be03401 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -3095,11 +3095,6 @@ class Rdb_transaction { const rocksdb::Slice &key, rocksdb::PinnableSlice *const value, bool exclusive, const bool do_validate) = 0; - virtual void multi_get(const rocksdb::ReadOptions &read_options, - rocksdb::ColumnFamilyHandle *column_family, - const size_t num_keys, const rocksdb::Slice *keys, - rocksdb::PinnableSlice *values, - rocksdb::Status *statuses, bool sorted_input) = 0; virtual rocksdb::Iterator *get_iterator( const rocksdb::ReadOptions &options, rocksdb::ColumnFamilyHandle *column_family) = 0; @@ -3476,15 +3471,6 @@ class Rdb_transaction_impl : public Rdb_transaction { return m_rocksdb_tx->Get(m_read_opts, column_family, key, value); } - void multi_get(const rocksdb::ReadOptions &read_options, - rocksdb::ColumnFamilyHandle *column_family, - const size_t num_keys, const rocksdb::Slice *keys, - rocksdb::PinnableSlice *values, rocksdb::Status *statuses, - bool sorted_input) override { - m_rocksdb_tx->MultiGet(read_options, column_family, num_keys, keys, values, - statuses, sorted_input); - } - void multi_get(rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, rocksdb::PinnableSlice *values, rocksdb::Status *statuses, @@ -3791,16 +3777,6 @@ class Rdb_writebatch_impl : public Rdb_transaction { return get(column_family, key, value); } - void multi_get(const rocksdb::ReadOptions &read_options, - rocksdb::ColumnFamilyHandle *column_family, - const size_t num_keys, const rocksdb::Slice *keys, - rocksdb::PinnableSlice *values, rocksdb::Status *statuses, - bool sorted_input) override { - // todo: could we just read the committed content from the DB here? - //psergey-todo:! - DBUG_ASSERT(0); - } - void multi_get(rocksdb::ColumnFamilyHandle *const column_family, const size_t num_keys, const rocksdb::Slice *keys, rocksdb::PinnableSlice *values, rocksdb::Status *statuses, @@ -15173,9 +15149,8 @@ ha_rows ha_rocksdb::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, } if (all_eq_ranges) { - // Indicate that we will use Mutlit-Get MRR + // Indicate that we will use MultiGet MRR *flags &= ~HA_MRR_USE_DEFAULT_IMPL; - *flags |= HA_MRR_CONVERT_REF_TO_RANGE; *flags |= HA_MRR_SUPPORT_SORTED; *bufsz = mrr_get_length_per_rec() * res * 1.1 + 1; } @@ -15237,7 +15212,7 @@ class Mrr_rowid_source { // -// Rowid source that produces rowids by enumerating a seqence of ranges +// Rowid source that produces rowids by enumerating a sequence of ranges // class Mrr_pk_scan_rowid_source : public Mrr_rowid_source { range_seq_t mrr_seq_it; @@ -15263,6 +15238,8 @@ class Mrr_pk_scan_rowid_source : public Mrr_rowid_source { (key_part_map(1) << self->m_pk_descr->get_key_parts()) - 1; DBUG_ASSERT(range.start_key.keypart_map == all_parts_map); DBUG_ASSERT(range.end_key.keypart_map == all_parts_map); + DBUG_ASSERT(range.start_key.flag == HA_READ_KEY_EXACT); + DBUG_ASSERT(range.end_key.flag == HA_READ_AFTER_KEY); *range_ptr = range.ptr; *size = self->m_pk_descr->pack_index_tuple(self->table, @@ -15299,10 +15276,23 @@ class Mrr_sec_key_rowid_source : public Mrr_rowid_source { if (err) return err; - err = self->handler::multi_range_read_next(range_ptr); - if (!err) { + while (!(err = self->handler::multi_range_read_next(range_ptr))) { + + if (self->mrr_funcs.skip_index_tuple && + self->mrr_funcs.skip_index_tuple(self->mrr_iter, *range_ptr)) { + // BKA's variant of "Index Condition Pushdown" check failed + continue; + } + + if (self->mrr_funcs.skip_record && + self->mrr_funcs.skip_record(self->mrr_iter, *range_ptr, + (uchar*)self->m_last_rowkey.ptr())) { + continue; + } + memcpy(buf, self->m_last_rowkey.ptr(), self->m_last_rowkey.length()); *size = self->m_last_rowkey.length(); + break; } return err; } @@ -15344,6 +15334,9 @@ int ha_rocksdb::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, mrr_sorted_mode = (mode & HA_MRR_SORTED) ? true : false; if (active_index == table->s->primary_key) { + // ICP is not supported for PK, so we don't expect that BKA's variant + // of ICP would be used: + DBUG_ASSERT(!mrr_funcs.skip_index_tuple); mrr_rowid_reader = new Mrr_pk_scan_rowid_source(this, seq_init_param, n_ranges, mode); } else { @@ -15489,9 +15482,8 @@ int ha_rocksdb::mrr_fill_buffer() { Rdb_transaction *const tx = get_or_create_tx(table->in_use); - tx->multi_get(tx->m_read_opts, m_pk_descr->get_cf(), - mrr_n_elements, // actual number of elements we've got - mrr_keys, mrr_values, mrr_statuses, mrr_sorted_mode); + tx->multi_get(m_pk_descr->get_cf(), mrr_n_elements, mrr_keys, mrr_values, + mrr_statuses, mrr_sorted_mode); return 0; } @@ -15527,7 +15519,7 @@ int ha_rocksdb::multi_range_read_next(char **range_info) { Rdb_transaction *&tx = get_tx_from_thd(table->in_use); int rc; - do { + while (1) { while (1) { if (mrr_read_index >= mrr_n_elements) { if (mrr_rowid_reader->eof() || !mrr_n_elements) { @@ -15543,13 +15535,26 @@ int ha_rocksdb::multi_range_read_next(char **range_info) { return rc; } } - // Skip the "is not found" errors + // If we found a status that has a row, leave the loop if (mrr_statuses[mrr_read_index].ok()) break; + + // Skip the NotFound errors, return any other error to the SQL layer + if (!mrr_statuses[mrr_read_index].IsNotFound()) + return rdb_error_to_mysql(mrr_statuses[mrr_read_index]); + mrr_read_index++; } size_t cur_key = mrr_read_index++; const rocksdb::Slice &rowkey = mrr_keys[cur_key]; + + if (mrr_funcs.skip_record && + mrr_funcs.skip_record(mrr_iter, mrr_range_ptrs[cur_key], + (uchar*)rowkey.data())) { + rc = HA_ERR_END_OF_FILE; + continue; + } + m_last_rowkey.copy((const char *)rowkey.data(), rowkey.size(), &my_charset_bin); @@ -15570,8 +15575,9 @@ int ha_rocksdb::multi_range_read_next(char **range_info) { rc = convert_record_from_storage_format(&rowkey, table->record[0]); m_retrieved_record.Reset(); mrr_values[cur_key].Reset(); - table->status = rc ? STATUS_NOT_FOUND : 0; - } while (0); + break; + } + table->status = rc ? STATUS_NOT_FOUND : 0; return rc; }
participants (1)
-
psergey