- commits - lists.mariadb.org

[Commits] a0643458ca5: MDEV-26996: Support descending indexes in the range optimizer
by Sergei Petrunia 13 Dec '21

13 Dec '21

revision-id: a0643458ca5bcd69e91cffcb9e637f544aca7cf1 (mariadb-10.6.1-231-ga0643458ca5) parent(s): d3b5ea4c6b0d3abbe18bf63d9fe076dcb0a043ce author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-14 01:47:01 +0300 message: MDEV-26996: Support descending indexes in the range optimizer Make the Range Optimizer support descending index key parts. We follow the approach taken in MySQL-8. See HowRangeOptimizerHandlesDescKeyparts for the description. --- mysql-test/main/desc_index_range.result | 158 ++++++++++++++++++++++ mysql-test/main/desc_index_range.test | 74 +++++++++++ sql/item_geofunc.cc | 3 +- sql/key.cc | 16 ++- sql/opt_range.cc | 162 +++++++++++++++------- sql/opt_range.h | 229 ++++++++++++++++++++++++++++---- sql/opt_range_mrr.cc | 46 +++---- 7 files changed, 583 insertions(+), 105 deletions(-) diff --git a/mysql-test/main/desc_index_range.result b/mysql-test/main/desc_index_range.result new file mode 100644 index 00000000000..53a608fe2d9 --- /dev/null +++ b/mysql-test/main/desc_index_range.result @@ -0,0 +1,158 @@ +create table t1 ( +a int, +key (a desc) +); +insert into t1 select seq from seq_1_to_1000; +set optimizer_trace=1; +explain select * from t1 force index(a) where a in (2, 4, 6); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range a a 5 NULL 3 Using where; Using index +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +[ + + [ + "(6) <= (a) <= (6)", + "(4) <= (a) <= (4)", + "(2) <= (a) <= (2)" + ] +] +set optimizer_trace=default; +# These should go in reverse order: +select * from t1 force index(a) where a in (2, 4, 6); +a +6 +4 +2 +drop table t1; +# +# Multi-part key tests +# +create table t1 ( +a int not null, +b int not null, +key ab(a, b desc) +); +insert into t1 select A.seq, B.seq*10 from seq_1_to_10 A, seq_1_to_10 B; +set optimizer_trace=1; +explain select * 
from t1 force index(ab) where a>=8 and b>=50; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range ab ab 4 NULL 51 Using where; Using index +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +[ + + [ + "(8) <= (a)" + ] +] +explain select * from t1 force index(ab) where a>=8 and b<=50; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range ab ab 8 NULL 46 Using where; Using index +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +[ + + [ + "(8,50) <= (a,b)" + ] +] +select * from t1 force index(ab) where a>=8 and b<=50; +a b +8 50 +8 40 +8 30 +8 20 +8 10 +9 50 +9 40 +9 30 +9 20 +9 10 +10 50 +10 40 +10 30 +10 20 +10 10 +select * from t1 ignore index(ab) where a>=8 and b<=50 order by a, b desc; +a b +8 50 +8 40 +8 30 +8 20 +8 10 +9 50 +9 40 +9 30 +9 20 +9 10 +10 50 +10 40 +10 30 +10 20 +10 10 +explain +select * from t1 where a between 2 and 4 and b between 50 and 80; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 range ab ab 8 NULL 17 Using where; Using index +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +[ + + [ + "(2,80) <= (a,b) <= (4,50)" + ] +] +select * from t1 where a between 2 and 4 and b between 50 and 80; +a b +2 80 +2 70 +2 60 +2 50 +3 80 +3 70 +3 60 +3 50 +4 80 +4 70 +4 60 +4 50 +drop table t1; +create table t2 ( +a int not null, +b int not null, +key ab(a desc, b desc) +); +insert into t2 select A.seq, B.seq*10 from seq_1_to_10 A, seq_1_to_10 B; +explain +select * from t2 where a between 2 and 4; +id select_type table type possible_keys key key_len ref rows 
Extra +1 SIMPLE t2 range ab ab 4 NULL 40 Using where; Using index +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +[ + + [ + "(4) <= (a) <= (2)" + ] +] +explain +select * from t2 where a between 2 and 4 and b between 50 and 80; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t2 range ab ab 8 NULL 31 Using where; Using index +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +[ + + [ + "(4,80) <= (a,b) <= (2,50)" + ] +] +set optimizer_trace=default; +drop table t2; diff --git a/mysql-test/main/desc_index_range.test b/mysql-test/main/desc_index_range.test new file mode 100644 index 00000000000..7fdf439c523 --- /dev/null +++ b/mysql-test/main/desc_index_range.test @@ -0,0 +1,74 @@ +# +# Tests for range access and descending indexes +# +--source include/have_sequence.inc +--source include/have_innodb.inc + +create table t1 ( + a int, + key (a desc) +); +insert into t1 select seq from seq_1_to_1000; + +set optimizer_trace=1; +explain select * from t1 force index(a) where a in (2, 4, 6); + +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; +set optimizer_trace=default; + +--echo # These should go in reverse order: +select * from t1 force index(a) where a in (2, 4, 6); +drop table t1; + +--echo # +--echo # Multi-part key tests +--echo # +create table t1 ( + a int not null, + b int not null, + key ab(a, b desc) +); + +insert into t1 select A.seq, B.seq*10 from seq_1_to_10 A, seq_1_to_10 B; + +set optimizer_trace=1; +explain select * from t1 force index(ab) where a>=8 and b>=50; +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; + +explain select * 
from t1 force index(ab) where a>=8 and b<=50; +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; + +select * from t1 force index(ab) where a>=8 and b<=50; +select * from t1 ignore index(ab) where a>=8 and b<=50 order by a, b desc; + +explain +select * from t1 where a between 2 and 4 and b between 50 and 80; +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; + +select * from t1 where a between 2 and 4 and b between 50 and 80; + +drop table t1; + +create table t2 ( + a int not null, + b int not null, + key ab(a desc, b desc) +); +insert into t2 select A.seq, B.seq*10 from seq_1_to_10 A, seq_1_to_10 B; + +explain +select * from t2 where a between 2 and 4; +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; + +explain +select * from t2 where a between 2 and 4 and b between 50 and 80; +select json_detailed(json_extract(trace, '$**.range_access_plan.ranges')) +from information_schema.optimizer_trace; + +set optimizer_trace=default; +drop table t2; diff --git a/sql/item_geofunc.cc b/sql/item_geofunc.cc index 49b85e2213b..a2a99bcdf8f 100644 --- a/sql/item_geofunc.cc +++ b/sql/item_geofunc.cc @@ -1083,7 +1083,8 @@ Item_func_spatial_rel::get_mm_leaf(RANGE_OPT_PARAM *param, DBUG_RETURN(0); // out of memory field->get_key_image(str, key_part->length, key_part->image_type); SEL_ARG *tree; - if (!(tree= new (param->mem_root) SEL_ARG(field, str, str))) + + if (!(tree= new (param->mem_root) SEL_ARG(field, true, str, str))) DBUG_RETURN(0); // out of memory switch (type) { diff --git a/sql/key.cc b/sql/key.cc index f2cebfe6d82..c43d3c36d5d 100644 --- a/sql/key.cc +++ b/sql/key.cc @@ -495,6 +495,7 @@ int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length) { int cmp; store_length= key_part->store_length; + int sort_order = (key_part->key_part_flag & HA_REVERSE_SORT) ? 
-1 : 1; if (key_part->null_bit) { /* This key part allows null values; NULL is lower than everything */ @@ -503,19 +504,19 @@ int key_cmp(KEY_PART_INFO *key_part, const uchar *key, uint key_length) { /* the range is expecting a null value */ if (!field_is_null) - return 1; // Found key is > range + return sort_order; // Found key is > range /* null -- exact match, go to next key part */ continue; } else if (field_is_null) - return -1; // NULL is less than any value + return -sort_order; // NULL is less than any value key++; // Skip null byte store_length--; } if ((cmp=key_part->field->key_cmp(key, key_part->length)) < 0) - return -1; + return -sort_order; if (cmp > 0) - return 1; + return sort_order; } return 0; // Keys are equal } @@ -574,6 +575,7 @@ int key_rec_cmp(void *key_p, uchar *first_rec, uchar *second_rec) do { field= key_part->field; + int sort_order = (key_part->key_part_flag & HA_REVERSE_SORT) ? -1 : 1; if (key_part->null_bit) { @@ -593,12 +595,12 @@ int key_rec_cmp(void *key_p, uchar *first_rec, uchar *second_rec) ; /* Fall through, no NULL fields */ else { - DBUG_RETURN(+1); + DBUG_RETURN(sort_order); } } else if (!sec_is_null) { - DBUG_RETURN(-1); + DBUG_RETURN(-sort_order); } else goto next_loop; /* Both were NULL */ @@ -612,7 +614,7 @@ int key_rec_cmp(void *key_p, uchar *first_rec, uchar *second_rec) */ if ((result= field->cmp_prefix(field->ptr+first_diff, field->ptr+sec_diff, key_part->length))) - DBUG_RETURN(result); + DBUG_RETURN(result * sort_order); next_loop: key_part++; key_part_num++; diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 86539046a32..541a921435a 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -1879,6 +1879,7 @@ SEL_ARG::SEL_ARG(SEL_ARG &arg) :Sql_alloc() max_flag=arg.max_flag; maybe_flag=arg.maybe_flag; maybe_null=arg.maybe_null; + is_ascending= arg.is_ascending; part=arg.part; field=arg.field; min_value=arg.min_value; @@ -1904,9 +1905,10 @@ inline void SEL_ARG::make_root() use_count=0; elements=1; } 
-SEL_ARG::SEL_ARG(Field *f,const uchar *min_value_arg, +SEL_ARG::SEL_ARG(Field *f, bool is_asc, const uchar *min_value_arg, const uchar *max_value_arg) :min_flag(0), max_flag(0), maybe_flag(0), maybe_null(f->real_maybe_null()), + is_ascending(is_asc), elements(1), use_count(1), field(f), min_value((uchar*) min_value_arg), max_value((uchar*) max_value_arg), next(0),prev(0), next_key_part(0), color(BLACK), type(KEY_RANGE), weight(1) @@ -1915,11 +1917,12 @@ SEL_ARG::SEL_ARG(Field *f,const uchar *min_value_arg, max_part_no= 1; } -SEL_ARG::SEL_ARG(Field *field_,uint8 part_, +SEL_ARG::SEL_ARG(Field *field_,uint8 part_, bool is_asc_, uchar *min_value_, uchar *max_value_, uint8 min_flag_,uint8 max_flag_,uint8 maybe_flag_) :min_flag(min_flag_),max_flag(max_flag_),maybe_flag(maybe_flag_), - part(part_),maybe_null(field_->real_maybe_null()), elements(1),use_count(1), + part(part_),maybe_null(field_->real_maybe_null()), is_ascending(is_asc_), + elements(1),use_count(1), field(field_), min_value(min_value_), max_value(max_value_), next(0),prev(0),next_key_part(0),color(BLACK),type(KEY_RANGE), weight(1) { @@ -1938,8 +1941,8 @@ SEL_ARG::SEL_ARG(Field *field_,uint8 part_, class SEL_ARG_LE: public SEL_ARG { public: - SEL_ARG_LE(const uchar *key, Field *field) - :SEL_ARG(field, key, key) + SEL_ARG_LE(const uchar *key, Field *field, bool is_asc) + :SEL_ARG(field, is_asc, key, key) { if (!field->real_maybe_null()) min_flag= NO_MIN_RANGE; // From start @@ -1959,16 +1962,17 @@ class SEL_ARG_LT: public SEL_ARG_LE Use this constructor if value->save_in_field() went precisely, without any data rounding or truncation. */ - SEL_ARG_LT(const uchar *key, Field *field) - :SEL_ARG_LE(key, field) + SEL_ARG_LT(const uchar *key, Field *field, bool is_asc) + :SEL_ARG_LE(key, field, is_asc) { max_flag= NEAR_MAX; } /* Use this constructor if value->save_in_field() returned success, but we don't know if rounding or truncation happened (as some Field::store() do not report minor data changes). 
*/ - SEL_ARG_LT(THD *thd, const uchar *key, Field *field, Item *value) - :SEL_ARG_LE(key, field) + SEL_ARG_LT(THD *thd, const uchar *key, Field *field, bool is_asc, + Item *value) + :SEL_ARG_LE(key, field, is_asc) { if (stored_field_cmp_to_item(thd, field, value) == 0) max_flag= NEAR_MAX; @@ -1984,7 +1988,7 @@ class SEL_ARG_GT: public SEL_ARG without any data rounding or truncation. */ SEL_ARG_GT(const uchar *key, const KEY_PART *key_part, Field *field) - :SEL_ARG(field, key, key) + :SEL_ARG(field, !(key_part->flag & HA_REVERSE_SORT), key, key) { // Don't use open ranges for partial key_segments if (!(key_part->flag & HA_PART_KEY_SEG)) @@ -1998,7 +2002,7 @@ class SEL_ARG_GT: public SEL_ARG */ SEL_ARG_GT(THD *thd, const uchar *key, const KEY_PART *key_part, Field *field, Item *value) - :SEL_ARG(field, key, key) + :SEL_ARG(field, !(key_part->flag & HA_REVERSE_SORT), key, key) { // Don't use open ranges for partial key_segments if ((!(key_part->flag & HA_PART_KEY_SEG)) && @@ -2016,8 +2020,8 @@ class SEL_ARG_GE: public SEL_ARG Use this constructor if value->save_in_field() went precisely, without any data rounding or truncation. 
*/ - SEL_ARG_GE(const uchar *key, Field *field) - :SEL_ARG(field, key, key) + SEL_ARG_GE(const uchar *key, Field *field, bool is_asc) + :SEL_ARG(field, is_asc, key, key) { max_flag= NO_MAX_RANGE; } @@ -2028,7 +2032,7 @@ class SEL_ARG_GE: public SEL_ARG */ SEL_ARG_GE(THD *thd, const uchar *key, const KEY_PART *key_part, Field *field, Item *value) - :SEL_ARG(field, key, key) + :SEL_ARG(field, !(key_part->flag & HA_REVERSE_SORT), key, key) { // Don't use open ranges for partial key_segments if ((!(key_part->flag & HA_PART_KEY_SEG)) && @@ -2059,7 +2063,8 @@ SEL_ARG *SEL_ARG::clone(RANGE_OPT_PARAM *param, SEL_ARG *new_parent, } else { - if (!(tmp= new (param->mem_root) SEL_ARG(field,part, min_value,max_value, + if (!(tmp= new (param->mem_root) SEL_ARG(field, part, is_ascending, + min_value, max_value, min_flag, max_flag, maybe_flag))) return 0; // OOM tmp->parent=new_parent; @@ -2830,6 +2835,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, } trace_keypart.end(); trace_idx_details.add("usable", !unusable_has_desc_keyparts); + unusable_has_desc_keyparts= false; if (unusable_has_desc_keyparts) // TODO MDEV-13756 { key_parts= param.key[param.keys]; @@ -4420,12 +4426,14 @@ int find_used_partitions(PART_PRUNE_PARAM *ppar, SEL_ARG *key_tree) key_tree->next_key_part->store_min_key(ppar->key, &tmp_min_key, &tmp_min_flag, - ppar->last_part_partno); + ppar->last_part_partno, + true); if (!tmp_max_flag) key_tree->next_key_part->store_max_key(ppar->key, &tmp_max_key, &tmp_max_flag, - ppar->last_part_partno); + ppar->last_part_partno, + false); flag= tmp_min_flag | tmp_max_flag; } else @@ -8671,7 +8679,8 @@ Item_func_null_predicate::get_mm_leaf(RANGE_OPT_PARAM *param, if (!field->real_maybe_null()) DBUG_RETURN(type == ISNULL_FUNC ? 
&null_element : NULL); SEL_ARG *tree; - if (!(tree= new (alloc) SEL_ARG(field, is_null_string, is_null_string))) + bool is_asc= !(key_part->flag & HA_REVERSE_SORT); + if (!(tree= new (alloc) SEL_ARG(field, is_asc, is_null_string, is_null_string))) DBUG_RETURN(0); if (type == Item_func::ISNOTNULL_FUNC) { @@ -8771,7 +8780,8 @@ Item_func_like::get_mm_leaf(RANGE_OPT_PARAM *param, int2store(min_str + maybe_null, min_length); int2store(max_str + maybe_null, max_length); } - SEL_ARG *tree= new (param->mem_root) SEL_ARG(field, min_str, max_str); + bool is_asc= !(key_part->flag & HA_REVERSE_SORT); + SEL_ARG *tree= new (param->mem_root) SEL_ARG(field, is_asc, min_str, max_str); DBUG_RETURN(tree); } @@ -9019,18 +9029,19 @@ SEL_ARG *Field::stored_field_make_mm_leaf(RANGE_OPT_PARAM *param, if (!(str= make_key_image(param->mem_root, key_part))) DBUG_RETURN(0); + bool is_asc= !(key_part->flag & HA_REVERSE_SORT); switch (op) { case SCALAR_CMP_LE: - DBUG_RETURN(new (mem_root) SEL_ARG_LE(str, this)); + DBUG_RETURN(new (mem_root) SEL_ARG_LE(str, this, is_asc)); case SCALAR_CMP_LT: - DBUG_RETURN(new (mem_root) SEL_ARG_LT(thd, str, this, value)); + DBUG_RETURN(new (mem_root) SEL_ARG_LT(thd, str, this, is_asc, value)); case SCALAR_CMP_GT: DBUG_RETURN(new (mem_root) SEL_ARG_GT(thd, str, key_part, this, value)); case SCALAR_CMP_GE: DBUG_RETURN(new (mem_root) SEL_ARG_GE(thd, str, key_part, this, value)); case SCALAR_CMP_EQ: case SCALAR_CMP_EQUAL: - DBUG_RETURN(new (mem_root) SEL_ARG(this, str, str)); + DBUG_RETURN(new (mem_root) SEL_ARG(this, is_asc, str, str)); break; } DBUG_ASSERT(0); @@ -9048,18 +9059,19 @@ SEL_ARG *Field::stored_field_make_mm_leaf_exact(RANGE_OPT_PARAM *param, if (!(str= make_key_image(param->mem_root, key_part))) DBUG_RETURN(0); + bool is_asc= !(key_part->flag & HA_REVERSE_SORT); switch (op) { case SCALAR_CMP_LE: - DBUG_RETURN(new (param->mem_root) SEL_ARG_LE(str, this)); + DBUG_RETURN(new (param->mem_root) SEL_ARG_LE(str, this, is_asc)); case SCALAR_CMP_LT: - 
DBUG_RETURN(new (param->mem_root) SEL_ARG_LT(str, this)); + DBUG_RETURN(new (param->mem_root) SEL_ARG_LT(str, this, is_asc)); case SCALAR_CMP_GT: DBUG_RETURN(new (param->mem_root) SEL_ARG_GT(str, key_part, this)); case SCALAR_CMP_GE: - DBUG_RETURN(new (param->mem_root) SEL_ARG_GE(str, this)); + DBUG_RETURN(new (param->mem_root) SEL_ARG_GE(str, this, is_asc)); case SCALAR_CMP_EQ: case SCALAR_CMP_EQUAL: - DBUG_RETURN(new (param->mem_root) SEL_ARG(this, str, str)); + DBUG_RETURN(new (param->mem_root) SEL_ARG(this, is_asc, str, str)); break; } DBUG_ASSERT(0); @@ -11777,6 +11789,46 @@ get_quick_select(PARAM *param,uint idx,SEL_ARG *key_tree, uint mrr_flags, } +void SEL_ARG::store_next_min_max_keys(KEY_PART *key, + uchar **cur_min_key, uint *cur_min_flag, + uchar **cur_max_key, uint *cur_max_flag, + int *min_part, int *max_part) +{ + DBUG_ASSERT(next_key_part); + bool asc = next_key_part->is_ascending; + + if (!get_min_flag()) + { + if (asc) + { + *min_part += next_key_part->store_min_key(key, cur_min_key, + cur_min_flag, MAX_KEY, true); + } + else + { + uint tmp_flag = invert_min_flag(*cur_min_flag); + *min_part += next_key_part->store_max_key(key, cur_min_key, &tmp_flag, + MAX_KEY, true); + *cur_min_flag = invert_max_flag(tmp_flag); + } + } + if (!get_max_flag()) + { + if (asc) + { + *max_part += next_key_part->store_max_key(key, cur_max_key, + cur_max_flag, MAX_KEY, false); + } + else + { + uint tmp_flag = invert_max_flag(*cur_max_flag); + *max_part += next_key_part->store_min_key(key, cur_max_key, &tmp_flag, + MAX_KEY, false); + *cur_max_flag = invert_min_flag(tmp_flag); + } + } +} + /* ** Fix this to get all possible sub_ranges */ @@ -11790,17 +11842,19 @@ get_quick_keys(PARAM *param,QUICK_RANGE_SELECT *quick,KEY_PART *key, int min_part= key_tree->part-1, // # of keypart values in min_key buffer max_part= key_tree->part-1; // # of keypart values in max_key buffer - if (key_tree->left != &null_element) + SEL_ARG *next_tree = key_tree->is_ascending ? 
key_tree->left : key_tree->right; + if (next_tree != &null_element) { - if (get_quick_keys(param,quick,key,key_tree->left, + if (get_quick_keys(param,quick,key,next_tree, min_key,min_key_flag, max_key, max_key_flag)) return 1; } uchar *tmp_min_key=min_key,*tmp_max_key=max_key; - min_part+= key_tree->store_min(key[key_tree->part].store_length, - &tmp_min_key,min_key_flag); - max_part+= key_tree->store_max(key[key_tree->part].store_length, - &tmp_max_key,max_key_flag); + + key_tree->store_min_max(key[key_tree->part].store_length, + &tmp_min_key, min_key_flag, + &tmp_max_key, max_key_flag, + &min_part, &max_part); if (key_tree->next_key_part && key_tree->next_key_part->type == SEL_ARG::KEY_RANGE && @@ -11810,31 +11864,40 @@ get_quick_keys(PARAM *param,QUICK_RANGE_SELECT *quick,KEY_PART *key, memcmp(min_key, max_key, (uint)(tmp_max_key - max_key))==0 && key_tree->min_flag==0 && key_tree->max_flag==0) { + // psergey-note: simplified the parameters below as follows: + // min_key_flag | key_tree->min_flag -> min_key_flag + // max_key_flag | key_tree->max_flag -> max_key_flag if (get_quick_keys(param,quick,key,key_tree->next_key_part, - tmp_min_key, min_key_flag | key_tree->min_flag, - tmp_max_key, max_key_flag | key_tree->max_flag)) + tmp_min_key, min_key_flag, + tmp_max_key, max_key_flag)) return 1; goto end; // Ugly, but efficient } { - uint tmp_min_flag=key_tree->min_flag,tmp_max_flag=key_tree->max_flag; - if (!tmp_min_flag) - min_part+= key_tree->next_key_part->store_min_key(key, - &tmp_min_key, - &tmp_min_flag, - MAX_KEY); - if (!tmp_max_flag) - max_part+= key_tree->next_key_part->store_max_key(key, - &tmp_max_key, - &tmp_max_flag, - MAX_KEY); + uint tmp_min_flag= key_tree->get_min_flag(); + uint tmp_max_flag= key_tree->get_max_flag(); + + key_tree->store_next_min_max_keys(key, + &tmp_min_key, &tmp_min_flag, + &tmp_max_key, &tmp_max_flag, + &min_part, &max_part); flag=tmp_min_flag | tmp_max_flag; } } else { - flag = (key_tree->min_flag & GEOM_FLAG) ? 
- key_tree->min_flag : key_tree->min_flag | key_tree->max_flag; + if (key_tree->is_ascending) + { + flag= (key_tree->min_flag & GEOM_FLAG) ? key_tree->min_flag: + (key_tree->min_flag | + key_tree->max_flag); + } + else + { + // Invert flags for DESC keypart + flag= invert_min_flag(key_tree->min_flag) | + invert_max_flag(key_tree->max_flag); + } } /* @@ -11895,8 +11958,9 @@ get_quick_keys(PARAM *param,QUICK_RANGE_SELECT *quick,KEY_PART *key, return 1; end: - if (key_tree->right != &null_element) - return get_quick_keys(param,quick,key,key_tree->right, + next_tree = key_tree->is_ascending ? key_tree->right : key_tree->left; + if (next_tree != &null_element) + return get_quick_keys(param,quick,key,next_tree, min_key,min_key_flag, max_key,max_key_flag); return 0; diff --git a/sql/opt_range.h b/sql/opt_range.h index 1014176ecc5..6864a5c583a 100644 --- a/sql/opt_range.h +++ b/sql/opt_range.h @@ -54,6 +54,33 @@ struct KEY_PART { }; +/** + A helper function to invert min flags to max flags for DESC key parts. + It changes NEAR_MIN, NO_MIN_RANGE to NEAR_MAX, NO_MAX_RANGE appropriately +*/ + +inline uint invert_min_flag(uint min_flag) +{ + uint max_flag_out = min_flag & ~(NEAR_MIN | NO_MIN_RANGE); + if (min_flag & NEAR_MIN) max_flag_out |= NEAR_MAX; + if (min_flag & NO_MIN_RANGE) max_flag_out |= NO_MAX_RANGE; + return max_flag_out; +} + + +/** + A helper function to invert max flags to min flags for DESC key parts. + It changes NEAR_MAX, NO_MAX_RANGE to NEAR_MIN, NO_MIN_RANGE appropriately +*/ + +inline uint invert_max_flag(uint max_flag) +{ + uint min_flag_out = max_flag & ~(NEAR_MAX | NO_MAX_RANGE); + if (max_flag & NEAR_MAX) min_flag_out |= NEAR_MIN; + if (max_flag & NO_MAX_RANGE) min_flag_out |= NO_MIN_RANGE; + return min_flag_out; +} + class RANGE_OPT_PARAM; /* A construction block of the SEL_ARG-graph. 
@@ -267,6 +294,8 @@ class RANGE_OPT_PARAM; - it is a lot easier to compute than computing the number of ranges, - it can be updated incrementally when performing AND/OR operations on parts of the graph. + + 6. For handling DESC keyparts, See HowRangeOptimizerHandlesDescKeyparts */ class SEL_ARG :public Sql_alloc @@ -277,6 +306,11 @@ class SEL_ARG :public Sql_alloc uint8 min_flag,max_flag,maybe_flag; uint8 part; // Which key part uint8 maybe_null; + /* + Whether the keypart is ascending or descending. + See HowRangeOptimizerHandlesDescKeyparts for details. + */ + uint8 is_ascending; /* The ordinal number the least significant component encountered in the ranges of the SEL_ARG tree (the first component has number 1) @@ -327,11 +361,15 @@ class SEL_ARG :public Sql_alloc SEL_ARG() {} SEL_ARG(SEL_ARG &); - SEL_ARG(Field *,const uchar *, const uchar *); - SEL_ARG(Field *field, uint8 part, uchar *min_value, uchar *max_value, + SEL_ARG(Field *, bool is_asc, const uchar *, const uchar *); + SEL_ARG(Field *field, uint8 part, bool is_asc, + uchar *min_value, uchar *max_value, uint8 min_flag, uint8 max_flag, uint8 maybe_flag); + + /* This is used to construct degenerate SEL_ARGS like ALWAYS, IMPOSSIBLE, etc */ SEL_ARG(enum Type type_arg) - :min_flag(0), max_part_no(0) /* first key part means 1. 0 mean 'no parts'*/, + :min_flag(0), is_ascending(false), + max_part_no(0) /* first key part means 1. 
0 mean 'no parts'*/, + elements(1),use_count(1),left(0),right(0), next_key_part(0), color(BLACK), type(type_arg), weight(1) {} @@ -409,19 +447,20 @@ class SEL_ARG :public Sql_alloc { new_max=arg->max_value; flag_max=arg->max_flag; } - return new (thd->mem_root) SEL_ARG(field, part, new_min, new_max, flag_min, + return new (thd->mem_root) SEL_ARG(field, part, is_ascending, + new_min, new_max, flag_min, flag_max, MY_TEST(maybe_flag && arg->maybe_flag)); } SEL_ARG *clone_first(SEL_ARG *arg) { // min <= X < arg->min - return new SEL_ARG(field,part, min_value, arg->min_value, + return new SEL_ARG(field, part, is_ascending, min_value, arg->min_value, min_flag, arg->min_flag & NEAR_MIN ? 0 : NEAR_MAX, maybe_flag | arg->maybe_flag); } SEL_ARG *clone_last(SEL_ARG *arg) { // min <= X <= key_max - return new SEL_ARG(field, part, min_value, arg->max_value, + return new SEL_ARG(field, part, is_ascending, min_value, arg->max_value, min_flag, arg->max_flag, maybe_flag | arg->maybe_flag); } SEL_ARG *clone(RANGE_OPT_PARAM *param, SEL_ARG *new_parent, SEL_ARG **next); @@ -504,6 +543,56 @@ class SEL_ARG :public Sql_alloc return 0; } + /* Save minimum and maximum, taking index order into account */ + void store_min_max(uint length, + uchar **min_key, uint min_flag, + uchar **max_key, uint max_flag, + int *min_part, int *max_part) + { + if (is_ascending) { + *min_part += store_min(length, min_key, min_flag); + *max_part += store_max(length, max_key, max_flag); + } else { + *max_part += store_min(length, max_key, min_flag); + *min_part += store_max(length, min_key, max_flag); + } + } + /* + Get the flag for range's starting endpoint, taking index order into + account. + */ + uint get_min_flag() + { + return (is_ascending ? min_flag : invert_max_flag(max_flag)); + } + /* + Get the flag for range's ending endpoint, taking index order into + account. + */ + uint get_max_flag() + { + return (is_ascending ? 
max_flag : invert_min_flag(min_flag)); + } + /* Get the previous interval, taking index order into account */ + inline SEL_ARG* index_order_prev() + { + return is_ascending? prev: next; + } + /* Get the next interval, taking index order into account */ + inline SEL_ARG* index_order_next() + { + return is_ascending? next: prev; + } + + /* + Produce a single multi-part interval, taking key part ordering into + account. + */ + void store_next_min_max_keys(KEY_PART *key, uchar **cur_min_key, + uint *cur_min_flag, uchar **cur_max_key, + uint *cur_max_flag, int *min_part, + int *max_part); + /* Returns a number of keypart values appended to the key buffer for min key and max key. This function is used by both Range @@ -516,7 +605,8 @@ class SEL_ARG :public Sql_alloc int store_min_key(KEY_PART *key, uchar **range_key, uint *range_key_flag, - uint last_part) + uint last_part, + bool start_key) { SEL_ARG *key_tree= first(); uint res= key_tree->store_min(key[key_tree->part].store_length, @@ -525,15 +615,26 @@ class SEL_ARG :public Sql_alloc if (!res) return 0; *range_key_flag|= key_tree->min_flag; - if (key_tree->next_key_part && - key_tree->next_key_part->type == SEL_ARG::KEY_RANGE && + SEL_ARG *nkp= key_tree->next_key_part; + if (nkp && nkp->type == SEL_ARG::KEY_RANGE && key_tree->part != last_part && - key_tree->next_key_part->part == key_tree->part+1 && + nkp->part == key_tree->part+1 && !(*range_key_flag & (NO_MIN_RANGE | NEAR_MIN))) - res+= key_tree->next_key_part->store_min_key(key, - range_key, - range_key_flag, - last_part); + { + const bool asc = nkp->is_ascending; + if (start_key == asc) + { + res+= nkp->store_min_key(key, range_key, range_key_flag, last_part, + start_key); + } + else + { + uint tmp_flag = invert_min_flag(*range_key_flag); + res += nkp->store_max_key(key, range_key, &tmp_flag, last_part, + start_key); + *range_key_flag = invert_max_flag(tmp_flag); + } + } return res; } @@ -541,7 +642,8 @@ class SEL_ARG :public Sql_alloc int store_max_key(KEY_PART 
*key, uchar **range_key, uint *range_key_flag, - uint last_part) + uint last_part, + bool start_key) { SEL_ARG *key_tree= last(); uint res=key_tree->store_max(key[key_tree->part].store_length, @@ -549,15 +651,26 @@ class SEL_ARG :public Sql_alloc if (!res) return 0; *range_key_flag|= key_tree->max_flag; - if (key_tree->next_key_part && - key_tree->next_key_part->type == SEL_ARG::KEY_RANGE && + SEL_ARG *nkp= key_tree->next_key_part; + if (nkp && nkp->type == SEL_ARG::KEY_RANGE && key_tree->part != last_part && - key_tree->next_key_part->part == key_tree->part+1 && + nkp->part == key_tree->part+1 && !(*range_key_flag & (NO_MAX_RANGE | NEAR_MAX))) - res+= key_tree->next_key_part->store_max_key(key, - range_key, - range_key_flag, - last_part); + { + const bool asc = nkp->is_ascending; + if ((!start_key && asc) || (start_key && !asc)) + { + res += nkp->store_max_key(key, range_key, range_key_flag, last_part, + start_key); + } + else + { + uint tmp_flag = invert_max_flag(*range_key_flag); + res += nkp->store_min_key(key, range_key, &tmp_flag, last_part, + start_key); + *range_key_flag = invert_min_flag(tmp_flag); + } + } return res; } @@ -661,13 +774,83 @@ class SEL_ARG :public Sql_alloc SEL_ARG *clone_tree(RANGE_OPT_PARAM *param); }; +/* + HowRangeOptimizerHandlesDescKeyparts + ==================================== + + Starting with MySQL-8.0 and MariaDB 10.8, index key parts may be descending, + for example: + + INDEX idx1(col1, col2 DESC, col3, col4 DESC) + + Range Optimizer handles this as follows: + + The SEL_ARG object has SEL_ARG::is_ascending which specifies whether the + keypart is ascending. + + Other than that, the SEL_ARG graph is built without any regard to DESC + keyparts. + + For example, for an index + + INDEX idx2(kp1 DESC, kp2) + + and range + + kp1 BETWEEN 10 and 20 (RANGE-1) + + the SEL_ARG will have min_value=10, max_value=20, is_ascending=false. 
+ + The ordering of key parts is taken into account when SEL_ARG graph is + linearized to ranges, in sel_arg_range_seq_next() and get_quick_keys(). + + The storage engine expects the first bound to be the first in the index and + the last bound to be the last, that is, for (RANGE-1) we will flip min and + max and generate these key_range structures: + + start.key='20' , end.key='10' + + See SEL_ARG::store_min_max(). The flag values are flipped as well, see + SEL_ARG::get_min_flag(), get_max_flag(). + + == Handling multiple key parts == + + For multi-part keys, the order of key parts has an effect on which ranges are + generated. Consider + + kp1 >= 10 AND kp2 >'foo' + + for INDEX(kp1 ASC, kp2 ASC) the range will be + + (kp1, kp2) > (10, 'foo') + + while for INDEX(kp1 ASC, kp2 DESC) it will be just + + kp1 >= 10 + + Another example: + + (kp1 BETWEEN 10 AND 20) AND (kp2 BETWEEN 'foo' AND 'quux') + + with INDEX (kp1 ASC, kp2 ASC) will generate + + (10, 'foo') <= (kp1, kp2) < (20, 'quux') + + while with index INDEX (kp1 ASC, kp2 DESC) it will generate + + (10, 'quux') <= (kp1, kp2) < (20, 'foo') + + This is again achieved by sel_arg_range_seq_next() and get_quick_keys() + flipping SEL_ARG's min,max, their flags and next/prev as needed. 
+*/ + extern MYSQL_PLUGIN_IMPORT SEL_ARG null_element; class SEL_ARG_IMPOSSIBLE: public SEL_ARG { public: SEL_ARG_IMPOSSIBLE(Field *field) - :SEL_ARG(field, 0, 0) + :SEL_ARG(field, false, 0, 0) { type= SEL_ARG::IMPOSSIBLE; } diff --git a/sql/opt_range_mrr.cc b/sql/opt_range_mrr.cc index 20413f5df63..8877e15d5b5 100644 --- a/sql/opt_range_mrr.cc +++ b/sql/opt_range_mrr.cc @@ -34,7 +34,7 @@ typedef struct st_range_seq_entry uint min_key_flag, max_key_flag; /* Number of key parts */ - uint min_key_parts, max_key_parts; + int min_key_parts, max_key_parts; SEL_ARG *key_tree; } RANGE_SEQ_ENTRY; @@ -105,13 +105,14 @@ static void step_down_to(SEL_ARG_RANGE_SEQ *arg, SEL_ARG *key_tree) cur->max_key_parts= prev->max_key_parts; uint16 stor_length= arg->param->key[arg->keyno][key_tree->part].store_length; - cur->min_key_parts += key_tree->store_min(stor_length, &cur->min_key, - prev->min_key_flag); - cur->max_key_parts += key_tree->store_max(stor_length, &cur->max_key, - prev->max_key_flag); - cur->min_key_flag= prev->min_key_flag | key_tree->min_flag; - cur->max_key_flag= prev->max_key_flag | key_tree->max_flag; + key_tree->store_min_max(stor_length, + &cur->min_key, prev->min_key_flag, + &cur->max_key, prev->max_key_flag, + &cur->min_key_parts, &cur->max_key_parts); + + cur->min_key_flag= prev->min_key_flag | key_tree->get_min_flag(); + cur->max_key_flag= prev->max_key_flag | key_tree->get_max_flag(); if (key_tree->is_null_interval()) cur->min_key_flag |= NULL_RANGE; @@ -165,12 +166,12 @@ bool sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) /* Ok, we're at some "full tuple" position in the tree */ /* Step down if we can */ - if (key_tree->next && key_tree->next != &null_element) + if (key_tree->index_order_next() && key_tree->index_order_next() != &null_element) { //step down; (update the tuple, we'll step right and stay there) seq->i--; - step_down_to(seq, key_tree->next); - key_tree= key_tree->next; + step_down_to(seq, key_tree->index_order_next()); + 
key_tree= key_tree->index_order_next(); seq->is_ror_scan= FALSE; goto walk_right_n_up; } @@ -185,12 +186,12 @@ bool sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) key_tree= seq->stack[seq->i].key_tree; /* Step down if we can */ - if (key_tree->next && key_tree->next != &null_element) + if (key_tree->index_order_next() && key_tree->index_order_next() != &null_element) { // Step down; update the tuple seq->i--; - step_down_to(seq, key_tree->next); - key_tree= key_tree->next; + step_down_to(seq, key_tree->index_order_next()); + key_tree= key_tree->index_order_next(); break; } } @@ -214,16 +215,10 @@ bool sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) !key_tree->min_flag && !key_tree->max_flag)) { seq->is_ror_scan= FALSE; - if (!key_tree->min_flag) - cur->min_key_parts += - key_tree->next_key_part->store_min_key(seq->param->key[seq->keyno], - &cur->min_key, - &cur->min_key_flag, MAX_KEY); - if (!key_tree->max_flag) - cur->max_key_parts += - key_tree->next_key_part->store_max_key(seq->param->key[seq->keyno], - &cur->max_key, - &cur->max_key_flag, MAX_KEY); + key_tree->store_next_min_max_keys(seq->param->key[seq->keyno], + &cur->min_key, &cur->min_key_flag, + &cur->max_key, &cur->max_key_flag, + &cur->min_key_parts, &cur->max_key_parts); break; } } @@ -235,10 +230,11 @@ bool sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) key_tree= key_tree->next_key_part; walk_up_n_right: - while (key_tree->prev && key_tree->prev != &null_element) + while (key_tree->index_order_prev() && + key_tree->index_order_prev() != &null_element) { /* Step up */ - key_tree= key_tree->prev; + key_tree= key_tree->index_order_prev(); } step_down_to(seq, key_tree); }

1 0

[Commits] 8708095a025: MDEV-27230: Estimation for filtered rows less precise ...
by psergey 13 Dec '21

13 Dec '21

revision-id: 8708095a0256add4d06abf2d04a91f4f5b7800fb (mariadb-10.6.1-330-g8708095a025) parent(s): 917f636563ea2e300c7f7f9ab0f240749525da51 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-13 23:46:04 +0300 message: MDEV-27230: Estimation for filtered rows less precise ... Fix the code in Histogram_json_hb::range_selectivity that handles special cases: a non-inclusive endpoint hitting a bucket boundary... --- mysql-test/main/statistics_json.result | 17 +++++++++++++++++ mysql-test/main/statistics_json.test | 10 ++++++++++ sql/opt_histogram_json.cc | 1 + 3 files changed, 28 insertions(+) diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index 7373da0570b..a54d5fd4151 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -8194,3 +8194,20 @@ analyze select * from t2 where a =100; id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra 1 SIMPLE t2 ALL NULL NULL NULL NULL 1011 1011.00 0.10 0.10 Using where drop table t0,t1,t2; +# +# MDEV-27230: Estimation for filtered rows less precise ... 
+# +create table t1 (a char(1)); +insert into t1 select chr(seq%26+97) from seq_1_to_50; +insert into t1 select ':' from t1; +analyze table t1 persistent for all; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +analyze select COUNT(*) FROM t1 WHERE a <> 'a'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 99.00 99.00 Using where +analyze select COUNT(*) FROM t1 WHERE a < 'a'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where +drop table t1; diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test index b67df41d9ba..024cb55e540 100644 --- a/mysql-test/main/statistics_json.test +++ b/mysql-test/main/statistics_json.test @@ -380,3 +380,13 @@ analyze select * from t2 where a =100; drop table t0,t1,t2; +--echo # +--echo # MDEV-27230: Estimation for filtered rows less precise ... +--echo # +create table t1 (a char(1)); +insert into t1 select chr(seq%26+97) from seq_1_to_50; +insert into t1 select ':' from t1; +analyze table t1 persistent for all; +analyze select COUNT(*) FROM t1 WHERE a <> 'a'; +analyze select COUNT(*) FROM t1 WHERE a < 'a'; +drop table t1; diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index d65bb225477..2ee6cd73dbe 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -1022,6 +1022,7 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp, $CONST. Move to the previous bucket. */ idx--; + equal= false; } double left_fract= get_left_fract(idx);

1 0

[Commits] 917f636563e: MDEV-27203: Valgrind / MSAN errors in Histogram_json_hb::parse_bucket
by psergey 13 Dec '21

13 Dec '21

revision-id: 917f636563ea2e300c7f7f9ab0f240749525da51 (mariadb-10.6.1-329-g917f636563e) parent(s): da3231a86f90cdd1387b4d36f9b6a4b5df9ae5fc author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-13 22:54:33 +0300 message: MDEV-27203: Valgrind / MSAN errors in Histogram_json_hb::parse_bucket In read_bucket_endpoint(), handle all possible parser states. --- mysql-test/main/statistics_json.result | 2 +- sql/opt_histogram_json.cc | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index 50a79d8f834..7373da0570b 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -4688,7 +4688,7 @@ explain select * from t1_json limit 1; id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 Warnings: -Warning 4186 Failed to parse histogram for table test.t1_json: "size" element not present at offset 28. +Warning 4186 Failed to parse histogram for table test.t1_json: String or number expected at offset 27. UPDATE mysql.column_stats SET histogram='{"histogram_hb":[{"start":"aaa", "size":"not-an-integer"}]}' WHERE table_name='t1_json'; diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index 5d92c9535a4..d65bb225477 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -480,6 +480,13 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out, if (json_read_value(je)) return true; + if (je->value_type != JSON_VALUE_STRING && + je->value_type != JSON_VALUE_NUMBER) + { + *err= "String or number expected"; + return true; + } + const char* je_value= (const char*)je->value; if (je->value_type == JSON_VALUE_STRING && je->value_escaped) {

1 0

[Commits] fa7a3d580: Range Locking: add support for escalation barriers
by psergey 13 Dec '21

13 Dec '21

revision-id: fa7a3d580fc0f3b8217f3d61b22554469527fbea (v6.26.0-142-gfa7a3d580) parent(s): eca85cdb6642c80ee1ac60eb758c7bd2627759f5 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-13 19:38:17 +0300 message: Range Locking: add support for escalation barriers Range Locking supports Lock Escalation. Lock Escalation is invoked when lock memory is nearly exhausted and it reduces the amount of memory used by joining adjacent locks. Bridging the gap between certain locks has adverse effects. For example, in MyRocks it is not a good idea to bridge the gap between locks in different indexes, as that makes the lock cover large portions of indexes, or even entire indexes. Resolve this by introducing an Escalation Barrier. The escalation process will call the user-provided barrier callback function: bool(const Endpoint& a, const Endpoint& b) If the function returns true, there's a barrier between a and b and Lock Escalation will not try to bridge the gap between a and b. --- include/rocksdb/utilities/transaction_db.h | 6 +++ .../transactions/lock/range/range_locking_test.cc | 53 ++++++++++++++++++++++ .../lock/range/range_tree/lib/locktree/locktree.cc | 19 +++++++- .../lock/range/range_tree/lib/locktree/locktree.h | 14 ++++++ .../range/range_tree/range_tree_lock_manager.cc | 30 +++++++++++- .../range/range_tree/range_tree_lock_manager.h | 12 ++++- 6 files changed, 130 insertions(+), 4 deletions(-) diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 265d4b79a..ca850cbdf 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -97,6 +97,12 @@ class RangeLockManagerHandle : public LockManagerHandle { using RangeLockStatus = std::unordered_multimap<ColumnFamilyId, RangeLockInfo>; + // Lock escalation barrier function. It returns true if lock escalation + // is not allowed to bridge the gap between the endpoints a and b. 
+ using EscalationBarrierFunc = std::function<bool(const Endpoint& a, + const Endpoint& b)>; + virtual void SetEscalationBarrierFunc(EscalationBarrierFunc func) = 0; + virtual RangeLockStatus GetRangeLockStatusData() = 0; class Counters { diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc index c881b68cb..be94623f9 100644 --- a/utilities/transactions/lock/range/range_locking_test.cc +++ b/utilities/transactions/lock/range/range_locking_test.cc @@ -286,6 +286,59 @@ TEST_F(RangeLockingTest, BasicLockEscalation) { delete txn; } + +/* + An escalation barrier function. Allow escalation iff the first two bytes are + identical. +*/ +static bool escalation_barrier(const Endpoint& a, const Endpoint& b) { + assert(a.slice.size()>2); + assert(b.slice.size()>2); + if (memcmp(a.slice.data(), b.slice.data(), 2)) + return true; // This is a barrier + else + return false; // No barrier +} + +TEST_F(RangeLockingTest, LockEscalationBarrier) { + auto cf = db->DefaultColumnFamily(); + + auto counters = range_lock_mgr->GetStatus(); + + // Initially not using any lock memory + ASSERT_EQ(counters.escalation_count, 0); + + range_lock_mgr->SetMaxLockMemory(8000); + range_lock_mgr->SetEscalationBarrierFunc(escalation_barrier); + + // Insert enough locks to cause lock escalations to happen + auto txn = NewTxn(); + const int N=2000; + for (int i = 0; i < N; i++) { + char buf[32]; + snprintf(buf, sizeof(buf) - 1, "%04d", i); + ASSERT_OK(txn->GetRangeLock(cf, Endpoint(buf), Endpoint(buf))); + } + counters = range_lock_mgr->GetStatus(); + ASSERT_GT(counters.escalation_count, 0); + + // Check that lock escalation was not performed across escalation barriers: + // Use another txn to acquire locks near the barriers. 
+ auto txn2 = NewTxn(); + range_lock_mgr->SetMaxLockMemory(500000); + for (int i = 100; i < N; i+=100) { + char buf[32]; + snprintf(buf, sizeof(buf) - 1, "%04d-a", i-1); + // Check that we CAN get a lock near the escalation barrier + ASSERT_OK(txn2->GetRangeLock(cf, Endpoint(buf), Endpoint(buf))); + } + + txn->Rollback(); + txn2->Rollback(); + delete txn; + delete txn2; +} + #endif void PointLockManagerTestExternalSetup(PointLockManagerTest* self) { diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc index c238b0204..0959beced 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -96,9 +96,19 @@ void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id, m_sto_end_early_count = 0; m_sto_end_early_time = 0; + m_escalation_barrier = [](const DBT*, const DBT*, void *) -> bool { return false; }; + m_lock_request_info.init(mutex_factory); } +void +locktree::set_escalation_barrier_func(lt_escalation_barrier_check_func func, + void *extra) { + m_escalation_barrier= func; + m_escalation_barrier_arg= extra; +} + + void lt_lock_request_info::init(toku_external_mutex_factory_t mutex_factory) { pending_lock_requests.create(); pending_is_empty = true; @@ -863,14 +873,19 @@ void locktree::escalate(lt_escalate_cb after_escalate_callback, // - belongs to a different txnid, or // - belongs to several txnids, or // - is a shared lock (we could potentially merge those but - // currently we don't) + // currently we don't), or + // - is across a lock escalation barrier. 
int next_txnid_index = current_index + 1; while (next_txnid_index < num_extracted && (extracted_buf[current_index].txnid == extracted_buf[next_txnid_index].txnid) && !extracted_buf[next_txnid_index].is_shared && - !extracted_buf[next_txnid_index].owners) { + !extracted_buf[next_txnid_index].owners && + !m_escalation_barrier( + extracted_buf[current_index].range.get_right_key(), + extracted_buf[next_txnid_index].range.get_left_key(), + m_escalation_barrier_arg)) { next_txnid_index++; } diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h index 3e438f502..e416ac5a3 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h @@ -85,6 +85,9 @@ typedef void (*lt_destroy_cb)(locktree *lt); typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra); +typedef bool (*lt_escalation_barrier_check_func)(const DBT* a, const DBT *b, + void* extra); + struct lt_counters { uint64_t wait_count, wait_time; uint64_t long_wait_count, long_wait_time; @@ -343,6 +346,14 @@ class locktree { void set_comparator(const comparator &cmp); + // Escalation barrier prevents lock escalation. + // For two keys A and B, if escalation_barrier_check_func(A, B)==true, then + // there's a barrier between them, and lock escalation is not allowed to + // bridge the gap between A and B. + // This method sets the user-provided barrier check function. + void set_escalation_barrier_func(lt_escalation_barrier_check_func func, + void *extra); + int compare(const locktree *lt) const; DICTIONARY_ID get_dict_id() const; @@ -373,6 +384,9 @@ class locktree { // userdata pointer below. 
see locktree_manager::get_lt w/ on_create_extra comparator m_cmp; + lt_escalation_barrier_check_func m_escalation_barrier; + void *m_escalation_barrier_arg; + concurrent_tree *m_rangetree; void *m_userdata; diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc index 6dfb78d3f..b1788b9b7 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc @@ -58,6 +58,16 @@ void deserialize_endpoint(const DBT* dbt, EndpointWithString* endp) { endp->slice.assign(dbt_data + 1, dbt->size - 1); } +// Same as above, but decode into Endpoint structure +void deserialize_endpoint(const DBT* dbt, Endpoint* endp) { + assert(dbt->size >= 1); + const char* dbt_data = (const char*)dbt->data; + char suffix = dbt_data[0]; + assert(suffix == SUFFIX_INFIMUM || suffix == SUFFIX_SUPREMUM); + endp->inf_suffix = (suffix == SUFFIX_SUPREMUM); + endp->slice= Slice(dbt_data + 1, dbt->size - 1); +} + // Get a range lock on [start_key; end_key] range Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn, uint32_t column_family_id, @@ -262,6 +272,24 @@ RangeTreeLockManager::RangeTreeLockManager( ltm_.create(on_create, on_destroy, on_escalate, nullptr, mutex_factory_); } +int RangeTreeLockManager::on_create(locktree* lt, void *arg) +{ + // arg is a pointer to RangeTreeLockManager + lt->set_escalation_barrier_func(&OnEscalationBarrierCheck, arg); + return 0; +} + +bool RangeTreeLockManager::OnEscalationBarrierCheck(const DBT *a, + const DBT *b, + void *extra) +{ + Endpoint a_endp, b_endp; + deserialize_endpoint(a, &a_endp); + deserialize_endpoint(b, &b_endp); + auto self = (RangeTreeLockManager*) extra; + return self->barrier_func_(a_endp, b_endp); +} + void RangeTreeLockManager::SetRangeDeadlockInfoBufferSize( uint32_t target_size) { dlock_buffer_.Resize(target_size); @@ -351,7 +379,7 @@ 
void RangeTreeLockManager::AddColumnFamily(const ColumnFamilyHandle* cfh) { toku::comparator cmp; cmp.create(CompareDbtEndpoints, (void*)cfh->GetComparator()); toku::locktree* ltree = ltm_.get_lt(dict_id, cmp, - /* on_create_extra*/ nullptr); + /* on_create_extra*/ (void*)this); // This is ok to because get_lt has copied the comparator: cmp.destroy(); diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h index 5d55ded02..3cbae850c 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h @@ -93,9 +93,16 @@ class RangeTreeLockManager : public RangeLockManagerBase, // Get the locktree which stores locks for the Column Family with given cf_id std::shared_ptr<locktree> GetLockTreeForCF(ColumnFamilyId cf_id); + void SetEscalationBarrierFunc(EscalationBarrierFunc func) override { + barrier_func_ = func; + } + private: toku::locktree_manager ltm_; + EscalationBarrierFunc barrier_func_ = + [](const Endpoint&, const Endpoint&)->bool { return false; }; + std::shared_ptr<TransactionDBMutexFactory> mutex_factory_; // Map from cf_id to locktree*. Can only be accessed while holding the @@ -116,10 +123,13 @@ class RangeTreeLockManager : public RangeLockManagerBase, static int CompareDbtEndpoints(void* arg, const DBT* a_key, const DBT* b_key); // Callbacks - static int on_create(locktree*, void*) { return 0; /* no error */ } + static int on_create(locktree*, void*); static void on_destroy(locktree*) {} static void on_escalate(TXNID txnid, const locktree* lt, const range_buffer& buffer, void* extra); + + static bool OnEscalationBarrierCheck(const DBT *a, const DBT *b, + void *extra); }; void serialize_endpoint(const Endpoint& endp, std::string* buf);

1 0

[Commits] 15e9afd84b9: Code cleanup: don't call subquery_types_allow_materialization() on prepare
by psergey 09 Dec '21

09 Dec '21

revision-id: 15e9afd84b94c00cbeea8ca2719799b4759cbb9c (mariadb-10.6.1-243-g15e9afd84b9) parent(s): c88e37ff857a83387c4d86829fbaf2e277e4cf9f author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-09 20:13:35 +0300 message: Code cleanup: don't call subquery_types_allow_materialization() on prepare For subqueries that are processed as semi-joins. --- sql/opt_subselect.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc index 596b5169659..f2d395a9ee2 100644 --- a/sql/opt_subselect.cc +++ b/sql/opt_subselect.cc @@ -703,7 +703,7 @@ int check_and_do_in_subquery_rewrites(JOIN *join) { DBUG_PRINT("info", ("Subquery is semi-join conversion candidate")); - (void)subquery_types_allow_materialization(thd, in_subs); + //(void)subquery_types_allow_materialization(thd, in_subs); in_subs->is_flattenable_semijoin= TRUE; @@ -1271,6 +1271,7 @@ bool convert_join_subqueries_to_semijoins(JOIN *join) while ((in_subq= li++)) { bool remove_item= TRUE; + subquery_types_allow_materialization(thd, in_subq); /* Stop processing if we've reached a subquery that's attached to the ON clause */ if (in_subq->do_not_convert_to_sj)

1 0

[Commits] da3231a86f9: MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name
by psergey 09 Dec '21

09 Dec '21

revision-id: da3231a86f90cdd1387b4d36f9b6a4b5df9ae5fc (mariadb-10.6.1-328-gda3231a86f9) parent(s): 98cb4351e141dcf65ea5a30f9f8c4bd352e374ea author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-09 20:06:54 +0300 message: MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name [Adjusting Sergei Krivonos's patch] "duplicates_removal" may contain multiple elements inside it and so should have a JSON array as a value (and not object). --- mysql-test/main/explain_json.result | 85 +++++++++++++++++++++++++++++++------ mysql-test/main/explain_json.test | 10 +++++ sql/sql_explain.cc | 4 ++ 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/mysql-test/main/explain_json.result b/mysql-test/main/explain_json.result index 35f2d11c8b6..810ececc65c 100644 --- a/mysql-test/main/explain_json.result +++ b/mysql-test/main/explain_json.result @@ -842,20 +842,22 @@ EXPLAIN } }, { - "duplicates_removal": { - "block-nl-join": { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 10, - "filtered": 100 - }, - "buffer_type": "flat", - "buffer_size": "206", - "join_type": "BNL", - "attached_condition": "t1.b = t2.b and t1.a = t2.a" + "duplicates_removal": [ + { + "block-nl-join": { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 10, + "filtered": 100 + }, + "buffer_type": "flat", + "buffer_size": "206", + "join_type": "BNL", + "attached_condition": "t1.b = t2.b and t1.a = t2.a" + } } - } + ] } ] } @@ -1941,3 +1943,60 @@ EXPLAIN } } drop table t1; +# +# MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name == named_item_expected()' failed +# +CREATE TABLE t1 (a INT); +INSERT INTO t1 VALUES (1),(2); +explain FORMAT=JSON +SELECT * FROM t1 t0 +WHERE t0.a IN (SELECT t2.a FROM t1 t2 WHERE t0.a IN (SELECT t3.a FROM t1 t3)); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "nested_loop": [ + { + "table": { + "table_name": "t0", + "access_type": "ALL", + "rows": 2, + "filtered": 
100 + } + }, + { + "duplicates_removal": [ + { + "block-nl-join": { + "table": { + "table_name": "t2", + "access_type": "ALL", + "rows": 2, + "filtered": 100 + }, + "buffer_type": "flat", + "buffer_size": "152", + "join_type": "BNL", + "attached_condition": "t2.a = t0.a" + } + }, + { + "block-nl-join": { + "table": { + "table_name": "t3", + "access_type": "ALL", + "rows": 2, + "filtered": 100 + }, + "buffer_type": "incremental", + "buffer_size": "109", + "join_type": "BNL", + "attached_condition": "t3.a = t0.a" + } + } + ] + } + ] + } +} +DROP TABLE t1; diff --git a/mysql-test/main/explain_json.test b/mysql-test/main/explain_json.test index cfbc0cfa10c..3767939d3e3 100644 --- a/mysql-test/main/explain_json.test +++ b/mysql-test/main/explain_json.test @@ -419,3 +419,13 @@ explain format=json select * from t1 order by a desc, b desc; explain format=json select * from t1 order by a desc, b ; drop table t1; +--echo # +--echo # MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name == named_item_expected()' failed +--echo # + +CREATE TABLE t1 (a INT); +INSERT INTO t1 VALUES (1),(2); +explain FORMAT=JSON +SELECT * FROM t1 t0 +WHERE t0.a IN (SELECT t2.a FROM t1 t2 WHERE t0.a IN (SELECT t3.a FROM t1 t3)); +DROP TABLE t1; diff --git a/sql/sql_explain.cc b/sql/sql_explain.cc index 44654dc2e3d..4fd4e6d3b77 100644 --- a/sql/sql_explain.cc +++ b/sql/sql_explain.cc @@ -1100,12 +1100,16 @@ print_explain_json_interns(Explain_query *query, { writer->start_object(); writer->add_member("duplicates_removal"); + writer->start_array(); } join_tabs[i]->print_explain_json(query, writer, is_analyze); if (join_tabs[i]->end_dups_weedout) + { + writer->end_array(); writer->end_object(); + } } } // "nested_loop" print_explain_json_for_children(query, writer, is_analyze);

1 0

[Commits] c88e37ff857: MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name == named_item_expected()' failed
by psergey 09 Dec '21

09 Dec '21

revision-id: c88e37ff857a83387c4d86829fbaf2e277e4cf9f (mariadb-10.6.1-242-gc88e37ff857) parent(s): 1e8bcbd0a0bfa07052e9458830672ea215c8664a author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-09 16:49:40 +0300 message: MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name [Adjusting Sergei Krivonos's patch] "duplicates_removal" may contain multiple elements inside it and so should have a JSON array as a value (and not object). --- mysql-test/main/explain_json.result | 85 +++++++++++++++++++++++++++++++------ mysql-test/main/explain_json.test | 10 +++++ sql/sql_explain.cc | 4 ++ 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/mysql-test/main/explain_json.result b/mysql-test/main/explain_json.result index 35f2d11c8b6..810ececc65c 100644 --- a/mysql-test/main/explain_json.result +++ b/mysql-test/main/explain_json.result @@ -842,20 +842,22 @@ EXPLAIN } }, { - "duplicates_removal": { - "block-nl-join": { - "table": { - "table_name": "t1", - "access_type": "ALL", - "rows": 10, - "filtered": 100 - }, - "buffer_type": "flat", - "buffer_size": "206", - "join_type": "BNL", - "attached_condition": "t1.b = t2.b and t1.a = t2.a" + "duplicates_removal": [ + { + "block-nl-join": { + "table": { + "table_name": "t1", + "access_type": "ALL", + "rows": 10, + "filtered": 100 + }, + "buffer_type": "flat", + "buffer_size": "206", + "join_type": "BNL", + "attached_condition": "t1.b = t2.b and t1.a = t2.a" + } } - } + ] } ] } @@ -1941,3 +1943,60 @@ EXPLAIN } } drop table t1; +# +# MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name == named_item_expected()' failed +# +CREATE TABLE t1 (a INT); +INSERT INTO t1 VALUES (1),(2); +explain FORMAT=JSON +SELECT * FROM t1 t0 +WHERE t0.a IN (SELECT t2.a FROM t1 t2 WHERE t0.a IN (SELECT t3.a FROM t1 t3)); +EXPLAIN +{ + "query_block": { + "select_id": 1, + "nested_loop": [ + { + "table": { + "table_name": "t0", + "access_type": "ALL", + "rows": 2, + "filtered": 
100 + } + }, + { + "duplicates_removal": [ + { + "block-nl-join": { + "table": { + "table_name": "t2", + "access_type": "ALL", + "rows": 2, + "filtered": 100 + }, + "buffer_type": "flat", + "buffer_size": "152", + "join_type": "BNL", + "attached_condition": "t2.a = t0.a" + } + }, + { + "block-nl-join": { + "table": { + "table_name": "t3", + "access_type": "ALL", + "rows": 2, + "filtered": 100 + }, + "buffer_type": "incremental", + "buffer_size": "109", + "join_type": "BNL", + "attached_condition": "t3.a = t0.a" + } + } + ] + } + ] + } +} +DROP TABLE t1; diff --git a/mysql-test/main/explain_json.test b/mysql-test/main/explain_json.test index cfbc0cfa10c..3767939d3e3 100644 --- a/mysql-test/main/explain_json.test +++ b/mysql-test/main/explain_json.test @@ -419,3 +419,13 @@ explain format=json select * from t1 order by a desc, b desc; explain format=json select * from t1 order by a desc, b ; drop table t1; +--echo # +--echo # MDEV-27204: [ERROR] Json_writer: a member name was expected, Assertion `got_name == named_item_expected()' failed +--echo # + +CREATE TABLE t1 (a INT); +INSERT INTO t1 VALUES (1),(2); +explain FORMAT=JSON +SELECT * FROM t1 t0 +WHERE t0.a IN (SELECT t2.a FROM t1 t2 WHERE t0.a IN (SELECT t3.a FROM t1 t3)); +DROP TABLE t1; diff --git a/sql/sql_explain.cc b/sql/sql_explain.cc index 44654dc2e3d..4fd4e6d3b77 100644 --- a/sql/sql_explain.cc +++ b/sql/sql_explain.cc @@ -1100,12 +1100,16 @@ print_explain_json_interns(Explain_query *query, { writer->start_object(); writer->add_member("duplicates_removal"); + writer->start_array(); } join_tabs[i]->print_explain_json(query, writer, is_analyze); if (join_tabs[i]->end_dups_weedout) + { + writer->end_array(); writer->end_object(); + } } } // "nested_loop" print_explain_json_for_children(query, writer, is_analyze);

1 0

[Commits] aa9b8132f37: MDEV-27149: Add rocksdb_ignore_datadic_errors
by psergey 08 Dec '21

08 Dec '21

revision-id: aa9b8132f374e508246499d3ae4338241a08049d (mariadb-10.4.13-181-gaa9b8132f37) parent(s): caa727fd7134e23bf2244e734434014e70c95e27 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-08 23:44:09 +0300 message: MDEV-27149: Add rocksdb_ignore_datadic_errors Add a --rocksdb_ignore_datadic_errors plugin option for MyRocks. The default is 0, and this means MyRocks will call abort() if it detects a DDL mismatch. Setting rocksdb_ignore_datadic_errors=1 makes MyRocks try to ignore the errors and allows the server to start for repairs. --- storage/rocksdb/ha_rocksdb.cc | 35 +++++++++++++++++++++- storage/rocksdb/ha_rocksdb.h | 2 ++ .../rocksdb/mysql-test/rocksdb/r/rocksdb.result | 1 + .../r/rocksdb_ignore_datadic_errors.result | 7 +++++ .../t/rocksdb_ignore_datadic_errors.test | 6 ++++ storage/rocksdb/rdb_datadic.cc | 6 ++++ 6 files changed, 56 insertions(+), 1 deletion(-) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 938be7030bb..452b670883a 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -638,6 +638,8 @@ static my_bool rocksdb_large_prefix = 0; static my_bool rocksdb_allow_to_start_after_corruption = 0; static char* rocksdb_git_hash; +uint32_t rocksdb_ignore_datadic_errors = 0; + char *compression_types_val= const_cast<char*>(get_rocksdb_supported_compression_types()); static unsigned long rocksdb_write_policy = @@ -1908,6 +1910,15 @@ static MYSQL_SYSVAR_UINT( nullptr, nullptr, 1 /* default value */, 0 /* min value */, 2 /* max value */, 0); +static MYSQL_SYSVAR_UINT( + ignore_datadic_errors, rocksdb_ignore_datadic_errors, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Ignore MyRocks' data directory errors. " + "(CAUTION: Use only to start the server and perform repairs. 
Do NOT use " + "for regular operation)", + nullptr, nullptr, 0 /* default value */, 0 /* min value */, + 1 /* max value */, 0); + static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, "RocksDB data directory", nullptr, nullptr, @@ -2143,6 +2154,8 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = { MYSQL_SYSVAR(rollback_on_timeout), MYSQL_SYSVAR(enable_insert_with_update_caching), + + MYSQL_SYSVAR(ignore_datadic_errors), nullptr}; static rocksdb::WriteOptions rdb_get_rocksdb_write_options( @@ -5205,6 +5218,13 @@ static int rocksdb_init_func(void *const p) { DBUG_RETURN(1); } + if (rocksdb_ignore_datadic_errors) + { + sql_print_information( + "CAUTION: Running with rocksdb_ignore_datadic_errors=1. " + " This should only be used to perform repairs"); + } + if (rdb_check_rocksdb_corruption()) { // NO_LINT_DEBUG sql_print_error( @@ -5636,7 +5656,14 @@ static int rocksdb_init_func(void *const p) { if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) { // NO_LINT_DEBUG sql_print_error("RocksDB: Failed to initialize DDL manager."); - DBUG_RETURN(HA_EXIT_FAILURE); + + if (rocksdb_ignore_datadic_errors) + { + sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, " + "trying to continue"); + } + else + DBUG_RETURN(HA_EXIT_FAILURE); } Rdb_sst_info::init(rdb); @@ -11613,6 +11640,12 @@ void Rdb_drop_index_thread::run() { "from cf id %u. 
MyRocks data dictionary may " "get corrupted.", d.cf_id); + if (rocksdb_ignore_datadic_errors) + { + sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, " + "trying to continue"); + continue; + } abort(); } rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id); diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 448900c5a91..8d0ace707f8 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -1062,6 +1062,8 @@ const int MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL= MariaDB_PLUGIN_MATURITY_STABLE; extern bool prevent_myrocks_loading; +extern uint32_t rocksdb_ignore_datadic_errors; + void sql_print_verbose_info(const char *format, ...); } // namespace myrocks diff --git a/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result b/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result index f2f9adebf46..865fbf33b8d 100644 --- a/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result +++ b/storage/rocksdb/mysql-test/rocksdb/r/rocksdb.result @@ -932,6 +932,7 @@ rocksdb_force_flush_memtable_now OFF rocksdb_force_index_records_in_range 0 rocksdb_git_hash # rocksdb_hash_index_allow_collision ON +rocksdb_ignore_datadic_errors 0 rocksdb_ignore_unknown_options ON rocksdb_index_type kBinarySearch rocksdb_info_log_level error_level diff --git a/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_ignore_datadic_errors.result b/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_ignore_datadic_errors.result new file mode 100644 index 00000000000..daa70a80683 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb_sys_vars/r/rocksdb_ignore_datadic_errors.result @@ -0,0 +1,7 @@ +SET @start_global_value = @@global.ROCKSDB_IGNORE_DATADIC_ERRORS; +SELECT @start_global_value; +@start_global_value +0 +"Trying to set variable @@global.ROCKSDB_IGNORE_DATADIC_ERRORS to 444. It should fail because it is readonly." 
+SET @@global.ROCKSDB_IGNORE_DATADIC_ERRORS = 444; +ERROR HY000: Variable 'rocksdb_ignore_datadic_errors' is a read only variable diff --git a/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_ignore_datadic_errors.test b/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_ignore_datadic_errors.test new file mode 100644 index 00000000000..b412a018869 --- /dev/null +++ b/storage/rocksdb/mysql-test/rocksdb_sys_vars/t/rocksdb_ignore_datadic_errors.test @@ -0,0 +1,6 @@ +--source include/have_rocksdb.inc + +--let $sys_var=ROCKSDB_IGNORE_DATADIC_ERRORS +--let $read_only=1 +--let $session=0 +--source include/rocksdb_sys_var.inc diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc index 3d07a3d2516..c624bd15d20 100644 --- a/storage/rocksdb/rdb_datadic.cc +++ b/storage/rocksdb/rdb_datadic.cc @@ -5263,6 +5263,12 @@ void Rdb_dict_manager::log_start_drop_index(GL_INDEX_ID gl_index_id, "from index id (%u,%u). MyRocks data dictionary may " "get corrupted.", gl_index_id.cf_id, gl_index_id.index_id); + if (rocksdb_ignore_datadic_errors) + { + sql_print_error("RocksDB: rocksdb_ignore_datadic_errors=1, " + "trying to continue"); + return; + } abort(); } }

1 0

[Commits] caa727fd713: Avoid a crash on MyRocks data inconsistency.
by psergey 08 Dec '21

08 Dec '21

revision-id: caa727fd7134e23bf2244e734434014e70c95e27 (mariadb-10.4.13-180-gcaa727fd713) parent(s): 98e9f790e6e216e2298d6e7a04117b52e4a0cbd3 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-08 22:08:18 +0300 message: Avoid a crash on MyRocks data inconsistency. In ha_rocksdb::open(), check if the number of indexes seen from the SQL layer matches the number of indexes in the internal MyRocks data dictionary. Produce an error if there is a mismatch. (If we don't produce this error, we are likely to crash as soon as we attempt to use an index) --- storage/rocksdb/ha_rocksdb.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 1f4b53029b8..938be7030bb 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -6708,6 +6708,17 @@ int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) { "dictionary"); DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE); } + if (m_tbl_def->m_key_count != table->s->keys + has_hidden_pk(table)? 1:0) + { + sql_print_error("MyRocks: DDL mismatch: .frm file has %u indexes, " + "MyRocks has %u (%s hidden pk)", + table->s->keys, m_tbl_def->m_key_count, + has_hidden_pk(table)? "1" : "no"); + my_error(ER_INTERNAL_ERROR, MYF(0), + "MyRocks: DDL mismatch. Check the error log for details"); + DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE); + } + m_lock_rows = RDB_LOCK_NONE; m_key_descr_arr = m_tbl_def->m_key_descr_arr;

1 0

[Commits] 98cb4351e14: Fix compilation on Windows
by psergey 03 Dec '21

03 Dec '21

revision-id: 98cb4351e141dcf65ea5a30f9f8c4bd352e374ea (mariadb-10.6.1-327-g98cb4351e14) parent(s): 103949d566d0652b781a17ce414ed07e7210ef07 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-03 20:26:37 +0300 message: Fix compilation on Windows --- sql/opt_histogram_json.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index dfac0dd3f2f..5d92c9535a4 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -182,7 +182,7 @@ class Histogram_json_builder : public Histogram_builder curtime.tm_hour, curtime.tm_min, curtime.tm_sec, - curtime.tm_zone); + system_time_zone); writer.add_member("target_histogram_size").add_ull(hist_width); writer.add_member("collected_at").add_str(buf);

1 0