[Commits] 62d269e: MDEV-25635 Assertion failure when pushing from HAVING into WHERE of view
by IgorBabaev 01 Jun '21
by IgorBabaev 01 Jun '21
01 Jun '21
revision-id: 62d269ea914de5b9225e0584cca52444e09c1c56 (mariadb-10.2.31-986-g62d269e)
parent(s): d06205ba3713da6c5875f124d5e431d3704aad1d
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-05-31 22:37:15 -0700
message:
MDEV-25635 Assertion failure when pushing from HAVING into WHERE of view
This bug could manifest itself after pushing a where condition over a
mergeable derived table / view / CTE DT into a grouping view / derived table / CTE V
whose item list contained set functions with constant arguments such as
MIN(2), SUM(1) etc. In such cases the field references used in the
condition pushed into the view V that correspond to set functions are wrapped
into Item_direct_view_ref wrappers. Due to a wrong implementation of the
virtual method const_item() for the class Item_direct_view_ref the wrapped
set functions with constant arguments could be erroneously taken for
constant items. This could lead to a wrong result set returned by the main
select query in 10.2. In 10.4 where a possibility of pushing condition from
HAVING into WHERE had been added this could cause a crash.
---
mysql-test/r/derived_cond_pushdown.result | 39 +++++++++++++++++++++++++++++++
mysql-test/t/derived_cond_pushdown.test | 25 ++++++++++++++++++++
sql/item.h | 5 +++-
3 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/derived_cond_pushdown.result b/mysql-test/r/derived_cond_pushdown.result
index 25237aa..28532ae 100644
--- a/mysql-test/r/derived_cond_pushdown.result
+++ b/mysql-test/r/derived_cond_pushdown.result
@@ -10634,4 +10634,43 @@ m
7
drop view v1;
drop table t1;
+#
+# MDEV-25635: pushdown into grouping view using aggregate functions
+# with constant arguments via a mergeable derived table
+#
+create table t1 (a int);
+insert into t1 values (3), (7), (1), (3), (7), (7), (3);
+create view v1 as select a, sum(1) as f, sum(1) as g from t1 group by a;
+select * from v1;
+a f g
+1 1 1
+3 3 3
+7 3 3
+select * from (select * from v1) as dt where a=f and a=g;
+a f g
+1 1 1
+3 3 3
+explain extended select * from (select * from v1) as dt where a=f and a=g;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY <derived3> ALL NULL NULL NULL NULL 7 100.00 Using where
+3 DERIVED t1 ALL NULL NULL NULL NULL 7 100.00 Using temporary; Using filesort
+Warnings:
+Note 1003 select `v1`.`a` AS `a`,`v1`.`f` AS `f`,`v1`.`g` AS `g` from `test`.`v1` where `v1`.`a` = `v1`.`f` and `v1`.`a` = `v1`.`g`
+create view v2 as select a, min(1) as f, min(1) as g from t1 group by a;
+select * from v2;
+a f g
+1 1 1
+3 1 1
+7 1 1
+select * from (select * from v2) as dt where a=f and a=g;
+a f g
+1 1 1
+explain extended select * from (select * from v2) as dt where a=f and a=g;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY <derived3> ALL NULL NULL NULL NULL 7 100.00 Using where
+3 DERIVED t1 ALL NULL NULL NULL NULL 7 100.00 Using temporary; Using filesort
+Warnings:
+Note 1003 select `v2`.`a` AS `a`,`v2`.`f` AS `f`,`v2`.`g` AS `g` from `test`.`v2` where `v2`.`f` = `v2`.`a` and `v2`.`g` = `v2`.`a`
+drop view v1,v2;
+drop table t1;
# End of 10.2 tests
diff --git a/mysql-test/t/derived_cond_pushdown.test b/mysql-test/t/derived_cond_pushdown.test
index 31b4904..58f38ac 100644
--- a/mysql-test/t/derived_cond_pushdown.test
+++ b/mysql-test/t/derived_cond_pushdown.test
@@ -2212,4 +2212,29 @@ select * from v1 where m > 0;
drop view v1;
drop table t1;
+--echo #
+--echo # MDEV-25635: pushdown into grouping view using aggregate functions
+--echo # with constant arguments via a mergeable derived table
+--echo #
+
+create table t1 (a int);
+insert into t1 values (3), (7), (1), (3), (7), (7), (3);
+
+create view v1 as select a, sum(1) as f, sum(1) as g from t1 group by a;
+select * from v1;
+let $q1=
+select * from (select * from v1) as dt where a=f and a=g;
+eval $q1;
+eval explain extended $q1;
+
+create view v2 as select a, min(1) as f, min(1) as g from t1 group by a;
+select * from v2;
+let $q2=
+select * from (select * from v2) as dt where a=f and a=g;
+eval $q2;
+eval explain extended $q2;
+
+drop view v1,v2;
+drop table t1;
+
--echo # End of 10.2 tests
diff --git a/sql/item.h b/sql/item.h
index c94709c..76be66d 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -4952,7 +4952,10 @@ class Item_direct_view_ref :public Item_direct_ref
table_map used_tables() const;
void update_used_tables();
table_map not_null_tables() const;
- bool const_item() const { return used_tables() == 0; }
+ bool const_item() const
+ {
+ return (*ref)->const_item() && (null_ref_table == NO_NULL_TABLE);
+ }
TABLE *get_null_ref_table() const { return null_ref_table; }
bool walk(Item_processor processor, bool walk_subquery, void *arg)
{
1
0
[Commits] 5ff1765: MDEV-25714 Join using derived with aggregation returns incorrect results
by IgorBabaev 27 May '21
by IgorBabaev 27 May '21
27 May '21
revision-id: 5ff176547f0ef9e8aa76604319919912a2a8c23b (mariadb-10.3.26-168-g5ff1765)
parent(s): 1e5ebf3762abdb8108620b46e76d4ebdde8472f7
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-05-26 23:41:59 -0700
message:
MDEV-25714 Join using derived with aggregation returns incorrect results
If a join query uses a derived table (view / CTE) with GROUP BY clause then
the execution plan for such join may employ split optimization. When this
optimization is employed the derived table is not materialized. Rather only
some partitions of the derived table are subject to grouping. Split
optimization can be applied only if:
- there are some indexes over the tables used in the join specifying the
derived table whose prefixes partially cover the field items used in the
GROUP BY list (such indexes are called splitting indexes)
- the WHERE condition of the join query contains conjunctive equalities
between columns of the derived table that comprise major parts of
splitting indexes and columns of the other join tables.
When the optimizer evaluates extending of a partial join by the rows of the
derived table it always considers a possibility of using split optimization.
Different splitting indexes can be used depending on the extended partial
join. At some rare conditions, for example, when there is a non-splitting
covering index for a table joined in the join specifying the derived table
usage of a splitting index to produce rows needed for grouping may be still
less beneficial than usage of such covering index without any splitting
technique. The function JOIN_TAB::choose_best_splitting() must take this
into account.
Approved by Oleksandr Byelkin <sanja(a)mariadb.com>
---
mysql-test/main/derived_cond_pushdown.result | 2 +-
mysql-test/main/derived_split_innodb.result | 61 ++++++++++++++++++++++++++++
mysql-test/main/derived_split_innodb.test | 37 +++++++++++++++++
sql/opt_split.cc | 27 +++++++++---
4 files changed, 121 insertions(+), 6 deletions(-)
diff --git a/mysql-test/main/derived_cond_pushdown.result b/mysql-test/main/derived_cond_pushdown.result
index f3d63b5..5fc0111 100644
--- a/mysql-test/main/derived_cond_pushdown.result
+++ b/mysql-test/main/derived_cond_pushdown.result
@@ -16712,7 +16712,7 @@ EXPLAIN EXTENDED
SELECT * FROM v1 JOIN v2 ON v1.f = v2.f;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-3 LATERAL DERIVED NULL NULL NULL NULL NULL NULL NULL NULL no matching row in const table
+3 DERIVED NULL NULL NULL NULL NULL NULL NULL NULL no matching row in const table
Warnings:
Note 1003 /* select#1 */ select NULL AS `f`,`v2`.`f` AS `f` from `test`.`t1` `a` straight_join `test`.`t1` `b` join `test`.`v2` where 0
DROP VIEW v1,v2;
diff --git a/mysql-test/main/derived_split_innodb.result b/mysql-test/main/derived_split_innodb.result
index 0b57e72..7ea3b68 100644
--- a/mysql-test/main/derived_split_innodb.result
+++ b/mysql-test/main/derived_split_innodb.result
@@ -174,3 +174,64 @@ id select_type table type possible_keys key key_len ref rows Extra
2 LATERAL DERIVED t1 ref a,a_2 a 5 test.t1.a 1 Using where; Using temporary; Using filesort
2 LATERAL DERIVED t2 ref c c 5 test.t1.b 1 Using index
DROP TABLE t1, t2;
+#
+# Bug mdev-25714: usage non-splitting covering index is cheaper than
+# usage of the best splitting index for one group
+#
+create table t1 (
+id int not null, itemid int not null, index idx (itemid)
+) engine=innodb;
+insert into t1 values (1, 2), (2,2), (4,2), (4,2), (0,3), (3,3);
+create table t2 (id int not null) engine=innodb;
+insert into t2 values (2);
+create table t3 (
+id int not null, itemid int not null, userid int not null, primary key (id),
+index idx1 (userid, itemid), index idx2 (itemid)
+) engine innodb;
+insert into t3 values (1,1,1), (2,1,1), (3,2,1), (4,2,1), (5,3,1);
+analyze table t1,t2,t3;
+Table Op Msg_type Msg_text
+test.t1 analyze status OK
+test.t2 analyze status OK
+test.t3 analyze status OK
+set optimizer_switch='split_materialized=on';
+explain select t1.id, t1.itemid, dt.id, t2.id
+from t1,
+(select itemid, max(id) as id from t3 where userid = 1 group by itemid) dt,
+t2
+where t1.id = dt.id and t1.itemid = dt.itemid and t2.id=t1.itemid;
+id select_type table type possible_keys key key_len ref rows Extra
+1 PRIMARY t2 ALL NULL NULL NULL NULL 1
+1 PRIMARY t1 ref idx idx 4 test.t2.id 1
+1 PRIMARY <derived2> ref key0 key0 9 test.t2.id,test.t1.id 2
+2 DERIVED t3 ref idx1,idx2 idx1 4 const 5 Using where; Using index
+select t1.id, t1.itemid, dt.id, t2.id
+from t1,
+(select itemid, max(id) as id from t3 where userid = 1 group by itemid) dt,
+t2
+where t1.id = dt.id and t1.itemid = dt.itemid and t2.id=t1.itemid;
+id itemid id id
+4 2 4 2
+4 2 4 2
+set optimizer_switch='split_materialized=off';
+explain select t1.id, t1.itemid, dt.id, t2.id
+from t1,
+(select itemid, max(id) as id from t3 where userid = 1 group by itemid) dt,
+t2
+where t1.id = dt.id and t1.itemid = dt.itemid and t2.id=t1.itemid;
+id select_type table type possible_keys key key_len ref rows Extra
+1 PRIMARY t2 ALL NULL NULL NULL NULL 1
+1 PRIMARY t1 ref idx idx 4 test.t2.id 1
+1 PRIMARY <derived2> ref key0 key0 9 test.t2.id,test.t1.id 2
+2 DERIVED t3 ref idx1 idx1 4 const 5 Using where; Using index
+select t1.id, t1.itemid, dt.id, t2.id
+from t1,
+(select itemid, max(id) as id from t3 where userid = 1 group by itemid) dt,
+t2
+where t1.id = dt.id and t1.itemid = dt.itemid and t2.id=t1.itemid;
+id itemid id id
+4 2 4 2
+4 2 4 2
+drop table t1,t2,t3;
+set optimizer_switch='split_materialized=default';
+# End of 10.3 tests
diff --git a/mysql-test/main/derived_split_innodb.test b/mysql-test/main/derived_split_innodb.test
index 19a6ecf..6f33c71 100644
--- a/mysql-test/main/derived_split_innodb.test
+++ b/mysql-test/main/derived_split_innodb.test
@@ -150,3 +150,40 @@ eval set statement optimizer_switch='split_materialized=on' for $query;
DROP TABLE t1, t2;
+--echo #
+--echo # Bug mdev-25714: usage non-splitting covering index is cheaper than
+--echo # usage of the best splitting index for one group
+--echo #
+
+create table t1 (
+ id int not null, itemid int not null, index idx (itemid)
+) engine=innodb;
+insert into t1 values (1, 2), (2,2), (4,2), (4,2), (0,3), (3,3);
+create table t2 (id int not null) engine=innodb;
+insert into t2 values (2);
+create table t3 (
+ id int not null, itemid int not null, userid int not null, primary key (id),
+ index idx1 (userid, itemid), index idx2 (itemid)
+) engine innodb;
+insert into t3 values (1,1,1), (2,1,1), (3,2,1), (4,2,1), (5,3,1);
+analyze table t1,t2,t3;
+
+let $q=
+select t1.id, t1.itemid, dt.id, t2.id
+ from t1,
+ (select itemid, max(id) as id from t3 where userid = 1 group by itemid) dt,
+ t2
+ where t1.id = dt.id and t1.itemid = dt.itemid and t2.id=t1.itemid;
+
+set optimizer_switch='split_materialized=on';
+eval explain $q;
+eval $q;
+
+set optimizer_switch='split_materialized=off';
+eval explain $q;
+eval $q;
+
+drop table t1,t2,t3;
+set optimizer_switch='split_materialized=default';
+
+--echo # End of 10.3 tests
diff --git a/sql/opt_split.cc b/sql/opt_split.cc
index c3a2d03..edf9ae3 100644
--- a/sql/opt_split.cc
+++ b/sql/opt_split.cc
@@ -960,11 +960,7 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(double record_count,
in the cache
*/
spl_plan= spl_opt_info->find_plan(best_table, best_key, best_key_parts);
- if (!spl_plan &&
- (spl_plan= (SplM_plan_info *) thd->alloc(sizeof(SplM_plan_info))) &&
- (spl_plan->best_positions=
- (POSITION *) thd->alloc(sizeof(POSITION) * join->table_count)) &&
- !spl_opt_info->plan_cache.push_back(spl_plan))
+ if (!spl_plan)
{
/*
The plan for the chosen key has not been found in the cache.
@@ -974,6 +970,27 @@ SplM_plan_info * JOIN_TAB::choose_best_splitting(double record_count,
reset_validity_vars_for_keyuses(best_key_keyuse_ext_start, best_table,
best_key, remaining_tables, true);
choose_plan(join, all_table_map & ~join->const_table_map);
+
+ /*
+ Check that the chosen plan is really a splitting plan.
+ If not or if there is not enough memory to save the plan in the cache
+ then just return with no splitting plan.
+ */
+ POSITION *first_non_const_pos= join->best_positions + join->const_tables;
+ TABLE *table= first_non_const_pos->table->table;
+ key_map spl_keys= table->keys_usable_for_splitting;
+ if (!(first_non_const_pos->key &&
+ spl_keys.is_set(first_non_const_pos->key->key)) ||
+ !(spl_plan= (SplM_plan_info *) thd->alloc(sizeof(SplM_plan_info))) ||
+ !(spl_plan->best_positions=
+ (POSITION *) thd->alloc(sizeof(POSITION) * join->table_count)) ||
+ spl_opt_info->plan_cache.push_back(spl_plan))
+ {
+ reset_validity_vars_for_keyuses(best_key_keyuse_ext_start, best_table,
+ best_key, remaining_tables, false);
+ return 0;
+ }
+
spl_plan->keyuse_ext_start= best_key_keyuse_ext_start;
spl_plan->table= best_table;
spl_plan->key= best_key;
1
0
[Commits] 46a17ea: MDEV-23886 Reusing CTE inside a function fails with table doesn't exist
by IgorBabaev 26 May '21
by IgorBabaev 26 May '21
26 May '21
revision-id: 46a17ea4875966159d3f8369488b813b1ac4c4b6 (mariadb-10.5.4-637-g46a17ea)
parent(s): c80cecb5e3e509d37929b4f446edf9b6c636b98f
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-05-25 17:13:17 -0700
message:
MDEV-23886 Reusing CTE inside a function fails with table doesn't exist
In the code existed just before this patch binding of a table reference to
the specification of the corresponding CTE happens in the function
open_and_process_table(). If the table reference is not the first in the
query the specification is cloned in the same way as the specification of
a view is cloned for any reference of the view. This works fine for
standalone queries, but does not work for stored procedures / functions
for the following reason.
When the first call of a stored procedure/ function SP is processed the
body of SP is parsed. When a query of SP is parsed the info on each
encountered table reference is put into a TABLE_LIST object linked into
a global chain associated with the query. When parsing of the query is
finished the basic info on the table references from this chain except
table references to derived tables and information schema tables is put
in one hash table associated with SP. When parsing of the body of SP is
finished this hash table is used to construct TABLE_LIST objects for all
table references mentioned in SP and link them into the list of such
objects passed to a pre-locking process that calls open_and_process_table()
for each table from the list.
When a TABLE_LIST for a view is encountered the view is opened and its
specification is parsed. For any table reference occurring in
the specification a new TABLE_LIST object is created to be included into
the list for pre-locking. After all objects in the pre-locking have been
looked through the tables mentioned in the list are locked. Note that the
objects referenced CTEs are just skipped here as it is impossible to
resolve these references without any info on the context where they occur.
Now the statements from the body of SP are executed one by one that.
At the very beginning of the execution of a query the tables used in the
query are opened and open_and_process_table() now is called for each table
reference mentioned in the list of TABLE_LIST objects associated with the
query that was built when the query was parsed.
For each table reference first the reference is checked against CTEs
definitions in whose scope it occurred. If such definition is found the
reference is considered resolved and if this is not the first reference
to the found CTE, then the specification of the CTE is re-parsed and the
result of the parsing is added to the parsing tree of the query as a
sub-tree. If this sub-tree contains table references to other tables they
are added to the list of TABLE_LIST objects associated with the query in
order for the referenced tables to be opened. When the procedure that opens
the tables comes to the TABLE_LIST object created for a non-first
reference to a CTE it discovers that the referenced table instance is not
locked and reports an error.
Thus processing non-first table references to a CTE similar to how
references to views are processed does not work for queries used in stored
procedures / functions. And the main problem is that the current
pre-locking mechanism employed for stored procedures / functions does not
allow saving the context in which a CTE reference occurs. It's not trivial
to save the info about the context where a CTE reference occurs while the
resolution of the table reference cannot be done without this context and
consequently the specification for the table reference cannot be
determined.
This patch solves the above problem by moving resolution of all CTE
references to the parsing stage. More exactly, references to CTEs occurring in
a query are resolved right after parsing of the query has finished. After
resolution of any CTE reference it is marked as a reference to a derived
table. So it is excluded from the hash table created for pre-locking used
base tables and view when the first call of a stored procedure / function
is processed.
This solution required recursive calls of the parser. The function
THD::sql_parser() has been added specifically for recursive invocations of
the parser.
# Conflicts:
# sql/sql_cte.cc
# sql/sql_cte.h
# sql/sql_lex.cc
# sql/sql_lex.h
# sql/sql_view.cc
# sql/sql_yacc.yy
# sql/sql_yacc_ora.yy
---
mysql-test/main/cte_nonrecursive.result | 201 +++++++++++++++
mysql-test/main/cte_nonrecursive.test | 202 +++++++++++++++
sql/item_subselect.cc | 1 -
sql/sp_head.cc | 3 +-
sql/sql_base.cc | 33 +--
sql/sql_class.cc | 56 +++++
sql/sql_class.h | 8 +-
sql/sql_cte.cc | 422 ++++++++++++++++++++++----------
sql/sql_cte.h | 85 ++++++-
sql/sql_lex.cc | 9 +-
sql/sql_lex.h | 26 +-
sql/sql_parse.cc | 15 +-
sql/sql_prepare.cc | 3 -
sql/sql_view.cc | 9 -
sql/sql_yacc.yy | 26 +-
sql/table.h | 37 +++
16 files changed, 929 insertions(+), 207 deletions(-)
diff --git a/mysql-test/main/cte_nonrecursive.result b/mysql-test/main/cte_nonrecursive.result
index 54283f1..763e51f 100644
--- a/mysql-test/main/cte_nonrecursive.result
+++ b/mysql-test/main/cte_nonrecursive.result
@@ -1763,6 +1763,207 @@ a c
2 1
7 3
drop table t1;
+#
+# MDEV-23886: Stored Function returning the result of a query
+# that uses CTE over a table twice
+#
+create table t1 (c1 int);
+insert into t1 values (1),(2),(6);
+create function f1() returns int return
+( with cte1 as (select c1 from t1)
+select sum(c1) from
+(select * from cte1 union all select * from cte1) dt
+);
+select f1();
+f1()
+18
+create function f2() returns int return
+( with cte1 as (select c1 from t1)
+select sum(s.c1) from cte1 as s, cte1 as t where s.c1=t.c1
+);
+select f2();
+f2()
+9
+create function f3() returns int return
+( with cte1 as (select c1 from t1)
+select
+case
+when exists(select 1 from cte1 where c1 between 1 and 2) then 1
+when exists(select 1 from cte1 where c1 between 5 and 6) then 2
+else 0
+end
+);
+select f3();
+f3()
+1
+create view v1 as (select c1 from t1);
+create function f4() returns int return
+( select sum(c1) from
+(select * from v1 union all select * from v1) dt
+);
+select f4();
+f4()
+18
+create function f5() returns int return
+( select sum(s.c1) from v1 as s, v1 as t where s.c1=t.c1
+);
+select f5();
+f5()
+9
+create view v2(s) as
+with cte1 as (select c1 from t1)
+select sum(c1) from (select * from cte1 union all select * from cte1) dt;
+create function f6() returns int return
+(select s from v2);
+select f6();
+f6()
+18
+create function f7() returns int return
+( select r.s from v2 as r, v2 as t where r.s=t.s
+);
+select f7();
+f7()
+18
+select f5() + f6();
+f5() + f6()
+27
+prepare stmt from "select f5() + f6();";
+execute stmt;
+f5() + f6()
+27
+execute stmt;
+f5() + f6()
+27
+deallocate prepare stmt;
+drop function f1;
+drop function f2;
+drop function f3;
+drop function f4;
+drop function f5;
+drop function f6;
+drop function f7;
+drop view v1;
+drop view v2;
+create table t2 (a int, b int);
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 > 5;
+select * from t2;
+a b
+6 6
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+call p1();
+select * from t2;
+a b
+6 6
+2 2
+drop procedure p1;
+# checking CTE resolution for queries with hanging CTEs
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from cte3;
+a b
+1 2
+select * from t2;
+a b
+6 6
+2 2
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+a b
+6 6
+2 2
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where c1 >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+ERROR 42S22: Unknown column 'c1' in 'where clause'
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.c1)
+select * from t2;
+ERROR 42S22: Unknown column 'cte2.c1' in 'where clause'
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from cte2;
+a b
+1 1
+2 2
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from t2;
+a b
+6 6
+2 2
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=c1)
+select * from t2;
+ERROR 23000: Column 'c1' in where clause is ambiguous
+with cte3 as
+( with cte2(a,b) as
+( with cte1 as (select * from t1 where c1 <= 2)
+select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from cte3;
+a b
+1 1
+2 1
+1 2
+2 2
+with cte3 as
+( with cte2(a,b) as
+( with cte1 as (select * from t1 where c1 <= 2)
+select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from t2;
+a b
+6 6
+2 2
+with cte3 as
+( with cte2(a,b) as
+( with cte1 as (select * from t1 where c1 <= 2)
+select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select r1.c1,r2.c1 from cte2 as r1, cte2 as r2)
+select * from t2;
+ERROR 42S22: Unknown column 'r1.c1' in 'field list'
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+call p1();
+select * from t2;
+a b
+6 6
+2 2
+2 2
+drop procedure p1;
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select a from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+call p1();
+ERROR 42S22: Unknown column 'a' in 'field list'
+drop procedure p1;
+drop table t1,t2;
# End of 10.2 tests
#
# MDEV-21673: several references to CTE that uses
diff --git a/mysql-test/main/cte_nonrecursive.test b/mysql-test/main/cte_nonrecursive.test
index a4a1a17..59dae44 100644
--- a/mysql-test/main/cte_nonrecursive.test
+++ b/mysql-test/main/cte_nonrecursive.test
@@ -1273,6 +1273,208 @@ select a, c from cte as r2 where a > 4;
drop table t1;
+--echo #
+--echo # MDEV-23886: Stored Function returning the result of a query
+--echo # that uses CTE over a table twice
+--echo #
+
+create table t1 (c1 int);
+insert into t1 values (1),(2),(6);
+
+create function f1() returns int return
+( with cte1 as (select c1 from t1)
+ select sum(c1) from
+ (select * from cte1 union all select * from cte1) dt
+);
+select f1();
+
+create function f2() returns int return
+( with cte1 as (select c1 from t1)
+ select sum(s.c1) from cte1 as s, cte1 as t where s.c1=t.c1
+);
+select f2();
+
+create function f3() returns int return
+( with cte1 as (select c1 from t1)
+ select
+ case
+ when exists(select 1 from cte1 where c1 between 1 and 2) then 1
+ when exists(select 1 from cte1 where c1 between 5 and 6) then 2
+ else 0
+ end
+);
+select f3();
+
+create view v1 as (select c1 from t1);
+
+create function f4() returns int return
+( select sum(c1) from
+ (select * from v1 union all select * from v1) dt
+);
+select f4();
+
+create function f5() returns int return
+( select sum(s.c1) from v1 as s, v1 as t where s.c1=t.c1
+);
+select f5();
+
+create view v2(s) as
+with cte1 as (select c1 from t1)
+select sum(c1) from (select * from cte1 union all select * from cte1) dt;
+
+create function f6() returns int return
+(select s from v2);
+select f6();
+
+create function f7() returns int return
+( select r.s from v2 as r, v2 as t where r.s=t.s
+);
+select f7();
+
+select f5() + f6();
+
+prepare stmt from "select f5() + f6();";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+drop function f1;
+drop function f2;
+drop function f3;
+drop function f4;
+drop function f5;
+drop function f6;
+drop function f7;
+
+drop view v1;
+drop view v2;
+
+create table t2 (a int, b int);
+
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 > 5;
+
+select * from t2;
+
+delimiter |;
+
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+
+delimiter ;|
+
+call p1();
+select * from t2;
+
+drop procedure p1;
+
+--echo # checking CTE resolution for queries with hanging CTEs
+
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from cte3;
+
+select * from t2;
+
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+
+--error ER_BAD_FIELD_ERROR
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where c1 >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+
+--error ER_BAD_FIELD_ERROR
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.c1)
+select * from t2;
+
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from cte2;
+
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from t2;
+
+--error ER_NON_UNIQ_ERROR
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=c1)
+select * from t2;
+
+with cte3 as
+( with cte2(a,b) as
+ ( with cte1 as (select * from t1 where c1 <= 2)
+ select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+ select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from cte3;
+
+with cte3 as
+( with cte2(a,b) as
+ ( with cte1 as (select * from t1 where c1 <= 2)
+ select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+ select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from t2;
+
+--error ER_BAD_FIELD_ERROR
+with cte3 as
+( with cte2(a,b) as
+ ( with cte1 as (select * from t1 where c1 <= 2)
+ select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+ select r1.c1,r2.c1 from cte2 as r1, cte2 as r2)
+select * from t2;
+
+delimiter |;
+
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+
+delimiter ;|
+
+call p1();
+select * from t2;
+
+drop procedure p1;
+
+delimiter |;
+
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select a from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+
+delimiter ;|
+
+--error ER_BAD_FIELD_ERROR
+call p1();
+
+drop procedure p1;
+
+drop table t1,t2;
+
--echo # End of 10.2 tests
--echo #
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 7fc7f41..ffebf7d 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -1846,7 +1846,6 @@ double Item_in_subselect::val_real()
As far as Item_in_subselect called only from Item_in_optimizer this
method should not be used
*/
- DBUG_ASSERT(0);
DBUG_ASSERT(fixed == 1);
if (forced_const)
return value;
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index 513e720..b8e8def 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -3487,8 +3487,7 @@ sp_lex_keeper::reset_lex_and_exec_core(THD *thd, uint *nextp,
Json_writer_object trace_command(thd);
Json_writer_array trace_command_steps(thd, "steps");
if (open_tables)
- res= check_dependencies_in_with_clauses(m_lex->with_clauses_list) ||
- instr->exec_open_and_lock_tables(thd, m_lex->query_tables);
+ res= instr->exec_open_and_lock_tables(thd, m_lex->query_tables);
if (likely(!res))
{
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index e926ebc..f9f9856 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -3589,7 +3589,11 @@ open_and_process_table(THD *thd, TABLE_LIST *tables, uint *counter, uint flags,
if (tables->derived)
{
if (!tables->view)
+ {
+ if (!tables->is_derived())
+ tables->set_derived();
goto end;
+ }
/*
We restore view's name and database wiped out by derived tables
processing and fall back to standard open process in order to
@@ -3599,35 +3603,6 @@ open_and_process_table(THD *thd, TABLE_LIST *tables, uint *counter, uint flags,
tables->db= tables->view_db;
tables->table_name= tables->view_name;
}
- else if (tables->select_lex)
- {
- /*
- Check whether 'tables' refers to a table defined in a with clause.
- If so set the reference to the definition in tables->with.
- */
- if (!tables->with)
- tables->with= tables->select_lex->find_table_def_in_with_clauses(tables);
- /*
- If 'tables' is defined in a with clause set the pointer to the
- specification from its definition in tables->derived.
- */
- if (tables->with)
- {
- if (tables->is_recursive_with_table() &&
- !tables->is_with_table_recursive_reference())
- {
- tables->with->rec_outer_references++;
- With_element *with_elem= tables->with;
- while ((with_elem= with_elem->get_next_mutually_recursive()) !=
- tables->with)
- with_elem->rec_outer_references++;
- }
- if (tables->set_as_with_table(thd, tables->with))
- DBUG_RETURN(1);
- else
- goto end;
- }
- }
if (!tables->derived && is_infoschema_db(&tables->db))
{
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index ac42b50..8b61750 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -2810,6 +2810,62 @@ void THD::close_active_vio()
#endif
+/*
+ @brief MySQL parser used for recursive invocations
+
+ @param old_lex The LEX structure in the state when this parser
+ is called recursively
+ @param lex The LEX structure used to parse a new SQL fragment
+ @param str The SQL fragment to parse
+ @param str_len The length of the SQL fragment to parse
+ @param stmt_prepare_mode true <=> when parsing a prepare statement
+
+ @details
+ This function is to be used when parsing of an SQL fragment is
+ needed within one of the grammar rules.
+
+ @notes
+ Currently the function is used only when the specification of a CTE
+ is parsed for non-first and non-recursive references of the CTE.
+
+ @retval false On a successful parsing of the fragment
+ @retval true Otherwise
+*/
+
+bool THD::sql_parser(LEX *old_lex, LEX *lex,
+ char *str, uint str_len, bool stmt_prepare_mode)
+{
+ extern int MYSQLparse(THD * thd);
+ extern int ORAparse(THD * thd);
+
+ bool parse_status= false;
+ Parser_state parser_state;
+ Parser_state *old_parser_state= m_parser_state;
+
+ if (parser_state.init(this, str, str_len))
+ return true;
+
+ m_parser_state= &parser_state;
+ parser_state.m_lip.stmt_prepare_mode= stmt_prepare_mode;
+ parser_state.m_lip.multi_statements= false;
+ parser_state.m_lip.m_digest= NULL;
+
+ lex->param_list= old_lex->param_list;
+ lex->sphead= old_lex->sphead;
+ lex->spname= old_lex->spname;
+ lex->spcont= old_lex->spcont;
+ lex->sp_chistics= old_lex->sp_chistics;
+ lex->trg_chistics= old_lex->trg_chistics;
+
+ parse_status= (variables.sql_mode & MODE_ORACLE) ?
+ ORAparse(this) : MYSQLparse(this) != 0;
+
+ m_parser_state= old_parser_state;
+
+ return parse_status;
+}
+
+
struct Item_change_record: public ilink
{
Item **place;
diff --git a/sql/sql_class.h b/sql/sql_class.h
index de05b0d..ac8d26c 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -4401,14 +4401,11 @@ class THD: public THD_count, /* this must be first */
to resolve all CTE names as we don't need this message to be thrown
for any CTE references.
*/
- if (!lex->with_clauses_list)
+ if (!lex->with_cte_resolution)
{
my_message(ER_NO_DB_ERROR, ER(ER_NO_DB_ERROR), MYF(0));
return TRUE;
}
- /* This will allow to throw an error later for non-CTE references */
- to->str= NULL;
- to->length= 0;
return FALSE;
}
@@ -5210,6 +5207,9 @@ class THD: public THD_count, /* this must be first */
Item *sp_prepare_func_item(Item **it_addr, uint cols= 1);
bool sp_eval_expr(Field *result_field, Item **expr_item_ptr);
+ bool sql_parser(LEX *old_lex, LEX *lex,
+ char *str, uint str_len, bool stmt_prepare_mode);
+
};
diff --git a/sql/sql_cte.cc b/sql/sql_cte.cc
index 815a0f4..1dbdfbd 100644
--- a/sql/sql_cte.cc
+++ b/sql/sql_cte.cc
@@ -84,7 +84,7 @@ void st_select_lex_unit::set_with_clause(With_clause *with_cl)
true on failure
*/
-bool check_dependencies_in_with_clauses(With_clause *with_clauses_list)
+bool LEX::check_dependencies_in_with_clauses()
{
for (With_clause *with_clause= with_clauses_list;
with_clause;
@@ -102,6 +102,201 @@ bool check_dependencies_in_with_clauses(With_clause *with_clauses_list)
/**
@brief
+ Resolve references to CTE in specification of hanging CTE
+
+ @details
+ A CTE to which there are no references in the query is called a hanging CTE.
+ Although such CTE is not used for execution its specification must be
+ subject to context analysis. All errors concerning references to
+ non-existing tables or fields occurring in the specification must be
+ reported as well as all other errors caught at the prepare stage.
+ The specification of a hanging CTE might contain references to other
+ CTE outside of the specification and within it if the specification
+ contains a with clause. This function resolves all such references for
+ all hanging CTEs encountered in the processed query.
+
+ @retval
+ false on success
+ true on failure
+*/
+
+bool
+LEX::resolve_references_to_cte_in_hanging_cte()
+{
+ for (With_clause *with_clause= with_clauses_list;
+ with_clause; with_clause= with_clause->next_with_clause)
+ {
+ for (With_element *with_elem= with_clause->with_list.first;
+ with_elem; with_elem= with_elem->next)
+ {
+ if (!with_elem->is_referenced())
+ {
+ TABLE_LIST *first_tbl=
+ with_elem->spec->first_select()->table_list.first;
+ TABLE_LIST **with_elem_end_pos= with_elem->head->tables_pos.end_pos;
+ if (first_tbl && resolve_references_to_cte(first_tbl, with_elem_end_pos))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+/**
+ @brief
+ Resolve table references to CTE from a sub-chain of table references
+
+ @param tables Points to the beginning of the sub-chain
+ @param tables_last Points to the address with the sub-chain barrier
+
+ @details
+ The method resolves tables references to CTE from the chain of
+ table references specified by the parameters 'tables' and 'tables_last'.
+ It resolves the references against the CTE definitions occurring in a query
+ or the specification of a CTE whose parsing tree is represented by
+ this LEX structure. The method is always called right after the process
+ of parsing the query or of the specification of a CTE has been finished,
+ thus the chain of table references used in the parsed fragment has been
+ already built. It is assumed that parameters of the method specify a
+ sub-chain of this chain.
+ If a table reference can be potentially a table reference to a CTE and it
+ has not been resolved yet then the method tries to find the definition
+ of the CTE against which the reference can be resolved. If it succeeds
+ it sets the field TABLE_LIST::with to point to the found definition.
+ It also sets the field TABLE_LIST::derived to point to the specification
+ of the found CTE and sets TABLE::db.str to empty_c_string. This will
+ allow to handle this table reference like a reference to a derived table.
+ If another table reference has been already resolved against this CTE
+ and this CTE is not recursive then a clone of the CTE specification is
+ constructed using the function With_element::clone_parsed_spec() and
+ TABLE_LIST::derived is set to point to this clone rather than to the
+ original specification.
+ If the method does not find a matched CTE definition in the parsed fragment
+ then in the case when the flag this->only_cte_resolution is set to true
+ it just moves to the resolution of the next table reference from the
+ specified sub-chain while in the case when this->only_cte_resolution is set
+ to false the method additionally sets an mdl request for this table
+ reference.
+
+ @notes
+ The flag this->only_cte_resolution is set to true in the cases when
+ the failure to resolve a table reference as a CTE reference within
+ the fragment associated with this LEX structure does not imply that
+ this table reference cannot be resolved as such at all.
+
+ @retval false On success: no errors reported, no memory allocations failed
+ @retval true Otherwise
+*/
+
+bool LEX::resolve_references_to_cte(TABLE_LIST *tables,
+ TABLE_LIST **tables_last)
+{
+ With_element *with_elem= 0;
+
+ for (TABLE_LIST *tbl= tables; tbl != *tables_last; tbl= tbl->next_global)
+ {
+ if (tbl->derived)
+ continue;
+ if (!tbl->db.str && !tbl->with)
+ tbl->with= tbl->select_lex->find_table_def_in_with_clauses(tbl);
+ if (!tbl->with) // no CTE matches table reference tbl
+ {
+ if (only_cte_resolution)
+ continue;
+ if (!tbl->db.str) // no database specified in table reference tbl
+ {
+ if (!thd->db.str) // no default database is set
+ {
+ my_message(ER_NO_DB_ERROR, ER(ER_NO_DB_ERROR), MYF(0));
+ return true;
+ }
+ if (copy_db_to(&tbl->db))
+ return true;
+ if (!(tbl->table_options & TL_OPTION_ALIAS))
+ MDL_REQUEST_INIT(&tbl->mdl_request, MDL_key::TABLE,
+ tbl->db.str, tbl->table_name.str,
+ tbl->mdl_type, MDL_TRANSACTION);
+ tbl->mdl_request.set_type((tbl->lock_type >= TL_WRITE_ALLOW_WRITE) ?
+ MDL_SHARED_WRITE : MDL_SHARED_READ);
+ }
+ continue;
+ }
+ with_elem= tbl->with;
+ if (tbl->is_recursive_with_table() &&
+ !tbl->is_with_table_recursive_reference())
+ {
+ tbl->with->rec_outer_references++;
+ while ((with_elem= with_elem->get_next_mutually_recursive()) !=
+ tbl->with)
+ with_elem->rec_outer_references++;
+ }
+ if (!with_elem->is_used_in_query || with_elem->is_recursive)
+ {
+ tbl->derived= with_elem->spec;
+ if (tbl->derived != tbl->select_lex->master_unit() &&
+ !with_elem->is_recursive &&
+ !tbl->is_with_table_recursive_reference())
+ {
+ tbl->derived->move_as_slave(tbl->select_lex);
+ }
+ with_elem->is_used_in_query= true;
+ }
+ else
+ {
+ if (!(tbl->derived= tbl->with->clone_parsed_spec(thd->lex, tbl)))
+ return true;
+ }
+ tbl->db.str= empty_c_string;
+ tbl->db.length= 0;
+ tbl->schema_table= 0;
+ if (tbl->derived)
+ {
+ tbl->derived->first_select()->set_linkage(DERIVED_TABLE_TYPE);
+ tbl->select_lex->add_statistics(tbl->derived);
+ }
+ if (tbl->with->is_recursive && tbl->is_with_table_recursive_reference())
+ continue;
+ with_elem->inc_references();
+ }
+ return false;
+}
+
+
+/**
+ @brief
+ Find out dependencies between CTEs, resolve references to them
+
+ @details
+ The function can be called in two modes. With this->with_cte_resolution
+ set to false the function only finds out all dependencies between CTEs
+ used in a query expression with a WITH clause whose parsing has been
+ just finished. Based on these dependencies recursive CTEs are detected.
+ If this->with_cte_resolution is set to true the function additionally
+ resolves all references to CTE occurred in this query expression.
+
+ @retval
+ true on failure
+ false on success
+*/
+
+bool
+LEX::check_cte_dependencies_and_resolve_references()
+{
+ if (check_dependencies_in_with_clauses())
+ return true;
+ if (!with_cte_resolution)
+ return false;
+ if (resolve_references_to_cte(query_tables, query_tables_last))
+ return true;
+ if (resolve_references_to_cte_in_hanging_cte())
+ return true;
+ return false;
+}
+
+
+/**
+ @brief
Check dependencies between tables defined in this with clause
@details
@@ -138,10 +333,11 @@ bool With_clause::check_dependencies()
elem != with_elem;
elem= elem->next)
{
- if (lex_string_cmp(system_charset_info, with_elem->query_name,
- elem->query_name) == 0)
+ if (lex_string_cmp(system_charset_info, with_elem->get_name(),
+ elem->get_name()) == 0)
{
- my_error(ER_DUP_QUERY_NAME, MYF(0), with_elem->query_name->str);
+ my_error(ER_DUP_QUERY_NAME, MYF(0),
+ with_elem->get_name_str());
return true;
}
}
@@ -248,13 +444,12 @@ With_element *With_clause::find_table_def(TABLE_LIST *table,
with_elem != barrier;
with_elem= with_elem->next)
{
- if (my_strcasecmp(system_charset_info, with_elem->query_name->str,
- table->table_name.str) == 0 &&
+ if (my_strcasecmp(system_charset_info, with_elem->get_name_str(),
+ table->table_name.str) == 0 &&
!table->is_fqtn)
{
table->set_derived();
- table->db.str= empty_c_string;
- table->db.length= 0;
+ with_elem->referenced= true;
return with_elem;
}
}
@@ -611,7 +806,7 @@ bool With_clause::check_anchors()
if (elem == with_elem)
{
my_error(ER_RECURSIVE_WITHOUT_ANCHORS, MYF(0),
- with_elem->query_name->str);
+ with_elem->get_name_str());
return true;
}
}
@@ -644,7 +839,7 @@ bool With_clause::check_anchors()
if (elem->work_dep_map & elem->get_elem_map())
{
my_error(ER_UNACCEPTABLE_MUTUAL_RECURSION, MYF(0),
- with_elem->query_name->str);
+ with_elem->get_name_str());
return true;
}
}
@@ -798,7 +993,8 @@ bool With_element::set_unparsed_spec(THD *thd,
@brief
Create a clone of the specification for the given with table
- @param thd The context of the statement containing this with element
+ @param old_lex The LEX structure created for the query or CTE specification
+ where this With_element is defined
@param with_table The reference to the table defined in this element for which
the clone is created.
@@ -808,12 +1004,13 @@ bool With_element::set_unparsed_spec(THD *thd,
this element.
The clone is created when the string with the specification saved in
unparsed_spec is fed into the parser as an input string. The parsing
- this string a unit object representing the specification is build.
+ this string a unit object representing the specification is built.
A chain of all table references occurred in the specification is also
formed.
The method includes the new unit and its sub-unit into hierarchy of
the units of the main query. I also insert the constructed chain of the
table references into the chain of all table references of the main query.
+ The method resolves all references to CTE in the clone.
@note
Clones is created only for not first references to tables defined in
@@ -829,116 +1026,128 @@ bool With_element::set_unparsed_spec(THD *thd,
NULL - otherwise
*/
-st_select_lex_unit *With_element::clone_parsed_spec(THD *thd,
+st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
TABLE_LIST *with_table)
{
+ THD *thd= old_lex->thd;
LEX *lex;
- st_select_lex_unit *res= NULL;
- Query_arena backup;
- Query_arena *arena= thd->activate_stmt_arena_if_needed(&backup);
- bool has_tmp_tables;
+ st_select_lex_unit *res= NULL;
if (!(lex= (LEX*) new(thd->mem_root) st_lex_local))
- {
- if (arena)
- thd->restore_active_arena(arena, &backup);
return res;
- }
- LEX *old_lex= thd->lex;
thd->lex= lex;
bool parse_status= false;
- Parser_state parser_state;
- TABLE_LIST *spec_tables;
- TABLE_LIST *spec_tables_tail;
st_select_lex *with_select;
char save_end= unparsed_spec.str[unparsed_spec.length];
((char*) &unparsed_spec.str[unparsed_spec.length])[0]= '\0';
- if (parser_state.init(thd, (char*) unparsed_spec.str, (unsigned int)unparsed_spec.length))
- goto err;
- parser_state.m_lip.stmt_prepare_mode= stmt_prepare_mode;
- parser_state.m_lip.multi_statements= false;
- parser_state.m_lip.m_digest= NULL;
lex_start(thd);
lex->clone_spec_offset= unparsed_spec_offset;
- lex->param_list= old_lex->param_list;
- lex->sphead= old_lex->sphead;
- lex->spname= old_lex->spname;
- lex->spcont= old_lex->spcont;
- lex->sp_chistics= old_lex->sp_chistics;
-
- lex->stmt_lex= old_lex;
- parse_status= parse_sql(thd, &parser_state, 0);
+ lex->with_cte_resolution= true;
+
+ /*
+ The specification of a CTE is to be parsed as a regular query.
+ At the very end of the parsing query the function
+ check_cte_dependencies_and_resolve_references() will be called.
+ It will check the dependencies between CTEs that are defined
+ within the query and will resolve CTE references in this query.
+ If a table reference is not resolved as a CTE reference within
+ this query it still can be resolved as a reference to a CTE defined
+ in the same clause as the CTE whose specification is to be parsed
+ or defined in an embedding CTE definition.
+
+ Example:
+ with
+ cte1 as ( ... ),
+ cte2 as ([WITH ...] select ... from cte1 ...)
+ select ... from cte2 as r, ..., cte2 as s ...
+
+ Here the specification of cte2 has to be cloned for the table reference
+ with alias s. The specification contains a reference to cte1
+ that is defined outside this specification. If the reference to
+ cte1 cannot be resolved within the specification of cte2 it does
+ not necessarily have to be a reference to a non-CTE table. That's
+ why the flag lex->only_cte_resolution has to be set to true
+ before parsing of the specification of cte2 invoked by this
+ function starts. Otherwise an mdl_lock would be requested for s
+ and this would not be correct.
+ */
+
+ lex->only_cte_resolution= true;
+
+ lex->stmt_lex= old_lex->stmt_lex ? old_lex->stmt_lex : old_lex;
+
+ parse_status= thd->sql_parser(old_lex, lex,
+ (char*) unparsed_spec.str,
+ (unsigned int)unparsed_spec.length,
+ stmt_prepare_mode);
+
((char*) &unparsed_spec.str[unparsed_spec.length])[0]= save_end;
- with_select= lex->first_select_lex();
+ with_select= lex->unit.first_select();
if (parse_status)
goto err;
- if (check_dependencies_in_with_clauses(lex->with_clauses_list))
- goto err;
-
- spec_tables= lex->query_tables;
- spec_tables_tail= 0;
- has_tmp_tables= thd->has_temporary_tables();
- for (TABLE_LIST *tbl= spec_tables;
- tbl;
- tbl= tbl->next_global)
- {
- if (has_tmp_tables && !tbl->derived && !tbl->schema_table &&
- thd->open_temporary_table(tbl))
- goto err;
- spec_tables_tail= tbl;
- }
- if (spec_tables)
+ /*
+ The global chain of TABLE_LIST objects created for the specification that
+ just has been parsed is added to such chain that contains the reference
+ to the CTE whose specification is parsed right after the TABLE_LIST object
+ created for the reference.
+ */
+ if (lex->query_tables)
{
- if (with_table->next_global)
+ head->tables_pos.set_start_pos(&with_table->next_global);
+ head->tables_pos.set_end_pos(lex->query_tables_last);
+ TABLE_LIST *next_tbl= with_table->next_global;
+ if (next_tbl)
{
- spec_tables_tail->next_global= with_table->next_global;
- with_table->next_global->prev_global= &spec_tables_tail->next_global;
+ *(lex->query_tables->prev_global= next_tbl->prev_global)=
+ lex->query_tables;
+ *(next_tbl->prev_global= lex->query_tables_last)= next_tbl;
}
else
{
- old_lex->query_tables_last= &spec_tables_tail->next_global;
+ *(lex->query_tables->prev_global= old_lex->query_tables_last)=
+ lex->query_tables;
+ old_lex->query_tables_last= lex->query_tables_last;
}
- spec_tables->prev_global= &with_table->next_global;
- with_table->next_global= spec_tables;
}
res= &lex->unit;
res->with_element= this;
+ /*
+ The unit of the specification that just has been parsed is included
+ as a slave of the select that contained in its from list the table
+ reference for which the unit has been created.
+ */
lex->unit.include_down(with_table->select_lex);
- lex->unit.set_slave(with_select);
+ lex->unit.set_slave(with_select);
+ lex->unit.cloned_from= spec;
old_lex->all_selects_list=
(st_select_lex*) (lex->all_selects_list->
insert_chain_before(
(st_select_lex_node **) &(old_lex->all_selects_list),
with_select));
- if (check_dependencies_in_with_clauses(lex->with_clauses_list))
- res= NULL;
+
/*
- Resolve references to CTE from the spec_tables list that has not
- been resolved yet.
+ Now all references to the CTE defined outside of the cloned specification
+ have to be resolved. Additionally, if old_lex->only_cte_resolution == false,
+ for the table references that have not been resolved, requests for mdl_locks
+ have to be set.
*/
- for (TABLE_LIST *tbl= spec_tables;
- tbl;
- tbl= tbl->next_global)
+ lex->only_cte_resolution= old_lex->only_cte_resolution;
+ if (lex->resolve_references_to_cte(lex->query_tables,
+ lex->query_tables_last))
{
- if (!tbl->with)
- tbl->with= with_select->find_table_def_in_with_clauses(tbl);
- if (tbl == spec_tables_tail)
- break;
- }
- if (check_table_access(thd, SELECT_ACL, spec_tables, FALSE, UINT_MAX, FALSE))
+ res= NULL;
goto err;
+ }
- lex->sphead= NULL; // in order not to delete lex->sphead
+ lex->sphead= NULL; // in order not to delete lex->sphead
lex_end(lex);
err:
- if (arena)
- thd->restore_active_arena(arena, &backup);
thd->lex= old_lex;
return res;
}
@@ -1145,59 +1354,6 @@ With_element *st_select_lex::find_table_def_in_with_clauses(TABLE_LIST *table)
}
-/**
- @brief
- Set the specifying unit in this reference to a with table
-
- @details
- The method assumes that the given element with_elem defines the table T
- this table reference refers to.
- If this is the first reference to T the method just sets its specification
- in the field 'derived' as the unit that yields T. Otherwise the method
- first creates a clone specification and sets rather this clone in this field.
-
- @retval
- false on success
- true on failure
-*/
-
-bool TABLE_LIST::set_as_with_table(THD *thd, With_element *with_elem)
-{
- if (table)
- {
- /*
- This table was prematurely identified as a temporary table.
- We correct it here, but it's not a nice solution in the case
- when the temporary table with this name is not used anywhere
- else in the query.
- */
- thd->mark_tmp_table_as_free_for_reuse(table);
- table= 0;
- }
- with= with_elem;
- schema_table= NULL;
- if (!with_elem->is_referenced() || with_elem->is_recursive)
- {
- derived= with_elem->spec;
- if (derived != select_lex->master_unit() &&
- !with_elem->is_recursive &&
- !is_with_table_recursive_reference())
- {
- derived->move_as_slave(select_lex);
- }
- }
- else
- {
- if(!(derived= with_elem->clone_parsed_spec(thd, this)))
- return true;
- }
- derived->first_select()->set_linkage(DERIVED_TABLE_TYPE);
- select_lex->add_statistics(derived);
- with_elem->inc_references();
- return false;
-}
-
-
bool TABLE_LIST::is_recursive_with_table()
{
return with && with->is_recursive;
@@ -1297,7 +1453,7 @@ bool st_select_lex::check_unrestricted_recursive(bool only_standard_compliant)
if (only_standard_compliant && with_elem->is_unrestricted())
{
my_error(ER_NOT_STANDARD_COMPLIANT_RECURSIVE,
- MYF(0), with_elem->query_name->str);
+ MYF(0), with_elem->get_name_str());
return true;
}
@@ -1514,7 +1670,7 @@ static void list_strlex_print(THD *thd, String *str, List<Lex_ident_sys> *list)
void With_element::print(THD *thd, String *str, enum_query_type query_type)
{
- str->append(query_name);
+ str->append(get_name());
if (column_list.elements)
{
List_iterator_fast<Lex_ident_sys> li(column_list);
diff --git a/sql/sql_cte.h b/sql/sql_cte.h
index 4c42dd2..44628df 100644
--- a/sql/sql_cte.h
+++ b/sql/sql_cte.h
@@ -25,6 +25,39 @@ struct st_unit_ctxt_elem;
/**
+ @class With_element_head
+ @brief Head of the definition of a CTE table
+
+ It contains the name of the CTE and the position of the subchain
+ of table references used in the definition in the global chain of table
+ references used in the query where this definition is encountered.
+*/
+
+class With_element_head : public Sql_alloc
+{
+ /* The name of the defined CTE */
+ LEX_CSTRING *query_name;
+
+public:
+ /*
+ The structure describing the subchain of the table references used in
+ the specification of the defined CTE in the global chain of table
+ references used in the query. The structure is fully defined only
+ after the CTE definition has been parsed.
+ */
+ TABLE_CHAIN tables_pos;
+
+ With_element_head(LEX_CSTRING *name)
+ : query_name(name)
+ {
+ tables_pos.set_start_pos(0);
+ tables_pos.set_end_pos(0);
+ }
+ friend class With_element;
+};
+
+
+/**
@class With_element
@brief Definition of a CTE table
@@ -85,9 +118,22 @@ class With_element : public Sql_alloc
subqueries and specifications of other with elements).
*/
uint references;
+
+ /*
+ true <=> this With_element is referred in the query in which the
+ element is defined
+ */
+ bool referenced;
+
+ /*
+ true <=> this With_element is needed for the execution of the query
+ in which the element is defined
+ */
+ bool is_used_in_query;
+
/*
Unparsed specification of the query that specifies this element.
- It used to build clones of the specification if they are needed.
+ It's used to build clones of the specification if they are needed.
*/
LEX_CSTRING unparsed_spec;
/* Offset of the specification in the input string */
@@ -101,10 +147,12 @@ class With_element : public Sql_alloc
public:
/*
- The name of the table introduced by this with elememt. The name
- can be used in FROM lists of the queries in the scope of the element.
+ Contains the name of the defined With element and the position of
+ the subchain of the tables references used by its definition in the
+ global chain of TABLE_LIST objects created for the whole query.
*/
- LEX_CSTRING *query_name;
+ With_element_head *head;
+
/*
Optional list of column names to name the columns of the table introduced
by this with element. It is used in the case when the names are not
@@ -163,18 +211,27 @@ class With_element : public Sql_alloc
/* List of derived tables containing recursive references to this CTE */
SQL_I_List<TABLE_LIST> derived_with_rec_ref;
- With_element(LEX_CSTRING *name,
+ With_element(With_element_head *h,
List <Lex_ident_sys> list,
st_select_lex_unit *unit)
: next(NULL), base_dep_map(0), derived_dep_map(0),
sq_dep_map(0), work_dep_map(0), mutually_recursive(0),
top_level_dep_map(0), sq_rec_ref(NULL),
next_mutually_recursive(NULL), references(0),
- query_name(name), column_list(list), cycle_list(0), spec(unit),
+ referenced(false), is_used_in_query(false),
+ head(h), column_list(list), cycle_list(0), spec(unit),
is_recursive(false), rec_outer_references(0), with_anchor(false),
level(0), rec_result(NULL)
{ unit->with_element= this; }
+ LEX_CSTRING *get_name() { return head->query_name; }
+ const char *get_name_str() { return get_name()->str; }
+
+ void set_tables_start_pos(TABLE_LIST **pos)
+ { head->tables_pos.set_start_pos(pos); }
+ void set_tables_end_pos(TABLE_LIST **pos)
+ { head->tables_pos.set_end_pos(pos); }
+
bool check_dependencies_in_spec();
void check_dependencies_in_select(st_select_lex *sl, st_unit_ctxt_elem *ctxt,
@@ -201,9 +258,9 @@ class With_element : public Sql_alloc
bool set_unparsed_spec(THD *thd, const char *spec_start, const char *spec_end,
my_ptrdiff_t spec_offset);
- st_select_lex_unit *clone_parsed_spec(THD *thd, TABLE_LIST *with_table);
+ st_select_lex_unit *clone_parsed_spec(LEX *old_lex, TABLE_LIST *with_table);
- bool is_referenced() { return references != 0; }
+ bool is_referenced() { return referenced; }
void inc_references() { references++; }
@@ -263,6 +320,12 @@ class With_element : public Sql_alloc
void set_cycle_list(List<Lex_ident_sys> *cycle_list_arg);
friend class With_clause;
+
+ friend
+ bool LEX::resolve_references_to_cte(TABLE_LIST *tables,
+ TABLE_LIST **tables_last);
+ friend
+ bool LEX::resolve_references_to_cte_in_hanging_cte();
};
const uint max_number_of_elements_in_with_clause= sizeof(table_map)*8;
@@ -361,8 +424,10 @@ class With_clause : public Sql_alloc
friend class With_element;
friend
- bool
- check_dependencies_in_with_clauses(With_clause *with_clauses_list);
+ bool LEX::check_dependencies_in_with_clauses();
+
+ friend
+ bool LEX::resolve_references_to_cte_in_hanging_cte();
};
inline
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index f16102d..de323ae 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -1255,6 +1255,8 @@ void LEX::start(THD *thd_arg)
explain_json= false;
context_analysis_only= 0;
derived_tables= 0;
+ with_cte_resolution= false;
+ only_cte_resolution= false;
safe_to_cache_query= 1;
parsing_options.reset();
empty_field_list_on_rset= 0;
@@ -2911,6 +2913,7 @@ void st_select_lex_unit::init_query()
is_view= false;
with_clause= 0;
with_element= 0;
+ cloned_from= 0;
columns_are_renamed= false;
with_wrapped_tvc= false;
have_except_all_or_intersect_all= false;
@@ -8997,6 +9000,8 @@ bool LEX::check_main_unit_semantics()
if (unit.set_nest_level(0) ||
unit.check_parameters(first_select_lex()))
return TRUE;
+ if (check_cte_dependencies_and_resolve_references())
+ return TRUE;
return FALSE;
}
@@ -9703,8 +9708,8 @@ bool LEX::main_select_push(bool service)
{
DBUG_ENTER("LEX::main_select_push");
DBUG_PRINT("info", ("service: %u", service));
- current_select_number= 1;
- builtin_select.select_number= 1;
+ current_select_number= ++thd->lex->stmt_lex->current_select_number;
+ builtin_select.select_number= current_select_number;
builtin_select.is_service_select= service;
if (push_select(&builtin_select))
DBUG_RETURN(TRUE);
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 45a3bf7..4c15c75 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -944,6 +944,8 @@ class st_select_lex_unit: public st_select_lex_node {
With_clause *with_clause;
/* With element where this unit is used as the specification (if any) */
With_element *with_element;
+ /* The unit used as a CTE specification from which this unit is cloned */
+ st_select_lex_unit *cloned_from;
/* thread handler */
THD *thd;
/*
@@ -1543,7 +1545,9 @@ class st_select_lex: public st_select_lex_node
}
With_element *get_with_element()
{
- return master_unit()->with_element;
+ return master_unit()->cloned_from ?
+ master_unit()->cloned_from->with_element :
+ master_unit()->with_element;
}
With_element *find_table_def_in_with_clauses(TABLE_LIST *table);
bool check_unrestricted_recursive(bool only_standard_compliant);
@@ -3389,6 +3393,20 @@ struct LEX: public Query_tables_list
*/
uint8 derived_tables;
uint8 context_analysis_only;
+ /*
+ true <=> The parsed fragment requires resolution of references to CTE
+ at the end of parsing. This name resolution process involves searching
+ for possible dependencies between CTE defined in the parsed fragment and
+ detecting possible recursive references.
+ The flag is set to true if the fragment contains CTE definitions.
+ */
+ bool with_cte_resolution;
+ /*
+ true <=> only resolution of references to CTE are required in the parsed
+ fragment, no checking of dependencies between CTE is required.
+ This flag is used only when parsing clones of CTE specifications.
+ */
+ bool only_cte_resolution;
bool local_file;
bool check_exists;
bool autocommit;
@@ -4725,6 +4743,12 @@ struct LEX: public Query_tables_list
const LEX_CSTRING *constraint_name,
Table_ident *ref_table_name,
DDL_options ddl_options);
+ bool check_dependencies_in_with_clauses();
+ bool resolve_references_to_cte_in_hanging_cte();
+ bool check_cte_dependencies_and_resolve_references();
+ bool resolve_references_to_cte(TABLE_LIST *tables,
+ TABLE_LIST **tables_last);
+
};
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 0e2b80b..e9252f0 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -3594,9 +3594,6 @@ mysql_execute_command(THD *thd)
thd->get_stmt_da()->opt_clear_warning_info(thd->query_id);
}
- if (check_dependencies_in_with_clauses(thd->lex->with_clauses_list))
- DBUG_RETURN(1);
-
#ifdef HAVE_REPLICATION
if (unlikely(thd->slave_thread))
{
@@ -8291,7 +8288,7 @@ TABLE_LIST *st_select_lex::add_table_to_list(THD *thd,
ptr->is_fqtn= TRUE;
ptr->db= table->db;
}
- else if (lex->copy_db_to(&ptr->db))
+ else if (!lex->with_cte_resolution && lex->copy_db_to(&ptr->db))
DBUG_RETURN(0);
else
ptr->is_fqtn= FALSE;
@@ -8308,7 +8305,9 @@ TABLE_LIST *st_select_lex::add_table_to_list(THD *thd,
}
ptr->table_name= table->table;
- ptr->lock_type= lock_type;
+ ptr->lock_type= lock_type;
+ ptr->mdl_type= mdl_type;
+ ptr->table_options= table_options;
ptr->updating= MY_TEST(table_options & TL_OPTION_UPDATING);
/* TODO: remove TL_OPTION_FORCE_INDEX as it looks like it's not used */
ptr->force_index= MY_TEST(table_options & TL_OPTION_FORCE_INDEX);
@@ -8989,8 +8988,10 @@ void st_select_lex::set_lock_for_tables(thr_lock_type lock_type, bool for_update
{
tables->lock_type= lock_type;
tables->updating= for_update;
- tables->mdl_request.set_type((lock_type >= TL_WRITE_ALLOW_WRITE) ?
- MDL_SHARED_WRITE : MDL_SHARED_READ);
+
+ if (tables->db.str && tables->db.str[0])
+ tables->mdl_request.set_type((lock_type >= TL_WRITE_ALLOW_WRITE) ?
+ MDL_SHARED_WRITE : MDL_SHARED_READ);
}
DBUG_VOID_RETURN;
}
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index 21958e2..5fac907 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -2321,9 +2321,6 @@ static bool check_prepared_statement(Prepared_statement *stmt)
if (tables)
thd->get_stmt_da()->opt_clear_warning_info(thd->query_id);
- if (check_dependencies_in_with_clauses(thd->lex->with_clauses_list))
- goto error;
-
if (sql_command_flags[sql_command] & CF_HA_CLOSE)
mysql_ha_rm_tables(thd, tables);
diff --git a/sql/sql_view.cc b/sql/sql_view.cc
index cfd43bd..01fd424 100644
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@ -431,12 +431,6 @@ bool mysql_create_view(THD *thd, TABLE_LIST *views,
lex->link_first_table_back(view, link_to_local);
view->open_type= OT_BASE_ONLY;
- if (check_dependencies_in_with_clauses(lex->with_clauses_list))
- {
- res= TRUE;
- goto err_no_relink;
- }
-
WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
/*
@@ -1419,9 +1413,6 @@ bool mysql_make_view(THD *thd, TABLE_SHARE *share, TABLE_LIST *table,
TABLE_LIST *tbl;
Security_context *security_ctx= 0;
- if (check_dependencies_in_with_clauses(thd->lex->with_clauses_list))
- goto err;
-
/*
Check rights to run commands which show underlying tables.
In the optimizer trace we would not like to show trace for
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index 1a045c5..ec72e39 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -280,6 +280,7 @@ void _CONCAT_UNDERSCORED(turn_parser_debug_on,yyparse)()
class sp_head *sphead;
class sp_name *spname;
class sp_variable *spvar;
+ class With_element_head *with_element_head;
class With_clause *with_clause;
class Virtual_column_info *virtual_column;
@@ -1753,7 +1754,7 @@ End SQL_MODE_ORACLE_SPECIFIC */
%type <with_clause> with_clause
-%type <lex_str_ptr> query_name
+%type <with_element_head> with_element_head
%type <ident_sys_list>
comma_separated_ident_list
@@ -2946,7 +2947,11 @@ call:
if (unlikely(Lex->call_statement_start(thd, $2)))
MYSQL_YYABORT;
}
- opt_sp_cparam_list {}
+ opt_sp_cparam_list
+ {
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
+ }
;
/* CALL parameters */
@@ -3753,6 +3758,8 @@ expr_lex:
$$->sp_lex_in_use= true;
$$->set_item($2);
Lex->pop_select(); //min select
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
if ($$->sphead->restore_lex(thd))
MYSQL_YYABORT;
}
@@ -12649,6 +12656,8 @@ do:
{
Lex->insert_list= $3;
Lex->pop_select(); //main select
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
}
;
@@ -14831,6 +14840,7 @@ with_clause:
if (unlikely(with_clause == NULL))
MYSQL_YYABORT;
lex->derived_tables|= DERIVED_WITH;
+ lex->with_cte_resolution= true;
lex->curr_with_clause= with_clause;
with_clause->add_to_list(Lex->with_clauses_list_last_next);
if (lex->current_select &&
@@ -14858,7 +14868,7 @@ with_list:
with_list_element:
- query_name
+ with_element_head
opt_with_column_list
AS '(' query_expression ')' opt_cycle
{
@@ -14876,6 +14886,7 @@ with_list_element:
{
elem->set_cycle_list($7);
}
+ elem->set_tables_end_pos(lex->query_tables_last);
}
;
@@ -14936,12 +14947,15 @@ comma_separated_ident_list:
;
-query_name:
+with_element_head:
ident
{
- $$= (LEX_CSTRING *) thd->memdup(&$1, sizeof(LEX_CSTRING));
- if (unlikely($$ == NULL))
+ LEX_CSTRING *name=
+ (LEX_CSTRING *) thd->memdup(&$1, sizeof(LEX_CSTRING));
+ $$= new (thd->mem_root) With_element_head(name);
+ if (unlikely(name == NULL || $$ == NULL))
MYSQL_YYABORT;
+ $$->tables_pos.set_start_pos(Lex->query_tables_last);
}
;
diff --git a/sql/table.h b/sql/table.h
index 58789dc..8efab4c 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -2109,6 +2109,29 @@ struct vers_select_conds_t
struct LEX;
class Index_hint;
+
+/*
+ @struct TABLE_CHAIN
+ @brief Subchain of global chain of table references
+
+ The structure contains a pointer to the address of the next_global
+ pointer to the first TABLE_LIST object of the subchain and the address
+ of the next_global pointer to the element right after the last
+ TABLE_LIST object of the subchain. For an empty subchain both pointers
+ have the same value.
+*/
+
+struct TABLE_CHAIN
+{
+ TABLE_CHAIN() {}
+
+ TABLE_LIST **start_pos;
+ TABLE_LIST ** end_pos;
+
+ void set_start_pos(TABLE_LIST **pos) { start_pos= pos; }
+ void set_end_pos(TABLE_LIST **pos) { end_pos= pos; }
+};
+
struct TABLE_LIST
{
TABLE_LIST() {} /* Remove gcc warning */
@@ -2443,6 +2466,20 @@ struct TABLE_LIST
/* call back function for asking handler about caching in query cache */
qc_engine_callback callback_func;
thr_lock_type lock_type;
+
+ /*
+ Two fields below are set during parsing this table reference in the cases
+ when the table reference can be potentially a reference to a CTE table.
+ In this cases the fact that the reference is a reference to a CTE or not
+ will be ascertained at the very end of parsing of the query when references
+ to CTE are resolved. For references to CTE and to derived tables no mdl
+ requests are needed while for other table references they are. If a request
+ is possibly postponed the info that allows to issue this request must be
+ saved in 'mdl_type' and 'table_options'.
+ */
+ enum_mdl_type mdl_type;
+ ulong table_options;
+
uint outer_join; /* Which join type */
uint shared; /* Used in multi-upd */
bool updatable; /* VIEW/TABLE can be updated now */
1
0
[Commits] 04de651: MDEV-23886 Reusing CTE inside a function fails with table doesn't exist
by IgorBabaev 25 May '21
by IgorBabaev 25 May '21
25 May '21
revision-id: 04de651725c3eeee8f216c55e2f8133e4547fadb (mariadb-10.4.11-630-g04de651)
parent(s): 67083ca4f3dd11f44810e22d370e6c3b01e3bc54
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-05-25 00:43:03 -0700
message:
MDEV-23886 Reusing CTE inside a function fails with table doesn't exist
In the code existed just before this patch binding of a table reference to
the specification of the corresponding CTE happens in the function
open_and_process_table(). If the table reference is not the first in the
query the specification is cloned in the same way as the specification of
a view is cloned for any reference of the view. This works fine for
standalone queries, but does not work for stored procedures / functions
for the following reason.
When the first call of a stored procedure/ function SP is processed the
body of SP is parsed. When a query of SP is parsed the info on each
encountered table reference is put into a TABLE_LIST object linked into
a global chain associated with the query. When parsing of the query is
finished the basic info on the table references from this chain except
table references to derived tables and information schema tables is put
in one hash table associated with SP. When parsing of the body of SP is
finished this hash table is used to construct TABLE_LIST objects for all
table references mentioned in SP and link them into the list of such
objects passed to a pre-locking process that calls open_and_process_table()
for each table from the list.
When a TABLE_LIST for a view is encountered the view is opened and its
specification is parsed. For any table reference occurred in
the specification a new TABLE_LIST object is created to be included into
the list for pre-locking. After all objects in the pre-locking have been
looked through the tables mentioned in the list are locked. Note that the
objects referenced CTEs are just skipped here as it is impossible to
resolve these references without any info on the context where they occur.
Now the statements from the body of SP are executed one by one.
At the very beginning of the execution of a query the tables used in the
query are opened and open_and_process_table() now is called for each table
reference mentioned in the list of TABLE_LIST objects associated with the
query that was built when the query was parsed.
For each table reference first the reference is checked against CTEs
definitions in whose scope it occurred. If such definition is found the
reference is considered resolved and if this is not the first reference
to the found CTE then the specification of the CTE is re-parsed and the
result of the parsing is added to the parsing tree of the query as a
sub-tree. If this sub-tree contains table references to other tables they
are added to the list of TABLE_LIST objects associated with the query in
order the referenced tables to be opened. When the procedure that opens
the tables comes to the TABLE_LIST object created for a non-first
reference to a CTE it discovers that the referenced table instance is not
locked and reports an error.
Thus processing non-first table references to a CTE similar to how
references to view are processed does not work for queries used in stored
procedures / functions. And the main problem is that the current
pre-locking mechanism employed for stored procedures / functions does not
allow saving the context in which a CTE reference occurs. It's not trivial
to save the info about the context where a CTE reference occurs while the
resolution of the table reference cannot be done without this context and
consequentially the specification for the table reference cannot be
determined.
This patch solves the above problem by moving resolution of all CTE
references at the parsing stage. More exactly references to CTEs occurred in
a query are resolved right after parsing of the query has finished. After
resolution any CTE reference is marked as a reference to a derived
table. So it is excluded from the hash table created for pre-locking used
base tables and view when the first call of a stored procedure / function
is processed.
This solution required recursive calls of the parser. The function
THD::sql_parser() has been added specifically for recursive invocations of
the parser.
---
mysql-test/main/cte_nonrecursive.result | 201 +++++++++++++++
mysql-test/main/cte_nonrecursive.test | 202 +++++++++++++++
sql/item_subselect.cc | 1 -
sql/sp_head.cc | 3 +-
sql/sql_base.cc | 33 +--
sql/sql_class.cc | 56 +++++
sql/sql_class.h | 8 +-
sql/sql_cte.cc | 420 ++++++++++++++++++++++----------
sql/sql_cte.h | 85 ++++++-
sql/sql_lex.cc | 9 +-
sql/sql_lex.h | 25 +-
sql/sql_parse.cc | 15 +-
sql/sql_prepare.cc | 3 -
sql/sql_view.cc | 9 -
sql/sql_yacc.yy | 26 +-
sql/sql_yacc_ora.yy | 26 +-
sql/table.h | 37 +++
17 files changed, 948 insertions(+), 211 deletions(-)
diff --git a/mysql-test/main/cte_nonrecursive.result b/mysql-test/main/cte_nonrecursive.result
index f50ac50..d647057 100644
--- a/mysql-test/main/cte_nonrecursive.result
+++ b/mysql-test/main/cte_nonrecursive.result
@@ -1763,6 +1763,207 @@ a c
2 1
7 3
drop table t1;
+#
+# MDEV-23886: Stored Function returning the result of a query
+# that uses CTE over a table twice
+#
+create table t1 (c1 int);
+insert into t1 values (1),(2),(6);
+create function f1() returns int return
+( with cte1 as (select c1 from t1)
+select sum(c1) from
+(select * from cte1 union all select * from cte1) dt
+);
+select f1();
+f1()
+18
+create function f2() returns int return
+( with cte1 as (select c1 from t1)
+select sum(s.c1) from cte1 as s, cte1 as t where s.c1=t.c1
+);
+select f2();
+f2()
+9
+create function f3() returns int return
+( with cte1 as (select c1 from t1)
+select
+case
+when exists(select 1 from cte1 where c1 between 1 and 2) then 1
+when exists(select 1 from cte1 where c1 between 5 and 6) then 2
+else 0
+end
+);
+select f3();
+f3()
+1
+create view v1 as (select c1 from t1);
+create function f4() returns int return
+( select sum(c1) from
+(select * from v1 union all select * from v1) dt
+);
+select f4();
+f4()
+18
+create function f5() returns int return
+( select sum(s.c1) from v1 as s, v1 as t where s.c1=t.c1
+);
+select f5();
+f5()
+9
+create view v2(s) as
+with cte1 as (select c1 from t1)
+select sum(c1) from (select * from cte1 union all select * from cte1) dt;
+create function f6() returns int return
+(select s from v2);
+select f6();
+f6()
+18
+create function f7() returns int return
+( select r.s from v2 as r, v2 as t where r.s=t.s
+);
+select f7();
+f7()
+18
+select f5() + f6();
+f5() + f6()
+27
+prepare stmt from "select f5() + f6();";
+execute stmt;
+f5() + f6()
+27
+execute stmt;
+f5() + f6()
+27
+deallocate prepare stmt;
+drop function f1;
+drop function f2;
+drop function f3;
+drop function f4;
+drop function f5;
+drop function f6;
+drop function f7;
+drop view v1;
+drop view v2;
+create table t2 (a int, b int);
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 > 5;
+select * from t2;
+a b
+6 6
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+call p1();
+select * from t2;
+a b
+6 6
+2 2
+drop procedure p1;
+# checking CTE resolution for queries with hanging CTEs
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from cte3;
+a b
+1 2
+select * from t2;
+a b
+6 6
+2 2
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+a b
+6 6
+2 2
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where c1 >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+ERROR 42S22: Unknown column 'c1' in 'where clause'
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.c1)
+select * from t2;
+ERROR 42S22: Unknown column 'cte2.c1' in 'where clause'
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from cte2;
+a b
+1 1
+2 2
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from t2;
+a b
+6 6
+2 2
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=c1)
+select * from t2;
+ERROR 23000: Column 'c1' in where clause is ambiguous
+with cte3 as
+( with cte2(a,b) as
+( with cte1 as (select * from t1 where c1 <= 2)
+select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from cte3;
+a b
+1 1
+2 1
+1 2
+2 2
+with cte3 as
+( with cte2(a,b) as
+( with cte1 as (select * from t1 where c1 <= 2)
+select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from t2;
+a b
+6 6
+2 2
+with cte3 as
+( with cte2(a,b) as
+( with cte1 as (select * from t1 where c1 <= 2)
+select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select r1.c1,r2.c1 from cte2 as r1, cte2 as r2)
+select * from t2;
+ERROR 42S22: Unknown column 'r1.c1' in 'field list'
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+call p1();
+select * from t2;
+a b
+6 6
+2 2
+2 2
+drop procedure p1;
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select a from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+call p1();
+ERROR 42S22: Unknown column 'a' in 'field list'
+drop procedure p1;
+drop table t1,t2;
# End of 10.2 tests
#
# MDEV-21673: several references to CTE that uses
diff --git a/mysql-test/main/cte_nonrecursive.test b/mysql-test/main/cte_nonrecursive.test
index 49df2bb..c2a0a39 100644
--- a/mysql-test/main/cte_nonrecursive.test
+++ b/mysql-test/main/cte_nonrecursive.test
@@ -1261,6 +1261,208 @@ select a, c from cte as r2 where a > 4;
drop table t1;
+--echo #
+--echo # MDEV-23886: Stored Function returning the result of a query
+--echo # that uses CTE over a table twice
+--echo #
+
+create table t1 (c1 int);
+insert into t1 values (1),(2),(6);
+
+create function f1() returns int return
+( with cte1 as (select c1 from t1)
+ select sum(c1) from
+ (select * from cte1 union all select * from cte1) dt
+);
+select f1();
+
+create function f2() returns int return
+( with cte1 as (select c1 from t1)
+ select sum(s.c1) from cte1 as s, cte1 as t where s.c1=t.c1
+);
+select f2();
+
+create function f3() returns int return
+( with cte1 as (select c1 from t1)
+ select
+ case
+ when exists(select 1 from cte1 where c1 between 1 and 2) then 1
+ when exists(select 1 from cte1 where c1 between 5 and 6) then 2
+ else 0
+ end
+);
+select f3();
+
+create view v1 as (select c1 from t1);
+
+create function f4() returns int return
+( select sum(c1) from
+ (select * from v1 union all select * from v1) dt
+);
+select f4();
+
+create function f5() returns int return
+( select sum(s.c1) from v1 as s, v1 as t where s.c1=t.c1
+);
+select f5();
+
+create view v2(s) as
+with cte1 as (select c1 from t1)
+select sum(c1) from (select * from cte1 union all select * from cte1) dt;
+
+create function f6() returns int return
+(select s from v2);
+select f6();
+
+create function f7() returns int return
+( select r.s from v2 as r, v2 as t where r.s=t.s
+);
+select f7();
+
+select f5() + f6();
+
+prepare stmt from "select f5() + f6();";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+drop function f1;
+drop function f2;
+drop function f3;
+drop function f4;
+drop function f5;
+drop function f6;
+drop function f7;
+
+drop view v1;
+drop view v2;
+
+create table t2 (a int, b int);
+
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 > 5;
+
+select * from t2;
+
+delimiter |;
+
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from cte1 as s, cte1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+
+delimiter ;|
+
+call p1();
+select * from t2;
+
+drop procedure p1;
+
+--echo # checking CTE resolution for queries with hanging CTEs
+
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from cte3;
+
+select * from t2;
+
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+
+--error ER_BAD_FIELD_ERROR
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where c1 >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.b)
+select * from t2;
+
+--error ER_BAD_FIELD_ERROR
+with
+cte1(a) as (select * from t1 where c1 <= 2),
+cte2(b) as (select * from cte1 where a >= 2),
+cte3 as (select * from cte1,cte2 where cte1.a < cte2.c1)
+select * from t2;
+
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from cte2;
+
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+select * from t2;
+
+--error ER_NON_UNIQ_ERROR
+with
+cte1 as (select * from t1 where c1 <= 2),
+cte2(a,b) as (select * from cte1 as s1, cte1 as s2 where s1.c1=c1)
+select * from t2;
+
+with cte3 as
+( with cte2(a,b) as
+ ( with cte1 as (select * from t1 where c1 <= 2)
+ select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+ select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from cte3;
+
+with cte3 as
+( with cte2(a,b) as
+ ( with cte1 as (select * from t1 where c1 <= 2)
+ select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+ select r1.a,r2.b from cte2 as r1, cte2 as r2)
+select * from t2;
+
+--error ER_BAD_FIELD_ERROR
+with cte3 as
+( with cte2(a,b) as
+ ( with cte1 as (select * from t1 where c1 <= 2)
+ select * from cte1 as s1, cte1 as s2 where s1.c1=s2.c1)
+ select r1.c1,r2.c1 from cte2 as r1, cte2 as r2)
+select * from t2;
+
+delimiter |;
+
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select c1 from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+
+delimiter ;|
+
+call p1();
+select * from t2;
+
+drop procedure p1;
+
+delimiter |;
+
+create procedure p1()
+begin
+insert into t2
+with cte1 as (select a from t1)
+select * from t1 as s, t1 as t where s.c1=t.c1 and s.c1 <= 2 and t.c1 >= 2;
+end |
+
+delimiter ;|
+
+--error ER_BAD_FIELD_ERROR
+call p1();
+
+drop procedure p1;
+
+drop table t1,t2;
+
--echo # End of 10.2 tests
--echo #
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 53a6847..0fd6454 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -1810,7 +1810,6 @@ double Item_in_subselect::val_real()
As far as Item_in_subselect called only from Item_in_optimizer this
method should not be used
*/
- DBUG_ASSERT(0);
DBUG_ASSERT(fixed == 1);
if (forced_const)
return value;
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index aa4f809..3ea4938 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -3411,8 +3411,7 @@ sp_lex_keeper::reset_lex_and_exec_core(THD *thd, uint *nextp,
Json_writer_object trace_command(thd);
Json_writer_array trace_command_steps(thd, "steps");
if (open_tables)
- res= check_dependencies_in_with_clauses(m_lex->with_clauses_list) ||
- instr->exec_open_and_lock_tables(thd, m_lex->query_tables);
+ res= instr->exec_open_and_lock_tables(thd, m_lex->query_tables);
if (likely(!res))
{
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index 41f02fb..d8baa11 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -3695,7 +3695,11 @@ open_and_process_table(THD *thd, TABLE_LIST *tables, uint *counter, uint flags,
if (tables->derived)
{
if (!tables->view)
+ {
+ if (!tables->is_derived())
+ tables->set_derived();
goto end;
+ }
/*
We restore view's name and database wiped out by derived tables
processing and fall back to standard open process in order to
@@ -3705,35 +3709,6 @@ open_and_process_table(THD *thd, TABLE_LIST *tables, uint *counter, uint flags,
tables->db= tables->view_db;
tables->table_name= tables->view_name;
}
- else if (tables->select_lex)
- {
- /*
- Check whether 'tables' refers to a table defined in a with clause.
- If so set the reference to the definition in tables->with.
- */
- if (!tables->with)
- tables->with= tables->select_lex->find_table_def_in_with_clauses(tables);
- /*
- If 'tables' is defined in a with clause set the pointer to the
- specification from its definition in tables->derived.
- */
- if (tables->with)
- {
- if (tables->is_recursive_with_table() &&
- !tables->is_with_table_recursive_reference())
- {
- tables->with->rec_outer_references++;
- With_element *with_elem= tables->with;
- while ((with_elem= with_elem->get_next_mutually_recursive()) !=
- tables->with)
- with_elem->rec_outer_references++;
- }
- if (tables->set_as_with_table(thd, tables->with))
- DBUG_RETURN(1);
- else
- goto end;
- }
- }
if (!tables->derived && is_infoschema_db(&tables->db))
{
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 7864623..7e43605 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -2790,6 +2790,62 @@ void THD::close_active_vio()
#endif
+/*
+ @brief MySQL parser used for recursive invocations
+
+ @param old_lex The LEX structure in the state when this parser
+ is called recursively
+ @param lex The LEX structure used to parse a new SQL fragment
+ @param str The SQL fragment to parse
+ @param str_len The length of the SQL fragment to parse
+ @param stmt_prepare_mode true <=> when parsing a prepare statement
+
+ @details
+ This function is to be used when parsing of an SQL fragment is
+ needed within one of the grammar rules.
+
+ @notes
+ Currently the function is used only when the specification of a CTE
+ is parsed for the not first and not recursive references of the CTE.
+
+ @retval false On a successful parsing of the fragment
+ @retval true Otherwise
+*/
+
+bool THD::sql_parser(LEX *old_lex, LEX *lex,
+ char *str, uint str_len, bool stmt_prepare_mode)
+{
+ extern int MYSQLparse(THD * thd);
+ extern int ORAparse(THD * thd);
+
+ bool parse_status= false;
+ Parser_state parser_state;
+ Parser_state *old_parser_state= m_parser_state;
+
+ if (parser_state.init(this, str, str_len))
+ return true;
+
+ m_parser_state= &parser_state;
+ parser_state.m_lip.stmt_prepare_mode= stmt_prepare_mode;
+ parser_state.m_lip.multi_statements= false;
+ parser_state.m_lip.m_digest= NULL;
+
+ lex->param_list= old_lex->param_list;
+ lex->sphead= old_lex->sphead;
+ lex->spname= old_lex->spname;
+ lex->spcont= old_lex->spcont;
+ lex->sp_chistics= old_lex->sp_chistics;
+ lex->trg_chistics= old_lex->trg_chistics;
+
+ parse_status= (variables.sql_mode & MODE_ORACLE) ?
+ ORAparse(this) : MYSQLparse(this) != 0;
+
+ m_parser_state= old_parser_state;
+
+ return parse_status;
+}
+
+
struct Item_change_record: public ilink
{
Item **place;
diff --git a/sql/sql_class.h b/sql/sql_class.h
index f754524..cb34067 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -4244,14 +4244,11 @@ class THD: public THD_count, /* this must be first */
to resolve all CTE names as we don't need this message to be thrown
for any CTE references.
*/
- if (!lex->with_clauses_list)
+ if (!lex->with_cte_resolution)
{
my_message(ER_NO_DB_ERROR, ER(ER_NO_DB_ERROR), MYF(0));
return TRUE;
}
- /* This will allow to throw an error later for non-CTE references */
- to->str= NULL;
- to->length= 0;
return FALSE;
}
@@ -5047,6 +5044,9 @@ class THD: public THD_count, /* this must be first */
Item *sp_prepare_func_item(Item **it_addr, uint cols= 1);
bool sp_eval_expr(Field *result_field, Item **expr_item_ptr);
+ bool sql_parser(LEX *old_lex, LEX *lex,
+ char *str, uint str_len, bool stmt_prepare_mode);
+
};
/** A short cut for thd->get_stmt_da()->set_ok_status(). */
diff --git a/sql/sql_cte.cc b/sql/sql_cte.cc
index f3861eb..dfcb4e1 100644
--- a/sql/sql_cte.cc
+++ b/sql/sql_cte.cc
@@ -83,7 +83,7 @@ void st_select_lex_unit::set_with_clause(With_clause *with_cl)
true on failure
*/
-bool check_dependencies_in_with_clauses(With_clause *with_clauses_list)
+bool LEX::check_dependencies_in_with_clauses()
{
for (With_clause *with_clause= with_clauses_list;
with_clause;
@@ -101,6 +101,201 @@ bool check_dependencies_in_with_clauses(With_clause *with_clauses_list)
/**
@brief
+ Resolve references to CTE in specification of hanging CTE
+
+ @details
+ A CTE to which there are no references in the query is called hanging CTE.
+ Although such CTE is not used for execution its specification must be
+ subject to context analysis. All errors concerning references to
+ non-existing tables or fields occurred in the specification must be
+ reported as well as all other errors caught at the prepare stage.
+ The specification of a hanging CTE might contain references to other
+ CTE outside of the specification and within it if the specification
+ contains a with clause. This function resolves all such references for
+ all hanging CTEs encountered in the processed query.
+
+ @retval
+ false on success
+ true on failure
+*/
+
+bool
+LEX::resolve_references_to_cte_in_hanging_cte()
+{
+ for (With_clause *with_clause= with_clauses_list;
+ with_clause; with_clause= with_clause->next_with_clause)
+ {
+ for (With_element *with_elem= with_clause->with_list.first;
+ with_elem; with_elem= with_elem->next)
+ {
+ if (!with_elem->is_referenced())
+ {
+ TABLE_LIST *first_tbl=
+ with_elem->spec->first_select()->table_list.first;
+ TABLE_LIST **with_elem_end_pos= with_elem->head->tables_pos.end_pos;
+ if (first_tbl && resolve_references_to_cte(first_tbl, with_elem_end_pos))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+/**
+ @brief
+ Resolve table references to CTE from a sub-chain of table references
+
+ @param tables Points to the beginning of the sub-chain
+ @param tables_last Points to the address with the sub-chain barrier
+
+ @details
+ The method resolves tables references to CTE from the chain of
+ table references specified by the parameters 'tables' and 'tables_last'.
+ It resolves the references against the CTE definition occurred in a query
+ or the specification of a CTE whose parsing tree is represented by
+ this LEX structure. The method is always called right after the process
+ of parsing the query or of the specification of a CTE has been finished,
+ thus the chain of table references used in the parsed fragment has been
+ already built. It is assumed that parameters of the method specify a
+ sub-chain of this chain.
+ If a table reference can be potentially a table reference to a CTE and it
+ has not been resolved yet then the method tries to find the definition
+ of the CTE against which the reference can be resolved. If it succeeds
+ it sets the field TABLE_LIST::with to point to the found definition.
+ It also sets the field TABLE_LIST::derived to point to the specification
+ of the found CTE and sets TABLE::db.str to empty_c_string. This will
+ allow to handle this table reference like a reference to a derived table.
+ If another table reference has been already resolved against this CTE
+ and this CTE is not recursive then a clone of the CTE specification is
+ constructed using the function With_element::clone_parsed_spec() and
+ TABLE_LIST::derived is set to point to this clone rather than to the
+ original specification.
+ If the method does not find a matched CTE definition in the parsed fragment
+ then in the case when the flag this->only_cte_resolution is set to true
+ it just moves to the resolution of the next table reference from the
+ specified sub-chain while in the case when this->only_cte_resolution is set
+ to false the method additionally sets an mdl request for this table
+ reference.
+
+ @notes
+ The flag this->only_cte_resolution is set to true in the cases when
+ the failure to resolve a table reference as a CTE reference within
+ the fragment associated with this LEX structure does not imply that
+ this table reference cannot be resolved as such at all.
+
+ @retval false On success: no errors reported, no memory allocations failed
+ @retval true Otherwise
+*/
+
+bool LEX::resolve_references_to_cte(TABLE_LIST *tables,
+ TABLE_LIST **tables_last)
+{
+ With_element *with_elem= 0;
+
+ for (TABLE_LIST *tbl= tables; tbl != *tables_last; tbl= tbl->next_global)
+ {
+ if (tbl->derived)
+ continue;
+ if (!tbl->db.str && !tbl->with)
+ tbl->with= tbl->select_lex->find_table_def_in_with_clauses(tbl);
+ if (!tbl->with) // no CTE matches table reference tbl
+ {
+ if (only_cte_resolution)
+ continue;
+ if (!tbl->db.str) // no database specified in table reference tbl
+ {
+ if (!thd->db.str) // no default database is set
+ {
+ my_message(ER_NO_DB_ERROR, ER(ER_NO_DB_ERROR), MYF(0));
+ return true;
+ }
+ if (copy_db_to(&tbl->db))
+ return true;
+ if (!(tbl->table_options & TL_OPTION_ALIAS))
+ tbl->mdl_request.init(MDL_key::TABLE, tbl->db.str,
+ tbl->table_name.str,
+ tbl->mdl_type, MDL_TRANSACTION);
+ tbl->mdl_request.set_type((tbl->lock_type >= TL_WRITE_ALLOW_WRITE) ?
+ MDL_SHARED_WRITE : MDL_SHARED_READ);
+ }
+ continue;
+ }
+ with_elem= tbl->with;
+ if (tbl->is_recursive_with_table() &&
+ !tbl->is_with_table_recursive_reference())
+ {
+ tbl->with->rec_outer_references++;
+ while ((with_elem= with_elem->get_next_mutually_recursive()) !=
+ tbl->with)
+ with_elem->rec_outer_references++;
+ }
+ if (!with_elem->is_used_in_query || with_elem->is_recursive)
+ {
+ tbl->derived= with_elem->spec;
+ if (tbl->derived != tbl->select_lex->master_unit() &&
+ !with_elem->is_recursive &&
+ !tbl->is_with_table_recursive_reference())
+ {
+ tbl->derived->move_as_slave(tbl->select_lex);
+ }
+ with_elem->is_used_in_query= true;
+ }
+ else
+ {
+ if (!(tbl->derived= tbl->with->clone_parsed_spec(thd->lex, tbl)))
+ return true;
+ }
+ tbl->db.str= empty_c_string;
+ tbl->db.length= 0;
+ tbl->schema_table= 0;
+ if (tbl->derived)
+ {
+ tbl->derived->first_select()->set_linkage(DERIVED_TABLE_TYPE);
+ tbl->select_lex->add_statistics(tbl->derived);
+ }
+ if (tbl->with->is_recursive && tbl->is_with_table_recursive_reference())
+ continue;
+ with_elem->inc_references();
+ }
+ return false;
+}
+
+
+/**
+ @brief
+ Find out dependencies between CTEs, resolve references to them
+
+ @details
+ The function can be called in two modes. With this->with_cte_resolution
+ set to false the function only finds out all dependencies between CTEs
+ used in a query expression with a WITH clause whose parsing has been
+ just finished. Based on these dependencies recursive CTEs are detected.
+ If this->with_cte_resolution is set to true the function additionally
+ resolves all references to CTE occurred in this query expression.
+
+ @retval
+ true on failure
+ false on success
+*/
+
+bool
+LEX::check_cte_dependencies_and_resolve_references()
+{
+ if (check_dependencies_in_with_clauses())
+ return true;
+ if (!with_cte_resolution)
+ return false;
+ if (resolve_references_to_cte(query_tables, query_tables_last))
+ return true;
+ if (resolve_references_to_cte_in_hanging_cte())
+ return true;
+ return false;
+}
+
+
+/**
+ @brief
Check dependencies between tables defined in this with clause
@details
@@ -137,10 +332,11 @@ bool With_clause::check_dependencies()
elem != with_elem;
elem= elem->next)
{
- if (lex_string_cmp(system_charset_info, with_elem->query_name,
- elem->query_name) == 0)
+ if (lex_string_cmp(system_charset_info, with_elem->get_name(),
+ elem->get_name()) == 0)
{
- my_error(ER_DUP_QUERY_NAME, MYF(0), with_elem->query_name->str);
+ my_error(ER_DUP_QUERY_NAME, MYF(0),
+ with_elem->get_name_str());
return true;
}
}
@@ -247,13 +443,12 @@ With_element *With_clause::find_table_def(TABLE_LIST *table,
with_elem != barrier;
with_elem= with_elem->next)
{
- if (my_strcasecmp(system_charset_info, with_elem->query_name->str,
- table->table_name.str) == 0 &&
+ if (my_strcasecmp(system_charset_info, with_elem->get_name_str(),
+ table->table_name.str) == 0 &&
!table->is_fqtn)
{
table->set_derived();
- table->db.str= empty_c_string;
- table->db.length= 0;
+ with_elem->referenced= true;
return with_elem;
}
}
@@ -610,7 +805,7 @@ bool With_clause::check_anchors()
if (elem == with_elem)
{
my_error(ER_RECURSIVE_WITHOUT_ANCHORS, MYF(0),
- with_elem->query_name->str);
+ with_elem->get_name_str());
return true;
}
}
@@ -643,7 +838,7 @@ bool With_clause::check_anchors()
if (elem->work_dep_map & elem->get_elem_map())
{
my_error(ER_UNACCEPTABLE_MUTUAL_RECURSION, MYF(0),
- with_elem->query_name->str);
+ with_elem->get_name_str());
return true;
}
}
@@ -797,7 +992,8 @@ bool With_element::set_unparsed_spec(THD *thd,
@brief
Create a clone of the specification for the given with table
- @param thd The context of the statement containing this with element
+ @param old_lex The LEX structure created for the query or CTE specification
+ where this With_element is defined
@param with_table The reference to the table defined in this element for which
the clone is created.
@@ -807,12 +1003,13 @@ bool With_element::set_unparsed_spec(THD *thd,
this element.
The clone is created when the string with the specification saved in
unparsed_spec is fed into the parser as an input string. The parsing
- this string a unit object representing the specification is build.
+ this string a unit object representing the specification is built.
A chain of all table references occurred in the specification is also
formed.
The method includes the new unit and its sub-unit into hierarchy of
the units of the main query. I also insert the constructed chain of the
table references into the chain of all table references of the main query.
+ The method resolves all references to CTE in the clone.
@note
Clones is created only for not first references to tables defined in
@@ -828,114 +1025,128 @@ bool With_element::set_unparsed_spec(THD *thd,
NULL - otherwise
*/
-st_select_lex_unit *With_element::clone_parsed_spec(THD *thd,
+st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
TABLE_LIST *with_table)
{
+ THD *thd= old_lex->thd;
LEX *lex;
- st_select_lex_unit *res= NULL;
- Query_arena backup;
- Query_arena *arena= thd->activate_stmt_arena_if_needed(&backup);
+ st_select_lex_unit *res= NULL;
if (!(lex= (LEX*) new(thd->mem_root) st_lex_local))
- {
- if (arena)
- thd->restore_active_arena(arena, &backup);
return res;
- }
- LEX *old_lex= thd->lex;
thd->lex= lex;
bool parse_status= false;
- Parser_state parser_state;
- TABLE_LIST *spec_tables;
- TABLE_LIST *spec_tables_tail;
st_select_lex *with_select;
char save_end= unparsed_spec.str[unparsed_spec.length];
((char*) &unparsed_spec.str[unparsed_spec.length])[0]= '\0';
- if (parser_state.init(thd, (char*) unparsed_spec.str, (unsigned int)unparsed_spec.length))
- goto err;
- parser_state.m_lip.stmt_prepare_mode= stmt_prepare_mode;
- parser_state.m_lip.multi_statements= false;
- parser_state.m_lip.m_digest= NULL;
lex_start(thd);
lex->clone_spec_offset= unparsed_spec_offset;
- lex->param_list= old_lex->param_list;
- lex->sphead= old_lex->sphead;
- lex->spname= old_lex->spname;
- lex->spcont= old_lex->spcont;
- lex->sp_chistics= old_lex->sp_chistics;
-
- lex->stmt_lex= old_lex;
- parse_status= parse_sql(thd, &parser_state, 0);
+ lex->with_cte_resolution= true;
+
+ /*
+ The specification of a CTE is to be parsed as a regular query.
+ At the very end of the parsing query the function
+ check_cte_dependencies_and_resolve_references() will be called.
+ It will check the dependencies between CTEs that are defined
+ within the query and will resolve CTE references in this query.
+ If a table reference is not resolved as a CTE reference within
+ this query it still can be resolved as a reference to a CTE defined
+ in the same clause as the CTE whose specification is to be parsed
+ or defined in an embedding CTE definition.
+
+ Example:
+ with
+ cte1 as ( ... ),
+ cte2 as ([WITH ...] select ... from cte1 ...)
+ select ... from cte2 as r, ..., cte2 as s ...
+
+ Here the specification of cte2 has to be cloned for the table
+ reference with alias s. The specification contains a reference to cte1
+ that is defined outside this specification. If the reference to
+ cte1 cannot be resolved within the specification of cte2 it
+ does not necessarily have to be a reference to a non-CTE table. That's
+ why the flag lex->only_cte_resolution has to be set to true
+ before parsing of the specification of cte2 invoked by this
+ function starts. Otherwise an mdl_lock would be requested for s
+ and this would not be correct.
+ */
+
+ lex->only_cte_resolution= true;
+
+ lex->stmt_lex= old_lex->stmt_lex ? old_lex->stmt_lex : old_lex;
+
+ parse_status= thd->sql_parser(old_lex, lex,
+ (char*) unparsed_spec.str,
+ (unsigned int)unparsed_spec.length,
+ stmt_prepare_mode);
+
((char*) &unparsed_spec.str[unparsed_spec.length])[0]= save_end;
- with_select= lex->first_select_lex();
+ with_select= lex->unit.first_select();
if (parse_status)
goto err;
- if (check_dependencies_in_with_clauses(lex->with_clauses_list))
- goto err;
-
- spec_tables= lex->query_tables;
- spec_tables_tail= 0;
- for (TABLE_LIST *tbl= spec_tables;
- tbl;
- tbl= tbl->next_global)
- {
- if (!tbl->derived && !tbl->schema_table &&
- thd->open_temporary_table(tbl))
- goto err;
- spec_tables_tail= tbl;
- }
- if (spec_tables)
+ /*
+ The global chain of TABLE_LIST objects created for the specification that
+ just has been parsed is added to such chain that contains the reference
+ to the CTE whose specification is parsed right after the TABLE_LIST object
+ created for the reference.
+ */
+ if (lex->query_tables)
{
- if (with_table->next_global)
+ head->tables_pos.set_start_pos(&with_table->next_global);
+ head->tables_pos.set_end_pos(lex->query_tables_last);
+ TABLE_LIST *next_tbl= with_table->next_global;
+ if (next_tbl)
{
- spec_tables_tail->next_global= with_table->next_global;
- with_table->next_global->prev_global= &spec_tables_tail->next_global;
+ *(lex->query_tables->prev_global= next_tbl->prev_global)=
+ lex->query_tables;
+ *(next_tbl->prev_global= lex->query_tables_last)= next_tbl;
}
else
{
- old_lex->query_tables_last= &spec_tables_tail->next_global;
+ *(lex->query_tables->prev_global= old_lex->query_tables_last)=
+ lex->query_tables;
+ old_lex->query_tables_last= lex->query_tables_last;
}
- spec_tables->prev_global= &with_table->next_global;
- with_table->next_global= spec_tables;
}
res= &lex->unit;
res->with_element= this;
+ /*
+ The unit of the specification that just has been parsed is included
+ as a slave of the select that contained in its from list the table
+ reference for which the unit has been created.
+ */
lex->unit.include_down(with_table->select_lex);
- lex->unit.set_slave(with_select);
+ lex->unit.set_slave(with_select);
+ lex->unit.cloned_from= spec;
old_lex->all_selects_list=
(st_select_lex*) (lex->all_selects_list->
insert_chain_before(
(st_select_lex_node **) &(old_lex->all_selects_list),
with_select));
- if (check_dependencies_in_with_clauses(lex->with_clauses_list))
- res= NULL;
+
/*
- Resolve references to CTE from the spec_tables list that has not
- been resolved yet.
+ Now all references to the CTEs defined outside of the cloned specification
+ have to be resolved. Additionally, if old_lex->only_cte_resolution == false,
+ mdl_lock requests have to be issued for the table references that have
+ not been resolved.
*/
- for (TABLE_LIST *tbl= spec_tables;
- tbl;
- tbl= tbl->next_global)
+ lex->only_cte_resolution= old_lex->only_cte_resolution;
+ if (lex->resolve_references_to_cte(lex->query_tables,
+ lex->query_tables_last))
{
- if (!tbl->with)
- tbl->with= with_select->find_table_def_in_with_clauses(tbl);
- if (tbl == spec_tables_tail)
- break;
- }
- if (check_table_access(thd, SELECT_ACL, spec_tables, FALSE, UINT_MAX, FALSE))
+ res= NULL;
goto err;
+ }
- lex->sphead= NULL; // in order not to delete lex->sphead
+ lex->sphead= NULL; // in order not to delete lex->sphead
lex_end(lex);
err:
- if (arena)
- thd->restore_active_arena(arena, &backup);
thd->lex= old_lex;
return res;
}
@@ -1105,59 +1316,6 @@ With_element *st_select_lex::find_table_def_in_with_clauses(TABLE_LIST *table)
}
-/**
- @brief
- Set the specifying unit in this reference to a with table
-
- @details
- The method assumes that the given element with_elem defines the table T
- this table reference refers to.
- If this is the first reference to T the method just sets its specification
- in the field 'derived' as the unit that yields T. Otherwise the method
- first creates a clone specification and sets rather this clone in this field.
-
- @retval
- false on success
- true on failure
-*/
-
-bool TABLE_LIST::set_as_with_table(THD *thd, With_element *with_elem)
-{
- if (table)
- {
- /*
- This table was prematurely identified as a temporary table.
- We correct it here, but it's not a nice solution in the case
- when the temporary table with this name is not used anywhere
- else in the query.
- */
- thd->mark_tmp_table_as_free_for_reuse(table);
- table= 0;
- }
- with= with_elem;
- schema_table= NULL;
- if (!with_elem->is_referenced() || with_elem->is_recursive)
- {
- derived= with_elem->spec;
- if (derived != select_lex->master_unit() &&
- !with_elem->is_recursive &&
- !is_with_table_recursive_reference())
- {
- derived->move_as_slave(select_lex);
- }
- }
- else
- {
- if(!(derived= with_elem->clone_parsed_spec(thd, this)))
- return true;
- }
- derived->first_select()->set_linkage(DERIVED_TABLE_TYPE);
- select_lex->add_statistics(derived);
- with_elem->inc_references();
- return false;
-}
-
-
bool TABLE_LIST::is_recursive_with_table()
{
return with && with->is_recursive;
@@ -1257,7 +1415,7 @@ bool st_select_lex::check_unrestricted_recursive(bool only_standard_compliant)
if (only_standard_compliant && with_elem->is_unrestricted())
{
my_error(ER_NOT_STANDARD_COMPLIANT_RECURSIVE,
- MYF(0), with_elem->query_name->str);
+ MYF(0), with_elem->get_name_str());
return true;
}
@@ -1457,7 +1615,7 @@ void With_clause::print(String *str, enum_query_type query_type)
void With_element::print(String *str, enum_query_type query_type)
{
- str->append(query_name);
+ str->append(get_name());
if (column_list.elements)
{
List_iterator_fast<LEX_CSTRING> li(column_list);
diff --git a/sql/sql_cte.h b/sql/sql_cte.h
index 80d5664..4693599 100644
--- a/sql/sql_cte.h
+++ b/sql/sql_cte.h
@@ -25,6 +25,39 @@ struct st_unit_ctxt_elem;
/**
+ @class With_element_head
+ @brief Head of the definition of a CTE table
+
+ It contains the name of the CTE and the position of the subchain
+ of table references used in the definition in the global chain of table
+ references used in the query where this definition is encountered.
+*/
+
+class With_element_head : public Sql_alloc
+{
+ /* The name of the defined CTE */
+ LEX_CSTRING *query_name;
+
+public:
+ /*
+ The structure describing the subchain of the table references used in
+ the specification of the defined CTE in the global chain of table
+ references used in the query. The structure is fully defined only
+ after the CTE definition has been parsed.
+ */
+ TABLE_CHAIN tables_pos;
+
+ With_element_head(LEX_CSTRING *name)
+ : query_name(name)
+ {
+ tables_pos.set_start_pos(0);
+ tables_pos.set_end_pos(0);
+ }
+ friend class With_element;
+};
+
+
+/**
@class With_element
@brief Definition of a CTE table
@@ -85,9 +118,22 @@ class With_element : public Sql_alloc
subqueries and specifications of other with elements).
*/
uint references;
+
+ /*
+ true <=> this With_element is referred in the query in which the
+ element is defined
+ */
+ bool referenced;
+
+ /*
+ true <=> this With_element is needed for the execution of the query
+ in which the element is defined
+ */
+ bool is_used_in_query;
+
/*
Unparsed specification of the query that specifies this element.
- It used to build clones of the specification if they are needed.
+ It's used to build clones of the specification if they are needed.
*/
LEX_CSTRING unparsed_spec;
/* Offset of the specification in the input string */
@@ -101,10 +147,12 @@ class With_element : public Sql_alloc
public:
/*
- The name of the table introduced by this with elememt. The name
- can be used in FROM lists of the queries in the scope of the element.
+ Contains the name of the defined With element and the position of
+ the subchain of the tables references used by its definition in the
+ global chain of TABLE_LIST objects created for the whole query.
*/
- LEX_CSTRING *query_name;
+ With_element_head *head;
+
/*
Optional list of column names to name the columns of the table introduced
by this with element. It is used in the case when the names are not
@@ -162,18 +210,27 @@ class With_element : public Sql_alloc
/* List of derived tables containing recursive references to this CTE */
SQL_I_List<TABLE_LIST> derived_with_rec_ref;
- With_element(LEX_CSTRING *name,
+ With_element(With_element_head *h,
List <LEX_CSTRING> list,
st_select_lex_unit *unit)
: next(NULL), base_dep_map(0), derived_dep_map(0),
sq_dep_map(0), work_dep_map(0), mutually_recursive(0),
top_level_dep_map(0), sq_rec_ref(NULL),
next_mutually_recursive(NULL), references(0),
- query_name(name), column_list(list), spec(unit),
+ referenced(false), is_used_in_query(false),
+ head(h), column_list(list), spec(unit),
is_recursive(false), rec_outer_references(0), with_anchor(false),
level(0), rec_result(NULL)
{ unit->with_element= this; }
+ LEX_CSTRING *get_name() { return head->query_name; }
+ const char *get_name_str() { return get_name()->str; }
+
+ void set_tables_start_pos(TABLE_LIST **pos)
+ { head->tables_pos.set_start_pos(pos); }
+ void set_tables_end_pos(TABLE_LIST **pos)
+ { head->tables_pos.set_end_pos(pos); }
+
bool check_dependencies_in_spec();
void check_dependencies_in_select(st_select_lex *sl, st_unit_ctxt_elem *ctxt,
@@ -200,9 +257,9 @@ class With_element : public Sql_alloc
bool set_unparsed_spec(THD *thd, const char *spec_start, const char *spec_end,
my_ptrdiff_t spec_offset);
- st_select_lex_unit *clone_parsed_spec(THD *thd, TABLE_LIST *with_table);
+ st_select_lex_unit *clone_parsed_spec(LEX *old_lex, TABLE_LIST *with_table);
- bool is_referenced() { return references != 0; }
+ bool is_referenced() { return referenced; }
void inc_references() { references++; }
@@ -260,6 +317,12 @@ class With_element : public Sql_alloc
void prepare_for_next_iteration();
friend class With_clause;
+
+ friend
+ bool LEX::resolve_references_to_cte(TABLE_LIST *tables,
+ TABLE_LIST **tables_last);
+ friend
+ bool LEX::resolve_references_to_cte_in_hanging_cte();
};
const uint max_number_of_elements_in_with_clause= sizeof(table_map)*8;
@@ -358,8 +421,10 @@ class With_clause : public Sql_alloc
friend class With_element;
friend
- bool
- check_dependencies_in_with_clauses(With_clause *with_clauses_list);
+ bool LEX::check_dependencies_in_with_clauses();
+
+ friend
+ bool LEX::resolve_references_to_cte_in_hanging_cte();
};
inline
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 386d179..c63a3b5 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -721,6 +721,8 @@ void LEX::start(THD *thd_arg)
explain_json= false;
context_analysis_only= 0;
derived_tables= 0;
+ with_cte_resolution= false;
+ only_cte_resolution= false;
safe_to_cache_query= 1;
parsing_options.reset();
empty_field_list_on_rset= 0;
@@ -2374,6 +2376,7 @@ void st_select_lex_unit::init_query()
is_view= false;
with_clause= 0;
with_element= 0;
+ cloned_from= 0;
columns_are_renamed= false;
intersect_mark= NULL;
with_wrapped_tvc= false;
@@ -8326,6 +8329,8 @@ bool LEX::check_main_unit_semantics()
if (unit.set_nest_level(0) ||
unit.check_parameters(first_select_lex()))
return TRUE;
+ if (check_cte_dependencies_and_resolve_references())
+ return TRUE;
return FALSE;
}
@@ -9021,8 +9026,8 @@ void st_select_lex::add_statistics(SELECT_LEX_UNIT *unit)
bool LEX::main_select_push(bool service)
{
DBUG_ENTER("LEX::main_select_push");
- current_select_number= 1;
- builtin_select.select_number= 1;
+ current_select_number= ++thd->lex->stmt_lex->current_select_number;
+ builtin_select.select_number= current_select_number;
builtin_select.is_service_select= service;
if (push_select(&builtin_select))
DBUG_RETURN(TRUE);
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 2389b23..eb21419 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -917,6 +917,8 @@ class st_select_lex_unit: public st_select_lex_node {
With_clause *with_clause;
/* With element where this unit is used as the specification (if any) */
With_element *with_element;
+ /* The unit used as a CTE specification from which this unit is cloned */
+ st_select_lex_unit *cloned_from;
/* thread handler */
THD *thd;
/*
@@ -1501,7 +1503,9 @@ class st_select_lex: public st_select_lex_node
}
With_element *get_with_element()
{
- return master_unit()->with_element;
+ return master_unit()->cloned_from ?
+ master_unit()->cloned_from->with_element :
+ master_unit()->with_element;
}
With_element *find_table_def_in_with_clauses(TABLE_LIST *table);
bool check_unrestricted_recursive(bool only_standard_compliant);
@@ -3324,6 +3328,20 @@ struct LEX: public Query_tables_list
*/
uint8 derived_tables;
uint8 context_analysis_only;
+ /*
+ true <=> The parsed fragment requires resolution of references to CTE
+ at the end of parsing. This name resolution process involves searching
+ for possible dependencies between CTE defined in the parsed fragment and
+ detecting possible recursive references.
+ The flag is set to true if the fragment contains CTE definitions.
+ */
+ bool with_cte_resolution;
+ /*
+ true <=> only resolution of references to CTE are required in the parsed
+ fragment, no checking of dependencies between CTE is required.
+ This flag is used only when parsing clones of CTE specifications.
+ */
+ bool only_cte_resolution;
bool local_file;
bool check_exists;
bool autocommit;
@@ -4571,6 +4589,11 @@ struct LEX: public Query_tables_list
select_stack[0]->is_service_select);
}
+ bool check_dependencies_in_with_clauses();
+ bool resolve_references_to_cte_in_hanging_cte();
+ bool check_cte_dependencies_and_resolve_references();
+ bool resolve_references_to_cte(TABLE_LIST *tables,
+ TABLE_LIST **tables_last);
};
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 3ae7c7c..82b81a2b 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -3554,9 +3554,6 @@ mysql_execute_command(THD *thd)
thd->get_stmt_da()->opt_clear_warning_info(thd->query_id);
}
- if (check_dependencies_in_with_clauses(thd->lex->with_clauses_list))
- DBUG_RETURN(1);
-
#ifdef HAVE_REPLICATION
if (unlikely(thd->slave_thread))
{
@@ -8187,7 +8184,7 @@ TABLE_LIST *st_select_lex::add_table_to_list(THD *thd,
ptr->is_fqtn= TRUE;
ptr->db= table->db;
}
- else if (lex->copy_db_to(&ptr->db))
+ else if (!lex->with_cte_resolution && lex->copy_db_to(&ptr->db))
DBUG_RETURN(0);
else
ptr->is_fqtn= FALSE;
@@ -8204,7 +8201,9 @@ TABLE_LIST *st_select_lex::add_table_to_list(THD *thd,
}
ptr->table_name= table->table;
- ptr->lock_type= lock_type;
+ ptr->lock_type= lock_type;
+ ptr->mdl_type= mdl_type;
+ ptr->table_options= table_options;
ptr->updating= MY_TEST(table_options & TL_OPTION_UPDATING);
/* TODO: remove TL_OPTION_FORCE_INDEX as it looks like it's not used */
ptr->force_index= MY_TEST(table_options & TL_OPTION_FORCE_INDEX);
@@ -8886,8 +8885,10 @@ void st_select_lex::set_lock_for_tables(thr_lock_type lock_type, bool for_update
{
tables->lock_type= lock_type;
tables->updating= for_update;
- tables->mdl_request.set_type((lock_type >= TL_WRITE_ALLOW_WRITE) ?
- MDL_SHARED_WRITE : MDL_SHARED_READ);
+
+ if (tables->db.str && tables->db.str[0])
+ tables->mdl_request.set_type((lock_type >= TL_WRITE_ALLOW_WRITE) ?
+ MDL_SHARED_WRITE : MDL_SHARED_READ);
}
DBUG_VOID_RETURN;
}
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index 8d094a0..816990f 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -2376,9 +2376,6 @@ static bool check_prepared_statement(Prepared_statement *stmt)
if (tables)
thd->get_stmt_da()->opt_clear_warning_info(thd->query_id);
- if (check_dependencies_in_with_clauses(thd->lex->with_clauses_list))
- goto error;
-
if (sql_command_flags[sql_command] & CF_HA_CLOSE)
mysql_ha_rm_tables(thd, tables);
diff --git a/sql/sql_view.cc b/sql/sql_view.cc
index 126db90..1e3c4ce 100644
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@ -431,12 +431,6 @@ bool mysql_create_view(THD *thd, TABLE_LIST *views,
lex->link_first_table_back(view, link_to_local);
view->open_type= OT_BASE_ONLY;
- if (check_dependencies_in_with_clauses(lex->with_clauses_list))
- {
- res= TRUE;
- goto err;
- }
-
WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
/*
@@ -1413,9 +1407,6 @@ bool mysql_make_view(THD *thd, TABLE_SHARE *share, TABLE_LIST *table,
TABLE_LIST *tbl;
Security_context *security_ctx= 0;
- if (check_dependencies_in_with_clauses(thd->lex->with_clauses_list))
- goto err;
-
/*
Check rights to run commands which show underlying tables.
In the optimizer trace we would not like to show trace for
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index 9b2355e..dfe239c 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -770,6 +770,7 @@ Virtual_column_info *add_virtual_expression(THD *thd, Item *expr)
class sp_head *sphead;
class sp_name *spname;
class sp_variable *spvar;
+ class With_element_head *with_element_head;
class With_clause *with_clause;
class Virtual_column_info *virtual_column;
@@ -2188,7 +2189,7 @@ END_OF_INPUT
%type <with_clause> with_clause
-%type <lex_str_ptr> query_name
+%type <with_element_head> with_element_head
%type <lex_str_list> opt_with_column_list
@@ -3336,7 +3337,11 @@ call:
if (unlikely(Lex->call_statement_start(thd, $2)))
MYSQL_YYABORT;
}
- opt_sp_cparam_list {}
+ opt_sp_cparam_list
+ {
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
+ }
;
/* CALL parameters */
@@ -4185,6 +4190,8 @@ sp_proc_stmt_return:
LEX *lex= Lex;
sp_head *sp= lex->sphead;
Lex->pop_select(); //main select
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
if (unlikely(sp->m_handler->add_instr_freturn(thd, sp, lex->spcont,
$3, lex)) ||
unlikely(sp->restore_lex(thd)))
@@ -13337,6 +13344,8 @@ do:
{
Lex->insert_list= $3;
Lex->pop_select(); //main select
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
}
;
@@ -15507,6 +15516,7 @@ with_clause:
if (unlikely(with_clause == NULL))
MYSQL_YYABORT;
lex->derived_tables|= DERIVED_WITH;
+ lex->with_cte_resolution= true;
lex->curr_with_clause= with_clause;
with_clause->add_to_list(Lex->with_clauses_list_last_next);
if (lex->current_select &&
@@ -15534,7 +15544,7 @@ with_list:
with_list_element:
- query_name
+ with_element_head
opt_with_column_list
{
$2= new List<LEX_CSTRING> (Lex->with_column_list);
@@ -15554,6 +15564,7 @@ with_list_element:
if (elem->set_unparsed_spec(thd, spec_start, $7.pos(),
spec_start - query_start))
MYSQL_YYABORT;
+ elem->set_tables_end_pos(lex->query_tables_last);
}
;
@@ -15580,12 +15591,15 @@ with_column_list:
;
-query_name:
+with_element_head:
ident
{
- $$= (LEX_CSTRING *) thd->memdup(&$1, sizeof(LEX_CSTRING));
- if (unlikely($$ == NULL))
+ LEX_CSTRING *name=
+ (LEX_CSTRING *) thd->memdup(&$1, sizeof(LEX_CSTRING));
+ $$= new (thd->mem_root) With_element_head(name);
+ if (unlikely(name == NULL || $$ == NULL))
MYSQL_YYABORT;
+ $$->tables_pos.set_start_pos(Lex->query_tables_last);
}
;
diff --git a/sql/sql_yacc_ora.yy b/sql/sql_yacc_ora.yy
index cb58c4a..76eb21a 100644
--- a/sql/sql_yacc_ora.yy
+++ b/sql/sql_yacc_ora.yy
@@ -248,6 +248,7 @@ void ORAerror(THD *thd, const char *s)
class sp_head *sphead;
class sp_name *spname;
class sp_variable *spvar;
+ class With_element_head *with_element_head;
class With_clause *with_clause;
class Virtual_column_info *virtual_column;
@@ -1689,7 +1690,7 @@ END_OF_INPUT
%type <with_clause> with_clause
-%type <lex_str_ptr> query_name
+%type <with_element_head> with_element_head
%type <lex_str_list> opt_with_column_list
@@ -3138,7 +3139,11 @@ call:
if (unlikely(Lex->call_statement_start(thd, $2)))
MYSQL_YYABORT;
}
- opt_sp_cparam_list {}
+ opt_sp_cparam_list
+ {
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
+ }
;
/* CALL parameters */
@@ -4092,6 +4097,8 @@ sp_proc_stmt_return:
LEX *lex= Lex;
sp_head *sp= lex->sphead;
Lex->pop_select(); //main select
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
if (unlikely(sp->m_handler->add_instr_freturn(thd, sp, lex->spcont,
$3, lex)) ||
unlikely(sp->restore_lex(thd)))
@@ -13436,6 +13443,8 @@ do:
{
Lex->insert_list= $3;
Lex->pop_select(); //main select
+ if (Lex->check_cte_dependencies_and_resolve_references())
+ MYSQL_YYABORT;
}
;
@@ -15624,6 +15633,7 @@ with_clause:
if (unlikely(with_clause == NULL))
MYSQL_YYABORT;
lex->derived_tables|= DERIVED_WITH;
+ lex->with_cte_resolution= true;
lex->curr_with_clause= with_clause;
with_clause->add_to_list(Lex->with_clauses_list_last_next);
if (lex->current_select &&
@@ -15651,7 +15661,7 @@ with_list:
with_list_element:
- query_name
+ with_element_head
opt_with_column_list
{
$2= new List<LEX_CSTRING> (Lex->with_column_list);
@@ -15671,6 +15681,7 @@ with_list_element:
if (elem->set_unparsed_spec(thd, spec_start, $7.pos(),
spec_start - query_start))
MYSQL_YYABORT;
+ elem->set_tables_end_pos(lex->query_tables_last);
}
;
@@ -15697,12 +15708,15 @@ with_column_list:
;
-query_name:
+with_element_head:
ident
{
- $$= (LEX_CSTRING *) thd->memdup(&$1, sizeof(LEX_CSTRING));
- if (unlikely($$ == NULL))
+ LEX_CSTRING *name=
+ (LEX_CSTRING *) thd->memdup(&$1, sizeof(LEX_CSTRING));
+ $$= new (thd->mem_root) With_element_head(name);
+ if (unlikely(name == NULL || $$ == NULL))
MYSQL_YYABORT;
+ $$->tables_pos.set_start_pos(Lex->query_tables_last);
}
;
diff --git a/sql/table.h b/sql/table.h
index 4a739ed..f510aaa 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -2139,6 +2139,29 @@ struct vers_select_conds_t
struct LEX;
class Index_hint;
+
+/*
+ @struct TABLE_CHAIN
+ @brief Subchain of global chain of table references
+
+ The structure contains a pointer to the address of the next_global
+ pointer to the first TABLE_LIST object of the subchain and the address
+ of the next_global pointer to the element right after the last
+ TABLE_LIST object of the subchain. For an empty subchain both pointers
+ have the same value.
+*/
+
+struct TABLE_CHAIN
+{
+ TABLE_CHAIN() {}
+
+ TABLE_LIST **start_pos;
+ TABLE_LIST ** end_pos;
+
+ void set_start_pos(TABLE_LIST **pos) { start_pos= pos; }
+ void set_end_pos(TABLE_LIST **pos) { end_pos= pos; }
+};
+
struct TABLE_LIST
{
TABLE_LIST() {} /* Remove gcc warning */
@@ -2473,6 +2496,20 @@ struct TABLE_LIST
/* call back function for asking handler about caching in query cache */
qc_engine_callback callback_func;
thr_lock_type lock_type;
+
+ /*
+ Two fields below are set during parsing this table reference in the cases
+ when the table reference can be potentially a reference to a CTE table.
+ In this cases the fact that the reference is a reference to a CTE or not
+ will be ascertained at the very end of parsing of the query when referencies
+ to CTE are resolved. For references to CTE and to derived tables no mdl
+ requests are needed while for other table references they are. If a request
+ is possibly postponed the info that allows to issue this request must be
+ saved in 'mdl_type' and 'table_options'.
+ */
+ enum_mdl_type mdl_type;
+ ulong table_options;
+
uint outer_join; /* Which join type */
uint shared; /* Used in multi-upd */
bool updatable; /* VIEW/TABLE can be updated now */
1
0
[Commits] 8c5f4cde660: MDEV-25630: Crash with window function in left expr of IN subquery
by psergey 22 May '21
by psergey 22 May '21
22 May '21
revision-id: 8c5f4cde660e2c93f57a6204aa2077768094baab (mariadb-10.2.31-965-g8c5f4cde660)
parent(s): 2087d47aaeadc06dd007ce9bd28984ecc8e2101e
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-22 15:53:33 +0300
message:
MDEV-25630: Crash with window function in left expr of IN subquery
* Make Item_in_optimizer::fix_fields inherit the with_window_func
attribute of the subquery's left expression (the subquery itself
cannot have window functions that are aggregated in this select)
* Make Item_cache_wrapper::Item_cache_wrapper() inherit
with_window_func attribute of the item it is caching.
---
mysql-test/r/win.result | 19 +++++++++++++++++++
mysql-test/t/win.test | 14 ++++++++++++++
sql/item.cc | 1 +
sql/item_cmpfunc.cc | 3 +++
4 files changed, 37 insertions(+)
diff --git a/mysql-test/r/win.result b/mysql-test/r/win.result
index dd74c5c77fd..8a31dcc0634 100644
--- a/mysql-test/r/win.result
+++ b/mysql-test/r/win.result
@@ -3892,5 +3892,24 @@ id rn
1 1
drop table t1;
#
+# MDEV-25630: Crash with window function in left expr of IN subquery
+#
+CREATE TABLE t1 (i int);
+INSERT INTO t1 VALUES (1),(2),(3);
+SELECT lag(i) over (ORDER BY i) IN ( SELECT 1 FROM t1 a) FROM t1;
+lag(i) over (ORDER BY i) IN ( SELECT 1 FROM t1 a)
+NULL
+1
+0
+DROP TABLE t1;
+CREATE TABLE t1 (i int);
+INSERT INTO t1 VALUES (1),(2),(3);
+SELECT sum(i) over () IN ( SELECT 1 FROM t1 a) FROM t1;
+sum(i) over () IN ( SELECT 1 FROM t1 a)
+0
+0
+0
+DROP TABLE t1;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/win.test b/mysql-test/t/win.test
index 57214ab0165..c07a81f17da 100644
--- a/mysql-test/t/win.test
+++ b/mysql-test/t/win.test
@@ -2542,6 +2542,20 @@ order by rn desc;
drop table t1;
+--echo #
+--echo # MDEV-25630: Crash with window function in left expr of IN subquery
+--echo #
+
+CREATE TABLE t1 (i int);
+INSERT INTO t1 VALUES (1),(2),(3);
+SELECT lag(i) over (ORDER BY i) IN ( SELECT 1 FROM t1 a) FROM t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (i int);
+INSERT INTO t1 VALUES (1),(2),(3);
+SELECT sum(i) over () IN ( SELECT 1 FROM t1 a) FROM t1;
+DROP TABLE t1;
+
--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/item.cc b/sql/item.cc
index be64edca9a1..d7a3659a2ce 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -8203,6 +8203,7 @@ Item_cache_wrapper::Item_cache_wrapper(THD *thd, Item *item_arg):
name= item_arg->name;
name_length= item_arg->name_length;
with_subselect= orig_item->with_subselect;
+ with_window_func= orig_item->with_window_func;
if ((expr_value= Item_cache::get_cache(thd, orig_item)))
expr_value->setup(thd, orig_item);
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
index 7b7604053e3..8a2c532f621 100644
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -1416,6 +1416,9 @@ bool Item_in_optimizer::fix_fields(THD *thd, Item **ref)
maybe_null=1;
with_subselect= 1;
with_sum_func= with_sum_func || args[1]->with_sum_func;
+ with_window_func= args[0]->with_window_func;
+ // The subquery cannot have window functions aggregated in this select
+ DBUG_ASSERT(!args[1]->with_window_func);
with_field= with_field || args[1]->with_field;
with_param= args[0]->with_param || args[1]->with_param;
used_tables_and_const_cache_join(args[1]);
1
0
[Commits] 2087d47aaea: MDEV-22462: Item_in_subselect::create_single_in_to_exists_cond(JOIN *, Item **, Item **): Assertion `false' failed.
by psergey 21 May '21
by psergey 21 May '21
21 May '21
revision-id: 2087d47aaeadc06dd007ce9bd28984ecc8e2101e (mariadb-10.2.31-964-g2087d47aaea)
parent(s): 8c8a6ed3b8e2bf6d9c0c155ba9a987c0ff27ac6c
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-21 17:46:48 +0300
message:
MDEV-22462: Item_in_subselect::create_single_in_to_exists_cond(JOIN *, Item **, Item **): Assertion `false' failed.
Item_in_subselect::create_single_in_to_exists_cond() should handle the
case where the subquery is a table-less select but it is not a result
of a UNION.
(Table-less subqueries like "(SELECT 1)" are "substituted" with their select
list, but table-less subqueries with WHERE or HAVING clause, like
"(SELECT 1 WHERE ...)" are not substituted. They are handled with regular
execution path)
---
mysql-test/r/subselect4.result | 11 ++++++++++-
mysql-test/t/subselect4.test | 13 ++++++++++++-
sql/item_subselect.cc | 44 ++++++++++++++++++++----------------------
3 files changed, 43 insertions(+), 25 deletions(-)
diff --git a/mysql-test/r/subselect4.result b/mysql-test/r/subselect4.result
index 2a691799be5..b1db309ec18 100644
--- a/mysql-test/r/subselect4.result
+++ b/mysql-test/r/subselect4.result
@@ -2721,7 +2721,15 @@ id select_type table type possible_keys key key_len ref rows Extra
SELECT a FROM t1 WHERE (a, a) IN (SELECT 1, 2) AND a = (SELECT MIN(b) FROM t2);
a
DROP TABLE t1,t2;
-# End of 10.2 tests
+#
+# MDEV-22462: Item_in_subselect::create_single_in_to_exists_cond(JOIN *, Item **, Item **): Assertion `false' failed.
+#
+select 1 from dual where 1 in (select 5 from dual where 1);
+1
+create table t1 (a int);
+insert into t1 values (1),(2),(3);
+update t1 set a = 2 where a in (select a from dual where a = a);
+drop table t1;
#
# MDEV-24925: Server crashes in Item_subselect::init_expr_cache_tracker
#
@@ -2793,3 +2801,4 @@ FROM (t1 JOIN t1 AS ref_t1 ON
(t1.i1 > (SELECT ref_t1.i1 AS c0 FROM t1 b ORDER BY -c0)));
ERROR 21000: Subquery returns more than 1 row
DROP TABLE t1;
+# End of 10.2 tests
diff --git a/mysql-test/t/subselect4.test b/mysql-test/t/subselect4.test
index 58aa7868815..bd1e20cb5d6 100644
--- a/mysql-test/t/subselect4.test
+++ b/mysql-test/t/subselect4.test
@@ -2236,7 +2236,17 @@ SELECT a FROM t1 WHERE (a, a) IN (SELECT 1, 2) AND a = (SELECT MIN(b) FROM t2);
DROP TABLE t1,t2;
---echo # End of 10.2 tests
+--echo #
+--echo # MDEV-22462: Item_in_subselect::create_single_in_to_exists_cond(JOIN *, Item **, Item **): Assertion `false' failed.
+--echo #
+
+select 1 from dual where 1 in (select 5 from dual where 1);
+
+create table t1 (a int);
+insert into t1 values (1),(2),(3);
+
+update t1 set a = 2 where a in (select a from dual where a = a);
+drop table t1;
--echo #
--echo # MDEV-24925: Server crashes in Item_subselect::init_expr_cache_tracker
@@ -2296,3 +2306,4 @@ FROM (t1 JOIN t1 AS ref_t1 ON
DROP TABLE t1;
+--echo # End of 10.2 tests
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index ed8e5e900a2..1e3c9a77a26 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -2249,7 +2249,8 @@ Item_in_subselect::create_single_in_to_exists_cond(JOIN *join,
*/
Item *item= (Item*) select_lex->item_list.head();
- if (select_lex->table_list.elements)
+ if (select_lex->table_list.elements ||
+ !(select_lex->master_unit()->is_union()))
{
Item *having= item;
Item *orig_item= item;
@@ -2297,31 +2298,28 @@ Item_in_subselect::create_single_in_to_exists_cond(JOIN *join,
}
else
{
- if (select_lex->master_unit()->is_union())
+ DBUG_ASSERT(select_lex->master_unit()->is_union());
+
+ Item *new_having=
+ func->create(thd, expr,
+ new (thd->mem_root) Item_ref_null_helper(thd,
+ &select_lex->context,
+ this,
+ &select_lex->ref_pointer_array[0],
+ (char *)"<no matter>",
+ (char *)"<result>"));
+ if (!abort_on_null && left_expr->maybe_null)
{
- Item *new_having=
- func->create(thd, expr,
- new (thd->mem_root) Item_ref_null_helper(thd,
- &select_lex->context,
- this,
- &select_lex->ref_pointer_array[0],
- (char *)"<no matter>",
- (char *)"<result>"));
- if (!abort_on_null && left_expr->maybe_null)
- {
- disable_cond_guard_for_const_null_left_expr(0);
- if (!(new_having= new (thd->mem_root) Item_func_trig_cond(thd, new_having,
- get_cond_guard(0))))
- DBUG_RETURN(true);
- }
-
- new_having->name= (char*) in_having_cond;
- if (fix_having(new_having, select_lex))
+ disable_cond_guard_for_const_null_left_expr(0);
+ if (!(new_having= new (thd->mem_root) Item_func_trig_cond(thd, new_having,
+ get_cond_guard(0))))
DBUG_RETURN(true);
- *having_item= new_having;
}
- else
- DBUG_ASSERT(false);
+
+ new_having->name= (char*) in_having_cond;
+ if (fix_having(new_having, select_lex))
+ DBUG_RETURN(true);
+ *having_item= new_having;
}
}
1
0
[Commits] f3dd96ad25e: MDEV-23937: SIGSEGV in looped best_extension_by_limited_search from greedy_search
by Sergei Petrunia 20 May '21
by Sergei Petrunia 20 May '21
20 May '21
revision-id: f3dd96ad25efe23081981f52a54a57b17a5a890e (mariadb-10.2.31-961-gf3dd96ad25e)
parent(s): 4625830b6794184a57c2702436e810be941a51c0
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-20 18:26:24 +0300
message:
MDEV-23937: SIGSEGV in looped best_extension_by_limited_search from greedy_search
Add a testcase (fixed by fix for MDEV-17783)
---
mysql-test/r/selectivity_no_engine.result | 20 ++++++++++++++++++++
mysql-test/t/selectivity_no_engine.test | 21 +++++++++++++++++++++
2 files changed, 41 insertions(+)
diff --git a/mysql-test/r/selectivity_no_engine.result b/mysql-test/r/selectivity_no_engine.result
index 7fc3c6e9909..74a52c9fed8 100644
--- a/mysql-test/r/selectivity_no_engine.result
+++ b/mysql-test/r/selectivity_no_engine.result
@@ -293,6 +293,26 @@ SELECT * FROM t1 WHERE t1.d = 0 AND t1.p = '1' AND t1.i != '-1' AND t1.n = 'some
i n d p
set optimizer_use_condition_selectivity= @tmp_mdev8779;
DROP TABLE t1;
+#
+# MDEV-23937: SIGSEGV in looped best_extension_by_limited_search from greedy_search
+# (Testcase only)
+#
+set
+@tmp_jcl= @@join_cache_level,
+@tmp_ucs= @@optimizer_use_condition_selectivity;
+set
+join_cache_level=3,
+optimizer_use_condition_selectivity=2;
+CREATE TABLE t1 AS SELECT * FROM mysql.user;
+CREATE TABLE t3 (b VARCHAR (1));
+CREATE TABLE t2 (c2 INT);
+INSERT INTO t2 VALUES (1);
+EXPLAIN
+SELECT * FROM t1 AS a NATURAL JOIN t1 AS b;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE a ALL NULL NULL NULL NULL 4
+1 SIMPLE b hash_ALL NULL #hash#$hj 827 test.a.Host,test.a.User,test.a.Password,test.a.Select_priv,test.a.Insert_priv,test.a.Update_priv,test.a.Delete_priv,test.a.Create_priv,test.a.Drop_priv,test.a.Reload_priv,test.a.Shutdown_priv,test.a.Process_priv,test.a.File_priv,test.a.Grant_priv,test.a.References_priv,test.a.Index_priv,test.a.Alter_priv,test.a.Show_db_priv,test.a.Super_priv,test.a.Create_tmp_table_priv,test.a.Lock_tables_priv,test.a.Execute_priv,test.a.Repl_slave_priv,test.a.Repl_client_priv,test.a.Create_view_priv,test.a.Show_view_priv,test.a.Create_routine_priv,test.a.Alter_routine_priv,test.a.Create_user_priv,test.a.Event_priv,test.a.Trigger_priv,test.a.Create_tablespace_priv,test.a.ssl_type,test.a.ssl_cipher,test.a.x509_issuer,test.a.x509_subject,test.a.max_questions,test.a.max_updates,test.a.max_connections,test.a.max_user_connections,test.a.plugin,test.a.authentication_string,test.a.password_expired,test.a.is_role,test.a.default_role,test.a.max_statement_time 4 Using whe
re; Using join buffer (flat, BNLH join)
+DROP TABLE t1,t2,t3;
#
# End of the test file
#
diff --git a/mysql-test/t/selectivity_no_engine.test b/mysql-test/t/selectivity_no_engine.test
index 345b7bd1e8a..b5f52dd167d 100644
--- a/mysql-test/t/selectivity_no_engine.test
+++ b/mysql-test/t/selectivity_no_engine.test
@@ -228,6 +228,27 @@ SELECT * FROM t1 WHERE t1.d = 0 AND t1.p = '1' AND t1.i != '-1' AND t1.n = 'some
set optimizer_use_condition_selectivity= @tmp_mdev8779;
DROP TABLE t1;
+--echo #
+--echo # MDEV-23937: SIGSEGV in looped best_extension_by_limited_search from greedy_search
+--echo # (Testcase only)
+--echo #
+set
+ @tmp_jcl= @@join_cache_level,
+ @tmp_ucs= @@optimizer_use_condition_selectivity;
+set
+ join_cache_level=3,
+ optimizer_use_condition_selectivity=2;
+
+CREATE TABLE t1 AS SELECT * FROM mysql.user;
+CREATE TABLE t3 (b VARCHAR (1));
+CREATE TABLE t2 (c2 INT);
+INSERT INTO t2 VALUES (1);
+
+EXPLAIN
+SELECT * FROM t1 AS a NATURAL JOIN t1 AS b;
+
+DROP TABLE t1,t2,t3;
+
--echo #
--echo # End of the test file
--echo #
1
0
[Commits] 4625830b679: MDEV-17783: AddressSanitizer: stack-buffer-overflow in table_cond_selectivity
by Sergei Petrunia 20 May '21
by Sergei Petrunia 20 May '21
20 May '21
revision-id: 4625830b6794184a57c2702436e810be941a51c0 (mariadb-10.2.31-960-g4625830b679)
parent(s): af8d4a97e29905f2806e7f26b420ce517e96c723
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-20 18:23:12 +0300
message:
MDEV-17783: AddressSanitizer: stack-buffer-overflow in table_cond_selectivity
A less-intrusive fix: don't have table_cond_selectivity() assume that
there are fewer than MAX_REF_PARTS hash-join KEYUSEs.
If there are more than that, switch to using an array. Allocate the array
on the heap: we can't allocate it on MEM_ROOT as table_cond_selectivity()
is called many times during the optimization.
---
mysql-test/r/selectivity_innodb.result | 29 +++++++++++++++++++++++++++++
mysql-test/t/selectivity_innodb.test | 34 ++++++++++++++++++++++++++++++++++
sql/sql_select.cc | 25 ++++++++++++++++++++++++-
3 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/selectivity_innodb.result b/mysql-test/r/selectivity_innodb.result
index 5452919aa6d..23e0fcc9387 100644
--- a/mysql-test/r/selectivity_innodb.result
+++ b/mysql-test/r/selectivity_innodb.result
@@ -2103,6 +2103,35 @@ drop view v1;
#
# End of 10.1 tests
#
+#
+# MDEV-17783: AddressSanitizer: stack-buffer-overflow in table_cond_selectivity
+#
+set
+@tmp_jcl=@@join_cache_level,
+@tmp_sel=@@optimizer_use_condition_selectivity;
+set
+join_cache_level=3,
+optimizer_use_condition_selectivity=2;
+CREATE TABLE t1 (
+c1 int, c2 int, c3 int, c4 int, c5 int, c6 int, c7 int, c8 int, c9 int, c10 int,
+c11 int, c12 int, c13 int, c14 int, c15 int, c16 int, c17 int, c18 int, c19 int,
+c20 int, c21 int, c22 int, c23 int, c24 int, c25 int, c26 int, c27 int, c28 int,
+c29 int, c30 int, c31 int, c32 int, c33 int, c34 int
+) ENGINE=InnoDB;
+SELECT * FROM t1
+WHERE
+(c1, c2, c3, c4, c5, c6, c7, c8, c9, c10,
+c11, c12, c13, c14, c15, c16, c17, c18, c19,
+c20, c21, c22, c23, c24, c25, c26, c27, c28, c29,
+c30, c31, c32, c33, c34) IN (SELECT * FROM t1) ;
+c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 c17 c18 c19 c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 c32 c33 c34
+set
+join_cache_level=@tmp_jcl,
+optimizer_use_condition_selectivity=@tmp_sel;
+drop table t1;
+#
+# End of 10.1 tests
+#
set use_stat_tables= @tmp_ust;
set optimizer_use_condition_selectivity= @tmp_oucs;
set @@global.histogram_size=@save_histogram_size;
diff --git a/mysql-test/t/selectivity_innodb.test b/mysql-test/t/selectivity_innodb.test
index 6c457e2848b..eb05091e43a 100644
--- a/mysql-test/t/selectivity_innodb.test
+++ b/mysql-test/t/selectivity_innodb.test
@@ -174,6 +174,40 @@ drop view v1;
--echo # End of 10.1 tests
--echo #
+--echo #
+--echo # MDEV-17783: AddressSanitizer: stack-buffer-overflow in table_cond_selectivity
+--echo #
+
+set
+ @tmp_jcl=@@join_cache_level,
+ @tmp_sel=@@optimizer_use_condition_selectivity;
+set
+ join_cache_level=3,
+ optimizer_use_condition_selectivity=2;
+
+CREATE TABLE t1 (
+ c1 int, c2 int, c3 int, c4 int, c5 int, c6 int, c7 int, c8 int, c9 int, c10 int,
+ c11 int, c12 int, c13 int, c14 int, c15 int, c16 int, c17 int, c18 int, c19 int,
+ c20 int, c21 int, c22 int, c23 int, c24 int, c25 int, c26 int, c27 int, c28 int,
+ c29 int, c30 int, c31 int, c32 int, c33 int, c34 int
+) ENGINE=InnoDB;
+
+SELECT * FROM t1
+WHERE
+ (c1, c2, c3, c4, c5, c6, c7, c8, c9, c10,
+ c11, c12, c13, c14, c15, c16, c17, c18, c19,
+ c20, c21, c22, c23, c24, c25, c26, c27, c28, c29,
+ c30, c31, c32, c33, c34) IN (SELECT * FROM t1) ;
+
+set
+ join_cache_level=@tmp_jcl,
+ optimizer_use_condition_selectivity=@tmp_sel;
+drop table t1;
+
+--echo #
+--echo # End of 10.1 tests
+--echo #
+
set use_stat_tables= @tmp_ust;
set optimizer_use_condition_selectivity= @tmp_oucs;
set @@global.histogram_size=@save_histogram_size;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index ce706209017..950d8f8fa70 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -7949,7 +7949,9 @@ static
double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
table_map rem_tables)
{
- uint16 ref_keyuse_steps[MAX_REF_PARTS - 1];
+ uint16 ref_keyuse_steps_buf[MAX_REF_PARTS - 1];
+ uint ref_keyuse_size= MAX_REF_PARTS - 1;
+ uint16 *ref_keyuse_steps= ref_keyuse_steps_buf;
Field *field;
TABLE *table= s->table;
MY_BITMAP *read_set= table->read_set;
@@ -8096,6 +8098,24 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
}
if (keyparts > 1)
{
+ if (keyparts - 2 >= ref_keyuse_size)
+ {
+ uint new_size= MY_MAX(ref_keyuse_size*2, keyparts);
+ void *new_buf;
+ if (!(new_buf= my_malloc(sizeof(uint16)*new_size, MYF(0))))
+ {
+ if (ref_keyuse_steps != ref_keyuse_steps_buf)
+ my_free(ref_keyuse_steps);
+ return 1.0; // As if no selectivity was computed
+ }
+ memcpy(new_buf, ref_keyuse_steps,
+ sizeof(uint16)*ref_keyuse_size);
+ if (ref_keyuse_steps != ref_keyuse_steps_buf)
+ my_free(ref_keyuse_steps);
+
+ ref_keyuse_steps= (uint16*)new_buf;
+ ref_keyuse_size= new_size;
+ }
ref_keyuse_steps[keyparts-2]= (uint16)(keyuse - prev_ref_keyuse);
prev_ref_keyuse= keyuse;
}
@@ -8151,6 +8171,9 @@ double table_cond_selectivity(JOIN *join, uint idx, JOIN_TAB *s,
sel*= table_multi_eq_cond_selectivity(join, idx, s, rem_tables,
keyparts, ref_keyuse_steps);
+ if (ref_keyuse_steps != ref_keyuse_steps_buf)
+ my_free(ref_keyuse_steps);
+
return sel;
}
1
0
[Commits] c89e6b34533: This adds a global my.cnf parameter, rocksdb_use_range_locking.
by psergey 17 May '21
by psergey 17 May '21
17 May '21
revision-id: c89e6b3453349616bbe5b810a0ba2cb060625889 (percona-202102-56-gc89e6b34533)
parent(s): 64a1f75b1f122633470d693bcd71b0a237d7b347
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 18:15:37 +0300
message:
This adds a global my.cnf parameter, rocksdb_use_range_locking.
When it is ON, MyRocks will:
- initialize RocksDB to use the range-locking lock manager
- for all DML operations (including SELECT .. FOR UPDATE), lock
the scanned range before reading/modifying rows.
- In range locking mode, there is no snapshot checking (that cannot be done
for ranges). Instead, MyRocks will read and modify the latest committed
data, just like InnoDB does (in the code, grep for (start|end)
_ignore_snapshot)
- Queries that do not have a finite range to scan, like
UPDATE t1 .... ORDER BY t1.key LIMIT n
will use a "Locking iterator" which will read rows, lock the range,
and re-read the rows. See class LockingIterator.
---
mysql-test/suite/rocksdb/combinations | 3 +
.../suite/rocksdb/include/have_range_locking.inc | 3 +
.../suite/rocksdb/include/not_range_locking.inc | 5 +
.../rocksdb/include/select_from_is_rowlocks.inc | 77 +++
.../suite/rocksdb/r/hermitage-range_locking.result | 652 +++++++++++++++++++++
...issue243_transactionStatus-range_locking.result | 182 ++++++
.../r/level_repeatable_read-range_locking.result | 106 ++++
.../suite/rocksdb/r/partial_index_stress.result | 22 +-
mysql-test/suite/rocksdb/r/range_locking.result | 522 +++++++++++++++++
.../r/range_locking_deadlock_tracking.result | 453 ++++++++++++++
.../rocksdb/r/range_locking_escalation.result | 27 +
.../rocksdb/r/range_locking_refresh_iter.result | 50 ++
.../suite/rocksdb/r/range_locking_rev_cf.result | 482 +++++++++++++++
.../rocksdb/r/range_locking_seek_for_update.result | 279 +++++++++
.../rocksdb/r/range_locking_shared_locks.result | 251 ++++++++
mysql-test/suite/rocksdb/r/rocksdb.result | 3 +
.../suite/rocksdb/r/rocksdb_read_free_rpl.result | 2 +-
.../rocksdb/r/rocksdb_timeout_rollback.result | 3 +
mysql-test/suite/rocksdb/r/unique_sec.result | 4 +
.../suite/rocksdb/r/unique_sec_rev_cf.result | 4 +
mysql-test/suite/rocksdb/t/db_max_index_num.test | 5 +
mysql-test/suite/rocksdb/t/deadlock_tracking.test | 7 +-
.../t/drop_cf_before_show_deadlock_info.test | 4 +
.../suite/rocksdb/t/hermitage-range_locking.test | 15 +
mysql-test/suite/rocksdb/t/hermitage.inc | 14 +-
mysql-test/suite/rocksdb/t/hermitage.test | 3 +
mysql-test/suite/rocksdb/t/i_s_deadlock.test | 4 +
mysql-test/suite/rocksdb/t/issue111.test | 4 +
.../issue243_transactionStatus-range_locking.test | 10 +
.../rocksdb/t/issue243_transactionStatus.test | 4 +
.../t/level_repeatable_read-range_locking.test | 9 +
.../suite/rocksdb/t/level_repeatable_read.test | 3 +
mysql-test/suite/rocksdb/t/lock_info.test | 3 +
mysql-test/suite/rocksdb/t/locking_issues.test | 3 +
mysql-test/suite/rocksdb/t/max_row_locks.test | 1 +
mysql-test/suite/rocksdb/t/partial_index_stress.py | 13 +-
.../suite/rocksdb/t/partial_index_stress.test | 24 +-
mysql-test/suite/rocksdb/t/range_locking.inc | 544 +++++++++++++++++
mysql-test/suite/rocksdb/t/range_locking.test | 6 +
.../rocksdb/t/range_locking_deadlock_tracking.test | 196 +++++++
.../rocksdb/t/range_locking_escalation-master.opt | 1 +
.../suite/rocksdb/t/range_locking_escalation.test | 39 ++
.../rocksdb/t/range_locking_refresh_iter.test | 70 +++
.../suite/rocksdb/t/range_locking_rev_cf.test | 12 +
.../rocksdb/t/range_locking_seek_for_update.test | 288 +++++++++
.../rocksdb/t/range_locking_shared_locks.test | 202 +++++++
mysql-test/suite/rocksdb/t/rocksdb.test | 3 +
.../suite/rocksdb/t/rocksdb_concurrent_delete.test | 4 +
mysql-test/suite/rocksdb/t/rocksdb_locks.test | 3 +
.../suite/rocksdb/t/rocksdb_read_free_rpl.test | 2 +-
.../suite/rocksdb/t/rocksdb_timeout_rollback.test | 2 +
mysql-test/suite/rocksdb/t/rpl_row_not_found.inc | 2 +
.../suite/rocksdb/t/select_lock_in_share_mode.test | 3 +
mysql-test/suite/rocksdb/t/unique_check.test | 5 +
mysql-test/suite/rocksdb/t/unique_sec.inc | 10 +-
mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test | 1 +
mysql-test/suite/rocksdb/t/varbinary_format.test | 4 +
mysql-test/suite/rocksdb/t/varchar_format.test | 2 +
.../r/rocksdb_max_lock_memory_basic.result | 7 +
.../r/rocksdb_use_range_locking_basic.result | 7 +
.../t/rocksdb_max_lock_memory_basic.test | 5 +
.../t/rocksdb_use_range_locking_basic.test | 5 +
storage/rocksdb/CMakeLists.txt | 1 +
storage/rocksdb/ha_rocksdb.cc | 632 ++++++++++++++++++--
storage/rocksdb/ha_rocksdb.h | 19 +-
storage/rocksdb/nosql_access.cc | 6 +-
storage/rocksdb/rdb_i_s.cc | 82 ++-
storage/rocksdb/rdb_iterator.cc | 33 +-
storage/rocksdb/rdb_iterator.h | 9 +
storage/rocksdb/rdb_locking_iter.cc | 108 ++++
storage/rocksdb/rdb_locking_iter.h | 190 ++++++
storage/rocksdb/rdb_utils.cc | 27 +
storage/rocksdb/rdb_utils.h | 3 +
73 files changed, 5674 insertions(+), 115 deletions(-)
diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations
index acf2f49a0c3..5e3b56932c6 100644
--- a/mysql-test/suite/rocksdb/combinations
+++ b/mysql-test/suite/rocksdb/combinations
@@ -7,3 +7,6 @@ rocksdb_write_policy=write_prepared
[write_unprepared]
rocksdb_write_policy=write_unprepared
rocksdb_write_batch_flush_threshold=1
+
+[range_locking]
+rocksdb_use_range_locking=1
diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc
new file mode 100644
index 00000000000..a8600daea77
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc
@@ -0,0 +1,3 @@
+if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) {
+ --skip Test requires range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc
new file mode 100644
index 00000000000..62c26b134bc
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc
@@ -0,0 +1,5 @@
+--let $_use_range_locking= `select @@rocksdb_use_range_locking`
+if ($_use_range_locking == 1)
+{
+ --skip Test doesn't support range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
new file mode 100644
index 00000000000..6dbd63c87c4
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
@@ -0,0 +1,77 @@
+--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+#
+# An include to print contents of I_S.ROCKSDB_LOCKS
+#
+# Implicit "parameters"
+# - Currently it prints locks on t1.PRIMARY
+#
+# Explicit "parameter" variables:
+# - $TRX1_ID - print this transaction as "TRX1"
+# - $TRX2_ID - print this transaction as "TRX2"
+#
+# - $select_from_is_rowlocks_current_trx_only
+# - $order_by_rowkey
+
+--disable_query_log
+set @cf_id=(select column_family from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx
+ where thread_id=connection_id());
+set @indexnr= (select lower(
+ concat(
+ lpad(hex(db_number),8,'0'),
+ lpad(hex(index_number),8,'0')
+ )
+ )
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+set @indexnr_next= (select lower(
+ concat(
+ lpad(hex(db_number),8,'0'),
+ lpad(hex(index_number+1),8,'0')
+ )
+ )
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+let $extra_where = where 1;
+
+if ($select_from_is_rowlocks_current_trx_only)
+{
+ let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id);
+}
+
+# If TRX1_ID is not specified, get the current transaction:
+let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id");
+if ($TRX1_ID)
+{
+ let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID");
+}
+
+if ($TRX2_ID)
+{
+ let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID");
+}
+
+if ($order_by_rowkey)
+{
+ let $extra_order_by = ORDER BY 3,2;
+}
+
+if (!$order_by_rowkey)
+{
+ --sorted_result
+}
+
+eval select
+ replace(column_family_id, @cf_id, "\$cf_id") as COLUMN_FAMILY_ID,
+ $transaction_col as TRANSACTION_ID,
+ replace(
+ replace(`key`, @indexnr, '\${indexnr}'),
+ @indexnr_next, '\${indexnr+1}'
+ ) as `KEY`,
+ mode
+from information_schema.rocksdb_locks $extra_where $extra_order_by;
+
+--enable_query_log
diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
new file mode 100644
index 00000000000..3938fa38b6c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
@@ -0,0 +1,652 @@
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 11
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 12
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+1 12
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
new file mode 100644
index 00000000000..b48535c5ee6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
@@ -0,0 +1,182 @@
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (
+id INT,
+val1 INT,
+val2 INT,
+PRIMARY KEY (id)
+) ENGINE=rocksdb;
+INSERT INTO t1 VALUES(1,1,1),(2,1,2);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 1 2
+UPDATE t1 SET val1=2 WHERE id=2;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t1 VALUES(20,1,1),(30,30,30);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 1 1
+30 30 30
+UPDATE t1 SET val1=20, val2=20 WHERE id=20;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 20 20
+30 30 30
+DELETE FROM t1 WHERE id=30;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+---SNAPSHOT, ACTIVE NUM sec
+MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION
+SHOW ENGINE rocksdb TRANSACTION STATUS
+lock count 4, write count 4
+insert count 2, update count 1, delete count 1
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+START TRANSACTION;
+INSERT INTO t1 VALUES(40,40,40);
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+COMMIT;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=1;
+DROP TABLE t1;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+UNIQUE KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
new file mode 100644
index 00000000000..0592b099238
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
@@ -0,0 +1,106 @@
+DROP TABLE IF EXISTS t1;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb;
+START TRANSACTION;
+SELECT a FROM t1;
+a
+connection con2;
+BEGIN;
+INSERT INTO t1 (a) VALUES(1);
+connection con1;
+SELECT a FROM t1;
+a
+connection con2;
+INSERT INTO t1 (a) VALUES (2);
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+100 FROM t1;
+SELECT a FROM t1;
+a
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+200 FROM t1;
+SELECT a FROM t1;
+a
+201
+202
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection default;
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb;
+INSERT INTO t2 (a) VALUES (1);
+COMMIT;
+connection con1;
+BEGIN;
+SELECT a from t2;
+a
+1
+INSERT INTO t2 (a) VALUES (1), (3);
+ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY'
+connection con2;
+INSERT INTO t2 (a) VALUES (2);
+COMMIT;
+connection con1;
+SELECT a from t2;
+a
+1
+COMMIT;
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t1;
+DROP TABLE t2;
+CREATE TABLE t3 (
+pk int unsigned PRIMARY KEY,
+count int unsigned DEFAULT '0'
+) ENGINE=ROCKSDB;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+BEGIN;
+SELECT * FROM t3;
+pk count
+connection con2;
+BEGIN;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+connection con1;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+SELECT count FROM t3;
+count
+1
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t3;
diff --git a/mysql-test/suite/rocksdb/r/partial_index_stress.result b/mysql-test/suite/rocksdb/r/partial_index_stress.result
index 88f77bcc63f..9851c0732bb 100644
--- a/mysql-test/suite/rocksdb/r/partial_index_stress.result
+++ b/mysql-test/suite/rocksdb/r/partial_index_stress.result
@@ -1,6 +1,6 @@
set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set global rocksdb_lock_wait_timeout = 100000;
-CREATE TABLE `assoc_table` (
+CREATE TABLE `assoc_table1` (
`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
@@ -11,7 +11,7 @@ CREATE TABLE `assoc_table` (
`time` int(10) unsigned NOT NULL DEFAULT '0',
`version` bigint(20) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
-KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type'
) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
Warnings:
Warning 1681 Integer display width is deprecated and will be removed in a future release.
@@ -22,8 +22,8 @@ Warning 1681 Integer display width is deprecated and will be removed in a future
Warning 1681 Integer display width is deprecated and will be removed in a future release.
Warning 1681 Integer display width is deprecated and will be removed in a future release.
Warning 1681 Integer display width is deprecated and will be removed in a future release.
-DROP TABLE assoc_table;
-CREATE TABLE `assoc_table` (
+DROP TABLE assoc_table1;
+CREATE TABLE `assoc_table2` (
`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
`raw_key` text COLLATE latin1_bin,
`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
@@ -35,7 +35,7 @@ CREATE TABLE `assoc_table` (
`version` bigint(20) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
-KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;'
) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
Warnings:
Warning 1681 Integer display width is deprecated and will be removed in a future release.
@@ -44,9 +44,9 @@ Warning 1681 Integer display width is deprecated and will be removed in a future
Warning 1681 Integer display width is deprecated and will be removed in a future release.
Warning 1681 Integer display width is deprecated and will be removed in a future release.
Warning 1681 Integer display width is deprecated and will be removed in a future release.
-Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
-DROP TABLE assoc_table;
-CREATE TABLE `assoc_table` (
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table2'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table2;
+CREATE TABLE `assoc_table3` (
`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
@@ -58,7 +58,7 @@ CREATE TABLE `assoc_table` (
`version` bigint(20) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
-KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;'
) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
Warnings:
Warning 1681 Integer display width is deprecated and will be removed in a future release.
@@ -69,6 +69,6 @@ Warning 1681 Integer display width is deprecated and will be removed in a future
Warning 1681 Integer display width is deprecated and will be removed in a future release.
Warning 1681 Integer display width is deprecated and will be removed in a future release.
Warning 1681 Integer display width is deprecated and will be removed in a future release.
-Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table'. This is deprecated and will be disallowed in a future release.
-DROP TABLE assoc_table;
+Warning 1831 Duplicate index 'id1_type2' defined on the table 'test.assoc_table3'. This is deprecated and will be disallowed in a future release.
+DROP TABLE assoc_table3;
set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result
new file mode 100644
index 00000000000..603c0b99f09
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking.result
@@ -0,0 +1,522 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'default',
+unique key(a) comment 'default'
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X
+rollback;
+begin;
+# The following will show a range lock on 2-9 and also a point lock on 10.
+# This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2
+3 2223
+4 2223
+5 2223
+6 6
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Another no-snapshot-checking test, this time for single-statement
+# transaction
+#
+create table t1 (
+pk int,
+a int,
+name varchar(16),
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+connect con1,localhost,root,,;
+connection con1;
+select get_lock('row1', 100);
+get_lock('row1', 100)
+1
+connection default;
+# The following will read the first row (1,1,'row1'), and stop.
+update t1 set a=a+100 where get_lock(name, 1000)=1;
+connection con1;
+update t1 set a=5 where pk=2;
+select release_lock('row1');
+release_lock('row1')
+1
+connection default;
+# Look at the row with pk=2:
+# 2, 105, row2 - means the UPDATE was reading current data (Correct)
+# 2, 102, row - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+pk a name
+1 101 row1
+2 105 row2
+# Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+release_lock(name)
+1
+1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, will wait until we see con1 have state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'default'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
new file mode 100644
index 00000000000..00fd1788dfd
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
@@ -0,0 +1,453 @@
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+# Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+create table t (i int primary key) engine=rocksdb;
+insert into t values (1), (2), (3);
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #1
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #2
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 10;
+Deadlock #3
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 1;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set rocksdb_deadlock_detect_depth = 2;
+# Range locking code will report deadlocks, because it doesn't honor
+# rocksdb_deadlock_detect_depth:
+Deadlock #4
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+begin;
+select * from t where i=3 for update;
+i
+3
+select * from t where i=2 for update;
+select * from t where i=3 for update;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+deadlocks
+true
+rollback;
+i
+3
+rollback;
+i
+2
+rollback;
+set global rocksdb_max_latest_deadlocks = 5;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #6
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+begin;
+update t1 set value=value+200 where id=3;
+update t1 set value=value+100 where id=3;
+update t1 set value=value+200 where id=1;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select * from t1;
+id value
+1 101
+2 102
+3 103
+4 4
+5 5
+drop table t1;
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 0;
+# Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
new file mode 100644
index 00000000000..698b0f4a02f
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
@@ -0,0 +1,27 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+show variables like 'rocksdb_max_lock_memory';
+Variable_name Value
+rocksdb_max_lock_memory 1024
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 0
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select
+A.a + B.a*10 + C.a*100 + D.a*1000,
+12345
+from t0 A, t0 B, t0 C, t0 D;
+select count(*) from t1;
+count(*)
+10000
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 128
+drop table t0,t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
new file mode 100644
index 00000000000..1067087e816
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
@@ -0,0 +1,50 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+set debug_sync='RESET';
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+set debug_sync='now WAIT_FOR con1_stopped';
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+set debug_sync='now SIGNAL con1_cont';
+select * from t1 where pk<100;
+pk a
+0 100
+1 101
+2 102
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+40 5100
+41 5100
+43 5100
+44 5100
+45 5100
+46 146
+47 147
+48 148
+49 149
+commit;
+set debug_sync='RESET';
+drop table t1, ten, one_k;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
new file mode 100644
index 00000000000..5e1c2cf98a5
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
@@ -0,0 +1,482 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1',
+unique key(a) comment ''
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X
+rollback;
+begin;
+# The following will show a range lock on 2-9 and also a point lock on 10.
+# This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+6 6
+5 2223
+4 2223
+3 2223
+2 2
+1 1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, will wait until we see con1 have state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
new file mode 100644
index 00000000000..514916eaa22
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
@@ -0,0 +1,279 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+# Make another connection to get the lock tree out of the STO-mode
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+# Now, we will just see locks on 10=0xA and 11=0xB:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+#
+# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+#
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3
+select * from t1 where pk>=500 order by pk limit 3 for update;
+pk a
+500 500
+501 501
+502 502
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X
+rollback;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+explain
+select * from t1 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3
+select * from t1 order by pk limit 3 for update;
+pk a
+0 0
+1 1
+2 2
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}8000000b X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# Concurrent tests: let one thread do SeekForUpdate and the other
+# interfere by committing modifications
+#
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+select * from t1 where pk<10;
+pk a
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+pk a
+# Test what happens when another transaction commits a row
+# right before the range we are about to lock (nothing)
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 3 for update;
+connect con1,localhost,root,,;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+11 11
+12 12
+rollback;
+delete from t1 where pk=3;
+#
+# Now, repeat the test but let the other transaction insert the row into
+# the range we are locking
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X
+rollback;
+delete from t1 where pk=8;
+#
+# Repeat the third time, this time deleting the row that SeekForUpdate saw
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+rollback;
+#
+# Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+# error. MyRocks code should now be prepared that data reads cause this
+# error
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+#
+# Backward scan test
+#
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+insert into t1 values
+(1001, 1001),
+(1005, 1005),
+(1007, 1007),
+(1010, 1010);
+begin;
+select * from t1 order by pk desc limit 2 for update;
+pk a
+1010 1010
+1007 1007
+# The below will lock from pk=1007 (0x3ef) till the end of the table:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X
+rollback;
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+pk a
+1005 1005
+1001 1001
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X
+connection con1;
+rollback;
+connection default;
+rollback;
+#
+# Backward scan test 2: error condition
+#
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+pk a
+1010 1010
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+pk a
+1007 1007
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# A test: full table scan doesn't lock gaps
+#
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 values (10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+connection con1;
+begin;
+select * from t1 for update;
+pk a
+10 10
+20 20
+30 30
+connection con2;
+insert into t1 values (5,5);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
new file mode 100644
index 00000000000..90223043c08
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
@@ -0,0 +1,251 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from t0;
+# A basic test for shared locks
+begin;
+select * from t1 where pk=3 for update;
+pk a
+3 3
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX2_ID ${indexnr}80000005 S
+rollback;
+# Now, TRX2_ID should be gone:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+connection default;
+# Get a read lock on pk=3 (where we have a write lock).
+# The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+pk a
+3 3
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+# Get a write lock on pk=5 (where we have a read lock).
+# The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+pk a
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 X
+connection default;
+rollback;
+#
+# Test if a read lock inhibits write locks
+#
+begin;
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+select * from t1 where pk=8 for update;
+pk a
+8 8
+connection con1;
+begin;
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+select * from t1 where pk between 0 and 4 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+delete from t1 where pk=2;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+# Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+# But this should still prevent us from acquiring a write lock on that value:
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection default;
+rollback;
+drop table t1;
+create table t1 (
+pk int not null primary key,
+a int not null,
+key(a)
+) engine=rocksdb;
+insert into t1
+select
+A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+pk a
+900 900
+connection default;
+begin;
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` between 2 and 5)
+select * from t1 where a between 2 and 5 lock in share mode;
+pk a
+2 2
+3 3
+4 4
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X
+$cf_id $TRX1_ID ${indexnr}80000002 S
+$cf_id $TRX1_ID ${indexnr}80000003 S
+$cf_id $TRX1_ID ${indexnr}80000004 S
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX1_ID ${indexnr}80000006 S
+$cf_id $TRX2_ID ${indexnr}80000384 X
+rollback;
+disconnect con1;
+drop table t0,t1;
+#
+# Test shared point locks and lock escalation
+#
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 0
+connect con1,localhost,root,,;
+connection con1;
+begin;
+# CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+select * from t1 where pk=2500 lock in share mode;
+pk a
+connection default;
+begin;
+# DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+# DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+pk a
+# DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+connection con1;
+# CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000 X
+$cf_id $TRX2_ID ${indexnr}80000001 X
+$cf_id $TRX2_ID ${indexnr}80000002 X
+$cf_id $TRX2_ID ${indexnr}80000003 X
+$cf_id $TRX2_ID ${indexnr}80000004 X
+$cf_id $TRX2_ID ${indexnr}80000005 X
+$cf_id $TRX2_ID ${indexnr}80000006 X
+$cf_id $TRX2_ID ${indexnr}80000007 X
+$cf_id $TRX2_ID ${indexnr}80000008 X
+$cf_id $TRX2_ID ${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0 X
+$cf_id $TRX1_ID ${indexnr}800007d1 X
+$cf_id $TRX1_ID ${indexnr}800007d2 X
+$cf_id $TRX1_ID ${indexnr}800007d3 X
+$cf_id $TRX1_ID ${indexnr}800007d4 X
+$cf_id $TRX1_ID ${indexnr}800007d5 X
+$cf_id $TRX1_ID ${indexnr}800007d6 X
+$cf_id $TRX1_ID ${indexnr}800007d7 X
+$cf_id $TRX1_ID ${indexnr}800007d8 X
+$cf_id $TRX1_ID ${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 9016
+set @save_mlm= @@rocksdb_max_lock_memory;
+# Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+variable_value
+from
+performance_schema.global_status
+where
+variable_name='rocksdb_locktree_current_lock_memory');
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+$cf_id $TRX1_ID ${indexnr}80000bb8 X
+$cf_id $TRX1_ID ${indexnr}80000bb9 X
+$cf_id $TRX1_ID ${indexnr}80000bba X
+$cf_id $TRX1_ID ${indexnr}80000bbb X
+$cf_id $TRX1_ID ${indexnr}80000bbc X
+$cf_id $TRX1_ID ${indexnr}80000bbd X
+$cf_id $TRX1_ID ${indexnr}80000bbe X
+$cf_id $TRX1_ID ${indexnr}80000bbf X
+$cf_id $TRX1_ID ${indexnr}80000bc0 X
+$cf_id $TRX1_ID ${indexnr}80000bc1 X
+connection con1;
+rollback;
+connection default;
+rollback;
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+drop table t0, t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index 5c440c88317..2f562ce6d6f 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -986,6 +986,7 @@ rocksdb_max_background_jobs 2
rocksdb_max_bottom_pri_background_compactions 0
rocksdb_max_compaction_history 64
rocksdb_max_latest_deadlocks 5
+rocksdb_max_lock_memory 1073741824
rocksdb_max_log_file_size 0
rocksdb_max_manifest_file_size 1073741824
rocksdb_max_manual_compactions 10
@@ -1054,6 +1055,8 @@ rocksdb_use_default_sk_cf OFF
rocksdb_use_direct_io_for_flush_and_compaction OFF
rocksdb_use_direct_reads OFF
rocksdb_use_fsync OFF
+rocksdb_use_range_lock_manager_as_point OFF
+rocksdb_use_range_locking OFF
rocksdb_validate_tables 1
rocksdb_verify_row_debug_checksums OFF
rocksdb_wal_bytes_per_sync 0
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
index 46e846afda0..b418bfa9336 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
@@ -72,7 +72,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
include/sync_slave_sql_with_master.inc
[connection slave]
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
read_free
false
select * from t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
index 1e253a9974b..08a0a2f5942 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
@@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF
begin work;
insert into t1 values (9);
insert into t1 values (10);
+# Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
+a
update t1 set a = a + 1 where a = 2;
begin work;
insert into t1 values (11);
diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result
index 1da78db24b1..d4ef2e0ff2e 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
index d6d06f6ece5..0e71e6481aa 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/db_max_index_num.test b/mysql-test/suite/rocksdb/t/db_max_index_num.test
index 1adf7d2c644..10b5a7a4033 100644
--- a/mysql-test/suite/rocksdb/t/db_max_index_num.test
+++ b/mysql-test/suite/rocksdb/t/db_max_index_num.test
@@ -2,6 +2,11 @@
--source include/have_debug_sync.inc
--source include/have_rocksdb.inc
+# Does SELECT...FOR UPDATE; SELECT ... FROM I_S.ROCKSDB_LOCKS which produces
+# different output in range locking mode.
+--source suite/rocksdb/include/not_range_locking.inc
+
+
--echo ## Test creating dropping database of same name while ##
--echo ## keeping drop index thread suspended ##
diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
index 96ececd4b36..7d751b30097 100644
--- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test
+++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
@@ -1,3 +1,9 @@
+# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE;
+# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks.
+# A part of this test that works with range locking is in
+# range_locking_deadlock_tracking.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
@@ -137,7 +143,6 @@ rollback;
connection default;
--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
show engine rocksdb transaction status;
-
echo Deadlock #6;
connection con1;
create table t1 (id int primary key, value int) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
index f7eb8151f40..05ae30f2ddd 100644
--- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
+++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
@@ -3,6 +3,10 @@
--source include/have_rocksdb.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because range locking
+# does not provide info in rocksdb_deadlock.
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_query_log
call mtr.add_suppression("Column family '[a-z_]+' not found");
--enable_query_log
diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
new file mode 100644
index 00000000000..55203af9cf8
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
@@ -0,0 +1,15 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+
+# Hermitage is an attempt to test transaction isolation levels.
+# https://github.com/ept/hermitage
+
+let $trx_isolation = READ COMMITTED;
+--source hermitage.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source hermitage.inc
diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc
index 90f7d482533..83815a70459 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.inc
+++ b/mysql-test/suite/rocksdb/t/hermitage.inc
@@ -108,6 +108,8 @@ select * from test where value % 3 = 0;
commit;
--source hermitage_init.inc
+let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`;
+let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`;
connection con1;
update test set value = value + 10;
connection con2;
@@ -117,13 +119,13 @@ send delete from test where value = 20;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 2 => 30
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -147,13 +149,13 @@ send update test set value = 12 where id = 1;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 1 => 12
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -200,12 +202,12 @@ update test set value = 12 where id = 1;
update test set value = 18 where id = 2;
commit;
connection con1;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
delete from test where value = 20; # doesn't delete anything
select * from test where id = 2; # shows 2 => 18
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
delete from test where value = 20;
diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test
index e4138e8d89f..51f3f286a0e 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.test
+++ b/mysql-test/suite/rocksdb/t/hermitage.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See hermitage-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
# Hermitage is an attempt to test transaction isolation levels.
# https://github.com/ept/hermitage
diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
index e0479d6a337..82fa9fc6bbd 100644
--- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test
+++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that
+# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test
index 671ea4708d6..3657e977a70 100644
--- a/mysql-test/suite/rocksdb/t/issue111.test
+++ b/mysql-test/suite/rocksdb/t/issue111.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# The testcase here assumes key tracking is present
+# (and range locking uses InnoDB-like approach, "DMLs use Read Committed")
+--source suite/rocksdb/include/not_range_locking.inc
+
connect (con2,localhost,root,,);
connection default;
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
new file mode 100644
index 00000000000..465fb9099da
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
@@ -0,0 +1,10 @@
+#
+# A range-locking variant of issue243_transactionStatus.test
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $forced_range_locking=1;
+--source issue243_transactionStatus.test
+
+
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
index 1e2f0b41226..5c1948ebe81 100644
--- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+if (!$forced_range_locking) {
+--source suite/rocksdb/include/not_range_locking.inc
+}
+
--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
new file mode 100644
index 00000000000..6c42c7be12c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
@@ -0,0 +1,9 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source transaction_isolation.inc
+
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
index cf29073f69e..b81dcf31ab1 100644
--- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See level_repeatable_read-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
let $trx_isolation = REPEATABLE READ;
--source transaction_isolation.inc
diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test
index 1b624cf38c0..a277c1b8d8d 100644
--- a/mysql-test/suite/rocksdb/t/lock_info.test
+++ b/mysql-test/suite/rocksdb/t/lock_info.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test)
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_warnings
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test
index 035046ae368..95a6676f78a 100644
--- a/mysql-test/suite/rocksdb/t/locking_issues.test
+++ b/mysql-test/suite/rocksdb/t/locking_issues.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# A lot of tests below assume point locking, not range.
+--source suite/rocksdb/include/not_range_locking.inc
+
let $isolation_level = REPEATABLE READ;
--source suite/rocksdb/include/locking_issues_case1_1.inc
diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test
index 4b07f3d8492..d4b2604f1e3 100644
--- a/mysql-test/suite/rocksdb/t/max_row_locks.test
+++ b/mysql-test/suite/rocksdb/t/max_row_locks.test
@@ -1,4 +1,5 @@
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2));
--disable_query_log
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.py b/mysql-test/suite/rocksdb/t/partial_index_stress.py
index 07220d88705..0419637719b 100644
--- a/mysql-test/suite/rocksdb/t/partial_index_stress.py
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.py
@@ -26,7 +26,7 @@ def get_query(table_name, binary_id1):
return """INSERT INTO %s VALUES (%d, 0, %d, 0, %d, 1, 'abc', 100, 1) ON DUPLICATE KEY UPDATE time=time+10, version=version+1""" % (table_name, id1, id2, assoc_type)
class Worker(threading.Thread):
- def __init__(self, con, table_name, num_iters, check, event):
+ def __init__(self, con, table_name, num_iters, check, event, binary_id1):
threading.Thread.__init__(self)
self.con = con
self.table_name = table_name
@@ -47,8 +47,8 @@ class Worker(threading.Thread):
def run_write(self):
cur = self.con.cursor()
- cur.execute("select data_type from information_schema.columns where table_schema = database() and table_name = '%s' and column_name = 'id1'" % self.table_name);
- binary_id1 = cur.fetchone()[0] == "binary"
+# cur.execute("select data_type from information_schema.columns where table_schema = database() and table_name = '%s' and column_name = 'id1'" % self.table_name);
+# binary_id1 = cur.fetchone()[0] == "binary"
cur.execute("SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED")
for x in range(self.num_iters):
try:
@@ -74,7 +74,7 @@ class Worker(threading.Thread):
raise e
if __name__ == '__main__':
- if len(sys.argv) != 8:
+ if len(sys.argv) != 9:
print("Usage: partial_index_stress.py user host port db_name " \
"table_name num_iters num_threads")
sys.exit(1)
@@ -86,6 +86,7 @@ if __name__ == '__main__':
table_name = sys.argv[5]
num_iters = int(sys.argv[6])
num_workers = int(sys.argv[7])
+ binary_id1 = int(sys.argv[8])
done_event = threading.Event();
@@ -94,12 +95,12 @@ if __name__ == '__main__':
for i in range(num_workers):
w = Worker(
MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
- num_iters, False, None)
+ num_iters, False, None, binary_id1)
workers.append(w)
checker = Worker(
MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
- num_iters, True, done_event)
+ num_iters, True, done_event, binary_id1)
for w in workers:
w.join()
diff --git a/mysql-test/suite/rocksdb/t/partial_index_stress.test b/mysql-test/suite/rocksdb/t/partial_index_stress.test
index c78e8cb980e..1608979b661 100644
--- a/mysql-test/suite/rocksdb/t/partial_index_stress.test
+++ b/mysql-test/suite/rocksdb/t/partial_index_stress.test
@@ -5,7 +5,7 @@
set @save_rocksdb_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set global rocksdb_lock_wait_timeout = 100000;
-CREATE TABLE `assoc_table` (
+CREATE TABLE `assoc_table1` (
`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
@@ -16,14 +16,14 @@ CREATE TABLE `assoc_table` (
`time` int(10) unsigned NOT NULL DEFAULT '0',
`version` bigint(20) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
- KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+ KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type'
) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
-exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table1 1000 10 0;
-DROP TABLE assoc_table;
+DROP TABLE assoc_table1;
-CREATE TABLE `assoc_table` (
+CREATE TABLE `assoc_table2` (
`id1` binary(16) NOT NULL DEFAULT '\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0',
`raw_key` text COLLATE latin1_bin,
`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
@@ -35,14 +35,14 @@ CREATE TABLE `assoc_table` (
`version` bigint(20) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
KEY `id1_type` (`assoc_type`,`id1`,`visibility`,`time`,`id2`,`version`,`data`) COMMENT 'rev:cf_assoc_id1_type',
- KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`) COMMENT 'cfname=rev:cf_assoc_id1_type;'
) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
-exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table2 1000 10 1;
-DROP TABLE assoc_table;
+DROP TABLE assoc_table2;
-CREATE TABLE `assoc_table` (
+CREATE TABLE `assoc_table3` (
`id1` bigint(20) unsigned NOT NULL DEFAULT '0',
`id1_type` int(10) unsigned NOT NULL DEFAULT '0',
`id2` bigint(20) unsigned NOT NULL DEFAULT '0',
@@ -54,11 +54,11 @@ CREATE TABLE `assoc_table` (
`version` bigint(20) unsigned NOT NULL DEFAULT '0',
PRIMARY KEY (`assoc_type`,`id1`,`id2`) COMMENT 'cf_assoc',
KEY `id1_type` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'rev:cf_assoc_id1_type',
- KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;partial_group_keyparts=2;partial_group_threshold=10'
+ KEY `id1_type2` (`assoc_type`, `id1`, `visibility`,`time`,`id2`,`version`,`data`(255)) COMMENT 'cfname=rev:cf_assoc_id1_type;'
) ENGINE=ROCKSDB DEFAULT CHARSET=latin1 COLLATE=latin1_bin ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=4;
-exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table 1000 10;
+exec /usr/bin/python3 suite/rocksdb/t/partial_index_stress.py root 127.0.0.1 $MASTER_MYPORT test assoc_table3 1000 10 0;
-DROP TABLE assoc_table;
+DROP TABLE assoc_table3;
set global rocksdb_lock_wait_timeout = @save_rocksdb_lock_wait_timeout;
diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc
new file mode 100644
index 00000000000..4f1db4399cb
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.inc
@@ -0,0 +1,544 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+
+eval create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+--echo ### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (15,15);
+
+connection con1;
+rollback;
+
+--echo ## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+
+connection con2;
+begin;
+# This must not block
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test that locks are not released when a statement inside
+--echo ## a transaction is rolled back
+eval
+create table t2 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf',
+ unique key(a) comment '$sk_cf'
+) engine=rocksdb;
+
+insert into t2 values (1,1),(2,2);
+
+begin;
+insert into t2 values (3,3);
+--error ER_DUP_ENTRY
+insert into t2 values (10,2);
+
+connection con2;
+begin;
+# This must time out:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t2 where pk=3 for update;
+
+rollback;
+connection con1;
+rollback;
+drop table t2;
+
+# cleanup
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+
+--echo #
+--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode
+--echo #
+
+connect (con1,localhost,root,,);
+connection con1;
+eval create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+begin;
+select * from t1 where pk=10 for update;
+
+#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+delete from t1 where pk between 25 and 40;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+begin;
+--echo # The following will show a range lock on 2-9 and also a point lock on 10.
+--echo # This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+
+--echo #
+--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ primary key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+select * from t1 where kp1=2 for update;
+
+connection default;
+--echo # The lock on kp1=2 should inhibit the following INSERT:
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values ( 2,5,9999);
+rollback;
+
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+
+--echo #
+--echo # Test that locks on ranges on non-unique secondary keys inhibit
+--echo # modifications of the contents of these ranges
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where kp1=2 for update;
+select * from t1 where kp1=2 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (2, 9, 9999);
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where kp1=2 and kp2=5;
+
+# Update that "moves a row away" from the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=333 where kp1=2 and kp2=3;
+
+# Update that "moves a row into" the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=2 where kp1=1 and kp2=8;
+
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # Transaction isolation test
+--echo #
+
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+
+--echo # Examine the result:
+--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+--echo # (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same test as above, but check the range scan
+--echo #
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+
+--echo # Examine the result:
+--echo # pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same as above, but test SELECT FOR UPDATE.
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+
+--echo # TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+select * from t1 where pk=2 for update;
+select * from t1 where pk=2;
+
+commit;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+if (!$PK_USES_REVERSE_CF) {
+--echo #
+--echo # Another no-snapshot-checking test, this time for single-statement
+--echo # transaction
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ name varchar(16),
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+
+connect (con1,localhost,root,,);
+connection con1;
+select get_lock('row1', 100);
+
+connection default;
+
+--echo # The following will read the first row (1,1,'row1'), and stop.
+
+send update t1 set a=a+100 where get_lock(name, 1000)=1;
+
+# Wait till the default connection has stopped:
+connection con1;
+
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock"
+ AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1";
+--source include/wait_condition.inc
+
+# Update the second row
+update t1 set a=5 where pk=2;
+
+select release_lock('row1');
+
+connection default;
+reap;
+
+--echo # Look at the row with pk=2:
+--echo # 2, 105, row2 - means the UPDATE was reading current data (Correct)
+--echo # 2, 102, row - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+
+--echo # Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+}
+
+--echo #
+--echo # Check that I_S.processlist.state is set correctly now.
+--echo #
+eval
+create table t1(
+ pk int,
+ a int,
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+begin;
+select * from t1 where pk=2 for update;
+
+--connect (con1,localhost,root,,)
+begin;
+set rocksdb_lock_wait_timeout=300;
+send select * from t1 where pk=2 for update;
+
+connection default;
+--echo # Now, will wait until we see con1 have state="Waiting for row lock"
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock"
+ AND INFO = "select * from t1 where pk=2 for update";
+--source include/wait_condition.inc
+
+rollback;
+connection con1;
+--reap
+rollback;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST
+--echo #
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+eval
+create table t1 (
+ pk1 int,
+ pk2 int,
+ a int,
+ primary key(pk1, pk2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a, B.a, A.a*10+B.a
+from
+ t0 A, t0 B;
+
+
+# Get a lock in another connection so that the primary transaction is not using
+# STO optimization, and its locks can be seen in I_S.rocksdb_locks
+--connect (con1,localhost,root,,)
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+
+connection default;
+begin;
+--echo # Should use ref access w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+--echo #
+
+begin;
+--echo # Should use range access with 2 keyparts and w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+
+connection default;
+drop table t0, t1;
+
+--echo #
+--echo # A bug: range locking was not used when scan started at table start or end
+--echo #
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+
+create table t1 (
+ pk int not null,
+ a int,
+ primary key(pk)
+) engine=rocksdb;
+
+insert into t1 select a*2,a*2 from t10;
+
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+
+connection default;
+drop table t0,t10,t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test
new file mode 100644
index 00000000000..5c599238a0a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.test
@@ -0,0 +1,6 @@
+
+--let pk_cf=default
+--let sk_cf=default
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
new file mode 100644
index 00000000000..57fb4da340a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
@@ -0,0 +1,196 @@
+--source suite/rocksdb/include/have_range_locking.inc
+
+#
+# This is deadlock_tracking.test, variant for running with Range Locking:
+# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests
+# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print
+# deadlock information.
+#
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+--echo # Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+let $engine = rocksdb;
+
+--source include/count_sessions.inc
+connect (con1,localhost,root,,);
+let $con1= `SELECT CONNECTION_ID()`;
+
+connect (con2,localhost,root,,);
+let $con2= `SELECT CONNECTION_ID()`;
+
+connect (con3,localhost,root,,);
+let $con3= `SELECT CONNECTION_ID()`;
+
+connection default;
+eval create table t (i int primary key) engine=$engine;
+insert into t values (1), (2), (3);
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #1;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #2;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 10;
+
+echo Deadlock #3;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 1;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+connection con3;
+set rocksdb_deadlock_detect_depth = 2;
+
+--echo # Range locking code will report deadlocks, because it doesn't honor
+--echo # rocksdb_deadlock_detect_depth:
+echo Deadlock #4;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 for update;
+
+connection con1;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+send select * from t where i=3 for update;
+
+connection con3;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con2 and waiting_key != "";
+--source include/wait_condition.inc
+
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 for update;
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+rollback;
+
+connection con2;
+reap;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection default;
+set global rocksdb_max_latest_deadlocks = 5;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+--disable_testcase BUG#0000
+echo Deadlock #5;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 lock in share mode;
+
+connection con1;
+select * from t where i=100 for update;
+select * from t where i=101 for update;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+select * from t where i=3 lock in share mode;
+select * from t where i=200 for update;
+select * from t where i=201 for update;
+
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 lock in share mode;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection con3;
+rollback;
+
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--enable_testcase
+echo Deadlock #6;
+connection con1;
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+
+connection con2;
+begin;
+update t1 set value=value+200 where id=3;
+
+connection con1;
+send update t1 set value=value+100 where id=3;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+--error ER_LOCK_DEADLOCK
+update t1 set value=value+200 where id=1;
+
+# con2 tx is automatically rolled back
+connection con1;
+reap;
+select * from t1;
+drop table t1;
+
+connection default;
+
+disconnect con1;
+disconnect con2;
+disconnect con3;
+
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 0;
+--echo # Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{24}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--source include/wait_until_count_sessions.inc
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
new file mode 100644
index 00000000000..d0087e2a77b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
@@ -0,0 +1 @@
+--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
new file mode 100644
index 00000000000..5a6e9fa6616
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
@@ -0,0 +1,39 @@
+#
+# Range Locking - Lock Escalation Tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+show variables like 'rocksdb_max_lock_memory';
+show status like 'rocksdb_locktree_escalation_count';
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+#begin;
+#insert into t1 values (1000111,100011);
+#connect (con1,localhost,root,,);
+#connection con1;
+
+insert into t1
+select
+ A.a + B.a*10 + C.a*100 + D.a*1000,
+ 12345
+from t0 A, t0 B, t0 C, t0 D;
+
+select count(*) from t1;
+
+#connection default;
+#disconnect con1;
+show status like 'rocksdb_locktree_escalation_count';
+
+drop table t0,t1;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
new file mode 100644
index 00000000000..9bbb1b9b392
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
@@ -0,0 +1,70 @@
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--source include/have_debug_sync.inc
+
+select @@rocksdb_use_range_locking;
+
+--disable_warnings
+set debug_sync='RESET';
+--enable_warnings
+#
+# Testcase for iterator snapshot refresh
+#
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+# Ok, now the table has these PK ranges:
+# 0..9 40..49 100...1000
+# and all rows have pk=a
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+send
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+
+# The query is now stuck at the start of the second range.
+
+
+## con2>
+connection con2;
+set debug_sync='now WAIT_FOR con1_stopped';
+
+# Make some changes to check if the iterator is reading current data or
+# snapshot
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+set debug_sync='now SIGNAL con1_cont';
+
+connection con1;
+#--error ER_GET_ERRMSG
+reap;
+select * from t1 where pk<100;
+
+commit;
+disconnect con1;
+disconnect con2;
+connection default;
+set debug_sync='RESET';
+
+drop table t1, ten, one_k;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
new file mode 100644
index 00000000000..8b993764235
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
@@ -0,0 +1,12 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--let pk_cf=rev:cf1
+--let PK_USES_REVERSE_CF=1
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
new file mode 100644
index 00000000000..c1f0fe312e0
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
@@ -0,0 +1,288 @@
+#
+# Range Locking : tests for SeekForUpdate feature
+#
+
+--source include/have_rocksdb.inc
+--source include/have_debug_sync.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+show variables like 'rocksdb_use_range_locking';
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+--echo # Make another connection to get the lock tree out of the STO-mode
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Now, we will just see locks on 10=0xA and 11=0xB:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo #
+--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+--echo #
+--replace_column 10 #
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+select * from t1 where pk>=500 order by pk limit 3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+
+begin;
+select * from t1 where pk=11 for update;
+explain
+select * from t1 order by pk limit 3 for update;
+select * from t1 order by pk limit 3 for update;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+
+
+--echo #
+--echo # Concurrent tests: let one thread do SeekForUpdate and the other
+--echo # interfere by committing modifications
+--echo #
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+select * from t1 where pk<10;
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+
+
+--echo # Test what happens when another transaction commits a row
+--echo # right before the range we are about to lock (nothing)
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send select * from t1 where pk >=5 order by pk limit 3 for update;
+
+connect (con1,localhost,root,,);
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+rollback;
+
+delete from t1 where pk=3;
+
+--echo #
+--echo # Now, repeat the test but let the other transaction insert the row into
+--echo # the range we are locking
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+delete from t1 where pk=8;
+
+--echo #
+--echo # Repeat the third time, this time deleting the row that SeekForUpdate saw
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+rollback;
+
+--echo #
+--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+--echo # error. MyRocks code should now be prepared that data reads cause this
+--echo # error
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+--error ER_LOCK_WAIT_TIMEOUT
+reap;
+
+rollback;
+
+connection con1;
+rollback;
+connection default;
+
+--echo #
+--echo # Backward scan test
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+insert into t1 values
+ (1001, 1001),
+ (1005, 1005),
+ (1007, 1007),
+ (1010, 1010);
+
+begin;
+select * from t1 order by pk desc limit 2 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+
+--echo # The below will lock from pk=1007 (0x3ef) till the end of the table:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+
+connection default;
+rollback;
+
+--echo #
+--echo # Backward scan test 2: error condition
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # A test: full table scan doesn't lock gaps
+--echo #
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 values (10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+
+select * from t1 for update;
+
+connection con2;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (5,5);
+
+connection con1;
+rollback;
+
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
new file mode 100644
index 00000000000..c6e4e457897
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
@@ -0,0 +1,202 @@
+#
+# Test for shared lock support for range locking
+#
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+select @@rocksdb_use_range_locking;
+
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+
+insert into t1 select a,a from t0;
+
+--echo # A basic test for shared locks
+
+begin;
+select * from t1 where pk=3 for update;
+select * from t1 where pk=5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+--echo # Now, TRX2_ID should be gone:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+
+--echo # Get a read lock on pk=3 (where we have a write lock).
+--echo # The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo # Get a write lock on pk=5 (where we have a read lock).
+--echo # The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+rollback;
+
+--echo #
+--echo # Test if a read lock inhibits write locks
+--echo #
+
+begin;
+select * from t1 where pk=2 lock in share mode;
+select * from t1 where pk=8 for update;
+
+connection con1;
+begin;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 0 and 4 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where pk=2;
+
+--echo # Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+
+--echo # But this should still prevent us from acquiring a write lock on that value:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+rollback;
+connection default;
+rollback;
+
+drop table t1;
+create table t1 (
+ pk int not null primary key,
+ a int not null,
+ key(a)
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+ t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+select * from t1 where a between 2 and 5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+disconnect con1;
+
+drop table t0,t1;
+
+--echo #
+--echo # Test shared point locks and lock escalation
+--echo #
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+
+show status like 'rocksdb_locktree_current_lock_memory';
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+--echo # CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+select * from t1 where pk=2500 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--echo # DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+--echo # DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+
+--echo # DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+
+connection con1;
+--echo # CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+
+let $order_by_rowkey=1;
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+set @save_mlm= @@rocksdb_max_lock_memory;
+
+--echo # Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+ variable_value
+ from
+ performance_schema.global_status
+ where
+ variable_name='rocksdb_locktree_current_lock_memory');
+
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+connection default;
+rollback;
+
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+
+drop table t0, t1;
+
+
diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test
index c063d8c7ccb..0544214b8c9 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb.test
@@ -2,6 +2,9 @@
--source suite/rocksdb/include/have_write_committed.inc
--source include/count_sessions.inc
+# Does SHOW WARNINGS and SHOW STATUS which change in Range Locking mode
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# RocksDB Storage Engine tests
#
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
index 47818bfdbe1..3aa51b7be80 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
@@ -27,6 +27,10 @@
# In all cases, RR gets snapshot conflict errors if non-first rows get
# deleted by another transaction after scanning.
+# The tests do not work with range locking because it locks the rows it is
+# about to read, first.
+--source suite/rocksdb/include/not_range_locking.inc
+
--source include/have_rocksdb.inc
--source include/have_debug_sync.inc
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
index ff092773737..8b3975723df 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
@@ -5,6 +5,9 @@
#
--source include/have_debug.inc
+# Range locking requests locks before doing snapshot checking.
+--source suite/rocksdb/include/not_range_locking.inc
+
--enable_connect_log
create table t1 (pk int not null primary key) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
index 92981e19a43..cc5c1a90436 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
@@ -62,7 +62,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
--source include/sync_slave_sql_with_master.inc
--source include/rpl_connection_slave.inc
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
select * from t1;
--echo
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
index 694594efd70..1273a2b6f70 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
@@ -46,6 +46,8 @@ begin work;
insert into t1 values (9);
insert into t1 values (10);
+--echo # Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
update t1 set a = a + 1 where a = 2;
connection con1;
diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
index 5a78979f048..63b72ce5c5a 100644
--- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
+++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
@@ -3,6 +3,8 @@
--source include/have_debug.inc
--source include/have_debug_sync.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
connection master;
--disable_warnings
drop table if exists t1;
diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
index 23ce6d45234..cf9d53ff88a 100644
--- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
+++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range locking only supports exclusive locks currently.
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# SELECT .. LOCK IN SHARE MODE
#
diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test
index 47ca74d0e5e..9814d89448d 100644
--- a/mysql-test/suite/rocksdb/t/unique_check.test
+++ b/mysql-test/suite/rocksdb/t/unique_check.test
@@ -2,6 +2,11 @@
--source include/have_debug_sync.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because lock tree waits do not set
+# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for
+# details.
+--source suite/rocksdb/include/not_range_locking.inc
+
# For GitHub issue#167 -- Unique key check doesn't work
connect (con1, localhost, root,,);
diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc
index ce0bb1e39a9..508816e6ace 100644
--- a/mysql-test/suite/rocksdb/t/unique_sec.inc
+++ b/mysql-test/suite/rocksdb/t/unique_sec.inc
@@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38;
UPDATE t1 SET id5=34 WHERE id1=38;
--echo # NULL values are unique
+--echo # (Note: the following UPDATE reads through the whole table without
+--echo # finding anything to update. With point locking, this is fine,
+--echo # but with range locking it will time out while waiting on a row lock
+--echo # that the other transaction is holding)
+if (`select @@rocksdb_use_range_locking=0`) {
UPDATE t1 SET id5=NULL WHERE value1 > 37;
-
+}
+if (`select @@rocksdb_use_range_locking=1`) {
+-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37;
+}
connection con1;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test b/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test
index d6a8e3d5a1b..8d2e64e5890 100644
--- a/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test
+++ b/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test
@@ -3,3 +3,4 @@
let ddl= $MYSQL_TMP_DIR/unique_sec_rev_cf.sql;
--exec sed s/##CF##/" COMMENT 'rev:cf'"/g suite/rocksdb/t/unique_sec.inc > $ddl
--source $ddl
+--remove_file $ddl
diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test
index c5ada5b158d..446cc69f036 100644
--- a/mysql-test/suite/rocksdb/t/varbinary_format.test
+++ b/mysql-test/suite/rocksdb/t/varbinary_format.test
@@ -1,6 +1,10 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+# The test uses SELECT .. FOR UPDATE and examines which locks it acquires
+# Range Locking will use different locks from point locking
+--source suite/rocksdb/include/not_range_locking.inc
+
# Create a table with a varbinary key with the current format and validate
# that it sorts correctly
CREATE TABLE t1(
diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test
index c2814d65c20..bf3149ca754 100644
--- a/mysql-test/suite/rocksdb/t/varchar_format.test
+++ b/mysql-test/suite/rocksdb/t/varchar_format.test
@@ -1,6 +1,8 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
####################
# Create a table with a varchar key with the current format and validate
# that it sorts correctly
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt
index 135a6af62df..5c25f89b6e9 100644
--- a/storage/rocksdb/CMakeLists.txt
+++ b/storage/rocksdb/CMakeLists.txt
@@ -126,6 +126,7 @@ SET(ROCKSDB_SOURCES
logger.h
rdb_datadic.cc rdb_datadic.h
rdb_iterator.cc rdb_iterator.h
+ rdb_locking_iter.cc rdb_locking_iter.h
rdb_cf_options.cc rdb_cf_options.h
rdb_cf_manager.cc rdb_cf_manager.h
rdb_converter.cc rdb_converter.h
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index f59b456675d..0a7ed94beb7 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -105,6 +105,9 @@
#include "./ObjectFactory.h"
#endif
+#include "./rdb_locking_iter.h"
+
+
// Internal MySQL APIs not exposed in any header.
extern "C" {
/**
@@ -635,6 +638,10 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var,
static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var,
void *var_ptr, const void *save);
+static void rocksdb_set_max_lock_memory(THD *thd,
+ struct SYS_VAR *var,
+ void *var_ptr, const void *save);
+
static void rdb_set_collation_exception_list(const char *exception_list);
static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var,
void *var_ptr,
@@ -789,6 +796,16 @@ static unsigned long long // NOLINT(runtime/int)
static bool rocksdb_skip_locks_if_skip_unique_check = false;
static bool rocksdb_alter_column_default_inplace = false;
+// Range Locking: how much memory can be used for the lock data structure
+// (which holds the locks acquired by all clients).
+static ulonglong rocksdb_max_lock_memory;
+
+static bool rocksdb_use_range_locking = 0;
+static bool rocksdb_use_range_lock_manager_as_point = 0;
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr_used_as_point;
+
std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
@@ -1506,6 +1523,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
nullptr, rocksdb_set_max_latest_deadlocks,
rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
+static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, rocksdb_max_lock_memory,
+ PLUGIN_VAR_RQCMDARG,
+ "Range-locking mode: Maximum amount of memory "
+ "that locks from all transactions can use at a time",
+ nullptr, rocksdb_set_max_lock_memory,
+ /*initial*/1073741824, 0, UINT64_MAX, 0);
+
static MYSQL_SYSVAR_ENUM(
info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
"Filter level for info logs to be written mysqld error log. "
@@ -2355,6 +2379,19 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan,
rocksdb_update_table_stats_use_table_scan,
rocksdb_table_stats_use_table_scan);
+static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Locking",
+ nullptr, nullptr,
+ rocksdb_use_range_locking);
+
+static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point,
+ rocksdb_use_range_lock_manager_as_point,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Lock Manager as point",
+ nullptr, nullptr,
+ rocksdb_use_range_lock_manager_as_point);
+
static MYSQL_SYSVAR_BOOL(
large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
"Support large index prefix length of 3072 bytes. If off, the maximum "
@@ -2678,7 +2715,9 @@ static struct SYS_VAR *rocksdb_system_variables[] = {
MYSQL_SYSVAR(manual_compaction_threads),
MYSQL_SYSVAR(manual_compaction_bottommost_level),
MYSQL_SYSVAR(rollback_on_timeout),
-
+ MYSQL_SYSVAR(use_range_locking),
+ MYSQL_SYSVAR(use_range_lock_manager_as_point),
+ MYSQL_SYSVAR(max_lock_memory),
MYSQL_SYSVAR(enable_insert_with_update_caching),
MYSQL_SYSVAR(trace_block_cache_access),
MYSQL_SYSVAR(trace_queries),
@@ -2936,8 +2975,41 @@ class Rdb_transaction {
virtual rocksdb::Status do_pop_savepoint() = 0;
virtual void do_rollback_to_savepoint() = 0;
+ private:
+ /*
+ If true, the current statement should not use a snapshot for reading.
+ Note that in a multi-statement transaction, the snapshot may have been
+ allocated by another statement.
+ */
+ bool m_stmt_ignores_snapshot = false;
+
+ /* Snapshot-ignore mode will put away m_reads_opts.snapshot here: */
+ const rocksdb::Snapshot *m_saved_snapshot= nullptr;
+
public:
+
+ void start_ignore_snapshot() {
+ // note: this may be called several times for the same statement
+ if (!m_stmt_ignores_snapshot) {
+ m_saved_snapshot = m_read_opts.snapshot;
+ m_read_opts.snapshot = nullptr;
+ m_stmt_ignores_snapshot= true;
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+ }
+
+ void end_ignore_snapshot_if_needed() {
+ if (m_stmt_ignores_snapshot) {
+ m_stmt_ignores_snapshot = false;
+ m_read_opts.snapshot = m_saved_snapshot;
+ m_saved_snapshot = nullptr;
+ }
+ }
+ bool in_snapshot_ignore_mode() const { return m_stmt_ignores_snapshot; }
+
rocksdb::ReadOptions m_read_opts;
+
const char *m_mysql_log_file_name;
my_off_t m_mysql_log_offset;
const char *m_mysql_gtid;
@@ -3118,6 +3190,19 @@ class Rdb_transaction {
virtual void release_lock(const Rdb_key_def &key_descr,
const std::string &rowkey, bool force = false) = 0;
+ virtual
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start,
+ const rocksdb::Endpoint &end) = 0;
+
+ rocksdb::Status lock_singlepoint_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Slice &point) {
+ // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs.
+ // But here we are locking just one point so this is not necessary.
+ rocksdb::Endpoint endp(point, false);
+ return lock_range(cf, endp, endp);
+ }
+
virtual bool prepare() = 0;
bool commit_or_rollback() {
@@ -3176,10 +3261,17 @@ class Rdb_transaction {
m_is_delayed_snapshot = false;
}
+ void locking_iter_created() {
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+
virtual void acquire_snapshot(bool acquire_now) = 0;
virtual void release_snapshot() = 0;
- bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
+ bool has_snapshot() const {
+ return m_read_opts.snapshot != nullptr || m_saved_snapshot;
+ }
private:
// The Rdb_sst_info structures we are currently loading. In a partitioned
@@ -3530,7 +3622,9 @@ class Rdb_transaction {
virtual rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *column_family) = 0;
+ rocksdb::ColumnFamilyHandle *column_family,
+ bool is_rev_cf,
+ bool use_locking_iterator=false) = 0;
virtual void multi_get(rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -3539,10 +3633,12 @@ class Rdb_transaction {
const bool sorted_input) const = 0;
rocksdb::Iterator *get_iterator(
- rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
+ rocksdb::ColumnFamilyHandle *const column_family, bool is_rev_cf,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
- bool create_snapshot = true) {
+ bool create_snapshot = true,
+ bool use_locking_iterator=false) {
// Make sure we are not doing both read_current (which implies we don't
// want a snapshot) and create_snapshot which makes sure we create
// a snapshot
@@ -3572,12 +3668,14 @@ class Rdb_transaction {
if (read_current) {
options.snapshot = nullptr;
}
- return get_iterator(options, column_family);
+ return get_iterator(options, column_family, is_rev_cf,
+ use_locking_iterator);
}
virtual bool is_tx_started() const = 0;
virtual void start_tx() = 0;
- virtual void start_stmt() = 0;
+ virtual void start_stmt(bool is_dml_statement) = 0;
+ virtual void start_autocommit_stmt(bool /*is_dml_statement*/){}
virtual void set_name() = 0;
protected:
@@ -3736,6 +3834,13 @@ class Rdb_transaction_impl : public Rdb_transaction {
virtual bool is_writebatch_trx() const override { return false; }
+ // Lock the range between two specified endpoints
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start_endp,
+ const rocksdb::Endpoint &end_endp) override {
+ ++m_row_lock_count;
+ return m_rocksdb_tx->GetRangeLock(cf, start_endp, end_endp);
+ }
private:
void release_tx(void) {
// We are done with the current active transaction object. Preserve it
@@ -3821,7 +3926,7 @@ class Rdb_transaction_impl : public Rdb_transaction {
}
void acquire_snapshot(bool acquire_now) override {
- if (m_read_opts.snapshot == nullptr) {
+ if (m_read_opts.snapshot == nullptr && !in_snapshot_ignore_mode()) {
const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
m_thd->get_explicit_snapshot());
if (thd_ss) {
@@ -3966,9 +4071,17 @@ class Rdb_transaction_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const column_family) override {
+ rocksdb::ColumnFamilyHandle *const column_family,
+ bool is_rev_cf,
+ bool use_locking_iterator) override {
global_stats.queries[QUERIES_RANGE].inc();
- return m_rocksdb_tx->GetIterator(options, column_family);
+ if (use_locking_iterator) {
+ locking_iter_created();
+ return GetLockingIterator(m_rocksdb_tx, options, column_family,
+ is_rev_cf, &m_row_lock_count);
+ }
+ else
+ return m_rocksdb_tx->GetIterator(options, column_family);
}
const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
@@ -4042,17 +4155,35 @@ class Rdb_transaction_impl : public Rdb_transaction {
/*
Start a statement inside a multi-statement transaction.
- @todo: are we sure this is called once (and not several times) per
- statement start?
+ @note: If a statement uses N tables, this function will be called N times,
+ for each TABLE object that is used.
For hooking to start of statement that is its own transaction, see
ha_rocksdb::external_lock().
*/
- void start_stmt() override {
+ void start_stmt(bool is_dml_statement) override {
+
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ /*
+ In Range Locking mode, RocksDB does not do "key tracking".
+ Use InnoDB-like concurrency mode: make the DML statements always read
+ the latest data (instead of using transaction's snapshot).
+ This "downgrades" the transaction isolation to READ-COMMITTED on the
+ master, but in return the actions can be replayed on the slave.
+ */
+ start_ignore_snapshot();
+ }
+
// Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
acquire_snapshot(false);
}
+ void start_autocommit_stmt(bool is_dml_statement) override {
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ start_ignore_snapshot();
+ }
+ }
+
/*
This must be called when last statement is rolled back, but the transaction
continues
@@ -4177,6 +4308,12 @@ class Rdb_writebatch_impl : public Rdb_transaction {
// Nothing to do here since we don't hold any row locks.
}
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const,
+ const rocksdb::Endpoint&,
+ const rocksdb::Endpoint&) override {
+ return rocksdb::Status::OK();
+ }
+
void rollback() override {
on_rollback();
m_write_count = 0;
@@ -4275,7 +4412,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const /* column_family */) override {
+ rocksdb::ColumnFamilyHandle *const /* column_family */,
+ bool /*is_rev_cf*/,
+ bool /*use_locking_iterator*/) override {
const auto it = rdb->NewIterator(options);
return m_batch->NewIteratorWithBase(it);
}
@@ -4293,9 +4432,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
set_initial_savepoint();
}
+ void start_stmt(bool /*is_dml_statement*/) override {}
void set_name() override {}
- void start_stmt() override {}
void rollback_stmt() override {
if (m_batch) rollback_to_stmt_savepoint();
@@ -4623,6 +4762,7 @@ static int rocksdb_prepare(handlerton *const hton MY_ATTRIBUTE((__unused__)),
DEBUG_SYNC(thd, "rocksdb.prepared");
} else {
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
return HA_EXIT_SUCCESS;
@@ -4850,6 +4990,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- For a COMMIT statement that finishes a multi-statement transaction
- For a statement that has its own transaction
*/
+ tx->end_ignore_snapshot_if_needed();
if (tx->commit()) {
DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
}
@@ -4859,6 +5000,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
*/
tx->set_tx_failed(false);
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
@@ -4896,6 +5038,7 @@ static int rocksdb_rollback(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- a statement inside a transaction is rolled back
*/
+ tx->end_ignore_snapshot_if_needed();
tx->rollback_stmt();
tx->set_tx_failed(true);
}
@@ -5000,8 +5143,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
"=========================================\n";
}
+ template<class PathStruct>
static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
- const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
+ const PathStruct &txn, const GL_INDEX_ID &gl_index_id) {
Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
txn_data.trx_id = txn.m_txn_id;
@@ -5027,26 +5171,52 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
? cfh->GetName()
: "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id);
- txn_data.waiting_key =
- rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
+ txn_data.waiting_key = format_wait_key(txn);
txn_data.exclusive_lock = txn.m_exclusive;
return txn_data;
}
+ // Get the key to use to find the index number (and then, index name)
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::DeadlockInfo& info) {
+ return info.m_waiting_key;
+ }
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::RangeDeadlockInfo& info) {
+ // Range locks do not span across indexes, so take the left bound
+ return info.m_start.slice;
+ }
+
+ // Print the locked key (or range) in hex
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static std::string format_wait_key(const rocksdb::DeadlockInfo& info) {
+ return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length());
+ }
+ static std::string format_wait_key(const rocksdb::RangeDeadlockInfo& info) {
+ return rdb_hexdump_range(info.m_start, info.m_end);
+ }
+
+ // Get deadlock path info. A templated function so one can use it with both
+ // point and range locking.
+ template<class PathStruct>
static Rdb_deadlock_info get_dl_path_trx_info(
- const rocksdb::DeadlockPath &path_entry) {
+ const PathStruct &path_entry) {
Rdb_deadlock_info deadlock_info;
for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
const auto &txn = *it;
+ auto waiting_key = get_key_for_indexnr(txn);
const GL_INDEX_ID gl_index_id = {
txn.m_cf_id,
{rdb_netbuf_to_uint32(
- reinterpret_cast<const uchar *>(txn.m_waiting_key.c_str())),
+ reinterpret_cast<const uchar *>(waiting_key.c_str())),
rdb_netbuf_to_uint32(
- reinterpret_cast<const uchar *>(txn.m_waiting_key.c_str()) +
+ reinterpret_cast<const uchar *>(waiting_key.c_str()) +
Rdb_key_def::DB_NUMBER_SIZE)}};
deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
}
@@ -5072,7 +5242,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
/* Calculate the duration the snapshot has existed */
int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
- if (snapshot_timestamp != 0) {
+ if (snapshot_timestamp != 0 && tx->has_snapshot()) {
int64_t curr_time;
rdb->GetEnv()->GetCurrentTime(&curr_time);
@@ -5091,8 +5261,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
- void populate_deadlock_buffer() {
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ template<class PathStruct>
+ void populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) {
m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
for (const auto &path_entry : dlock_buffer) {
@@ -5132,12 +5302,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
+ void populate_deadlock_buffer() {
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ }
+ }
+
std::vector<Rdb_deadlock_info> get_deadlock_info() {
std::vector<Rdb_deadlock_info> deadlock_info;
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
- for (const auto &path_entry : dlock_buffer) {
- if (!path_entry.limit_exceeded) {
- deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
+ }
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
}
}
return deadlock_info;
@@ -5553,9 +5743,13 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */,
return ret_val;
}
+/*
+ @param is_dml_stmt If true, we are executing a DML statement
+*/
static inline void rocksdb_register_tx(
handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd,
- Rdb_transaction *const tx) {
+ Rdb_transaction *const tx,
+ bool is_dml_stmt) {
DBUG_ASSERT(tx != nullptr);
trans_register_ha(thd, false, rocksdb_hton, NULL);
@@ -5570,8 +5764,10 @@ static inline void rocksdb_register_tx(
}
}
if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
- tx->start_stmt();
+ tx->start_stmt(is_dml_stmt);
trans_register_ha(thd, true, rocksdb_hton, NULL);
+ } else {
+ tx->start_autocommit_stmt(is_dml_stmt);
}
}
@@ -5656,7 +5852,7 @@ static int rocksdb_start_tx_and_assign_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
return HA_EXIT_SUCCESS;
@@ -5713,7 +5909,7 @@ static int rocksdb_start_tx_with_shared_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
// case: an explicit snapshot was not assigned to this transaction
@@ -6399,6 +6595,25 @@ static int rocksdb_init_internal(void *const p) {
tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
tx_db_options.write_policy =
static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
+
+ if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) {
+ //rdb_log_status_error(
+ // status, "Can't have both range_locking and range_lock_manager_as_point");
+ //DBUG_RETURN(HA_EXIT_FAILURE);
+ rocksdb_use_range_lock_manager_as_point= 0;
+ }
+
+
+ if (rocksdb_use_range_locking) {
+ range_lock_mgr.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr;
+ }
+ if (rocksdb_use_range_lock_manager_as_point) {
+ range_lock_mgr_used_as_point.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr_used_as_point;
+ }
status =
check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
@@ -6433,6 +6648,15 @@ static int rocksdb_init_internal(void *const p) {
DBUG_RETURN(HA_EXIT_FAILURE);
}
+ if (range_lock_mgr)
+ {
+ range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory);
+ sql_print_information("RocksDB: USING NEW RANGE LOCKING");
+ sql_print_information("RocksDB: Max lock memory=%llu", rocksdb_max_lock_memory);
+ }
+ else
+ sql_print_information("RocksDB: USING POINT LOCKING");
+
cf_manager.init(std::move(cf_options_map), &cf_handles);
// NO_LINT_DEBUG
@@ -9006,6 +9230,15 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
}
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
+
+ bool use_locking_iter= false;
+
+ if ((rc = set_range_lock(tx, kd, find_flag, slice, end_range,
+ &use_locking_iter)))
+ DBUG_RETURN(rc);
+ if (use_locking_iter)
+ m_iterator->set_use_locking();
+
const bool is_new_snapshot = !tx->has_snapshot();
// Loop as long as we get a deadlock error AND we end up creating the
@@ -9054,6 +9287,234 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
DBUG_RETURN(rc);
}
+
+/*
+ @brief
+ Compute the range lock endpoints and set the range lock, if necessary
+
+ @param use_locking_iter OUT If true, locks are not set and LockingIterator
+ should be used instead
+
+ @detail
+ If the scanned range doesn't have the endpoint we're scanning towards,
+ don't set the lock, it will be too coarse. Indicate that LockingIterator
+ should be used, instead.
+
+ @return
+ 0 Ok
+ Other Error acquiring the lock (wait timeout, deadlock, etc)
+*/
+
+int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice_arg,
+ const key_range *const end_key,
+ bool *use_locking_iterator)
+{
+ rocksdb::Slice end_slice;
+ uchar end_slice_buf[MAX_KEY_LENGTH];
+ bool start_has_inf_suffix = false, end_has_inf_suffix = false;
+ rocksdb::Slice slice(slice_arg);
+ *use_locking_iterator= false;
+
+ if (m_lock_rows == RDB_LOCK_NONE || !rocksdb_use_range_locking) {
+ return 0;
+ }
+ bool big_range= false;
+
+ /*
+ The 'slice' has the left endpoint of the range to lock.
+ Figure out the right endpoint.
+ */
+
+ if (find_flag == HA_READ_KEY_EXACT) {
+ if (slice.size() == Rdb_key_def::INDEX_ID_SIZE) {
+ // This is a full table/index scan
+ start_has_inf_suffix= false;
+ big_range = true;
+ } else {
+ /*
+ This is "key_part= const" interval. We need to lock this range:
+ (lookup_value, -inf) < key < (lookup_value, +inf)
+ */
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ end_slice= slice;
+ }
+ }
+ else if (find_flag == HA_READ_PREFIX_LAST) {
+ if (slice.size() == Rdb_key_def::INDEX_ID_SIZE) {
+ /* Reverse-ordered full index scan */
+ start_has_inf_suffix= true;
+ big_range = true;
+ } else {
+ /*
+ We get here for queries like:
+
+ select * from t1 where pk1=const order by pk1 desc for update
+
+ assuming this uses an index on (pk1, ...)
+ We get end_key=nullptr.
+
+ The range to lock is the same as with HA_READ_KEY_EXACT above.
+ */
+ end_slice= slice;
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ }
+ }
+ else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) {
+ /*
+ We get here for queries like:
+
+ select * from t1 where pk1=const1 and pk2 between const2 and const3
+ order by pk1 desc
+ for update
+
+ assuming this uses an index on (pk1, pk2).
+ The slice has the right endpoint: {const1, const3}
+ the end_key has the left endpoint: {const1, const2}.
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ // Pack the left endpoint and make "slice" point to it
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ }
+ else if (find_flag == HA_READ_BEFORE_KEY) {
+ /*
+ We get here for queries like
+ select * from t1
+ where pk <1007 order by pk desc limit 2 for update
+ select * from t1
+ where pk >=800 and pk <1007 order by pk desc limit 2 for update
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ if (end_key) {
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+
+ end_has_inf_suffix= false;
+ big_range= false;
+ } else {
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+
+ big_range= true;
+ }
+ }
+ else if (end_key) {
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else
+ DBUG_ASSERT(0);
+
+ // Known end range bounds: HA_READ_AFTER_KEY, HA_READ_BEFORE_KEY
+ if (end_key->flag == HA_READ_AFTER_KEY) {
+ // this is "key_part <= const".
+ end_has_inf_suffix= true;
+ } else if (end_key->flag == HA_READ_BEFORE_KEY) {
+ // this is "key_part < const", non-inclusive.
+ end_has_inf_suffix= false;
+ } else
+ DBUG_ASSERT(0);
+
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size= kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key,
+ end_key->keypart_map);
+
+ end_slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ }
+ else
+ {
+ big_range= true;
+#if 0
+ // The below is code to handle this without LockingIterator:
+ // No end key
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else
+ DBUG_ASSERT(0);
+
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+ end_has_inf_suffix= true;
+#endif
+ }
+
+ if (big_range)
+ {
+ *use_locking_iterator= true;
+ return 0;
+ }
+
+ rocksdb::Endpoint start_endp;
+ rocksdb::Endpoint end_endp;
+
+ if (kd.m_is_reverse_cf) {
+ // Flip the endpoints
+ start_endp =rocksdb::Endpoint(end_slice, !end_has_inf_suffix);
+ end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix);
+ } else {
+ start_endp= rocksdb::Endpoint(slice, start_has_inf_suffix);
+ end_endp= rocksdb::Endpoint(end_slice, end_has_inf_suffix);
+ }
+
+ /*
+ RocksDB's iterator is reading the snapshot of the data that was taken at
+ the time the iterator was created.
+
+ After we've got a lock on the range, we'll need to refresh the iterator
+ to read the latest contents. (If we use the iterator created before the
+ lock_range() call, we may miss the changes that were made/committed after
+ the iterator was created but before the lock_range() call was made).
+
+ RocksDB has Iterator::Refresh() method, but alas, it is not implemented for
+ the iterator returned by Transaction object (Transaction object returns
+ BaseDeltaIterator which allows one to see the transactions's own changes).
+
+ Our solution to this is to release the iterator and create the new one.
+ We release it here, it will be created as soon as there's a need to read
+ records.
+ */
+ //release_scan_iterator();
+ m_iterator->reset();
+
+ auto s= tx->lock_range(kd.get_cf(), start_endp, end_endp);
+ if (!s.ok()) {
+ return (tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ return 0;
+}
+
/*
See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
index navigation commands are converted into RocksDB lookup commands.
@@ -9520,7 +9981,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
}
}
- if (rc == HA_ERR_END_OF_FILE) {
+ if (rc != HA_EXIT_SUCCESS) {
break;
}
@@ -9528,7 +9989,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
const rocksdb::Slice &value = m_iterator->value();
if (active_index == table->s->primary_key) {
- if (m_lock_rows != RDB_LOCK_NONE) {
+ if (m_lock_rows != RDB_LOCK_NONE && !rocksdb_use_range_locking) {
DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
/* We need to put a lock and re-read */
rc = get_row_by_rowid(buf, key.data(), key.size());
@@ -10610,6 +11071,15 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
old_key_slice = rocksdb::Slice(
reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
+ /* Range locking: lock the index tuple being deleted */
+ if (rocksdb_use_range_locking) {
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+ }
+
row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
old_key_slice);
@@ -10653,6 +11123,14 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
if (bulk_load_sk && row_info.old_data == nullptr) {
rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
} else {
+ /* Range locking: lock the index tuple being inserted */
+ if (rocksdb_use_range_locking) {
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+ }
row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
new_value_slice);
}
@@ -11068,6 +11546,19 @@ int ha_rocksdb::delete_row(const uchar *const buf) {
nullptr, false, hidden_pk_id);
rocksdb::Slice secondary_key_slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
+
+ /*
+ For point locking, deleting on a secondary key doesn't need any locks.
+ Range locking must get a lock.
+ */
+ if (rocksdb_use_range_locking) {
+ auto s= tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice);
+ if (!s.ok()) {
+ DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ }
+
tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
secondary_key_slice);
bytes_written += secondary_key_slice.size();
@@ -11635,7 +12126,7 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
}
}
tx->m_n_mysql_tables_in_use++;
- rocksdb_register_tx(rocksdb_hton, thd, tx);
+ rocksdb_register_tx(rocksdb_hton, thd, tx, (lock_type == F_WRLCK));
tx->io_perf_start(&m_io_perf);
}
@@ -11663,7 +12154,7 @@ int ha_rocksdb::start_stmt(THD *const thd,
Rdb_transaction *const tx = get_or_create_tx(thd);
read_thd_vars(thd);
- rocksdb_register_tx(ht, thd, tx);
+ rocksdb_register_tx(ht, thd, tx, (lock_type == F_WRLCK));
tx->io_perf_start(&m_io_perf);
DBUG_RETURN(HA_EXIT_SUCCESS);
@@ -14150,6 +14641,36 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)),
return 0;
}
+//
+// Lock Tree Status variables
+//
+static longlong rocksdb_locktree_escalation_count=1234;
+static longlong rocksdb_locktree_current_lock_memory=0;
+
+static SHOW_VAR rocksdb_locktree_status_variables[] = {
+ DEF_STATUS_VAR_FUNC("escalation_count",
+ &rocksdb_locktree_escalation_count, SHOW_LONGLONG),
+ DEF_STATUS_VAR_FUNC("current_lock_memory",
+ &rocksdb_locktree_current_lock_memory, SHOW_LONGLONG),
+ // end of the array marker
+ {NullS, NullS, SHOW_LONG}};
+
+static SHOW_VAR rocksdb_empty_status_variables[] = {
+ {NullS, NullS, SHOW_LONG}};
+
+static void show_rocksdb_locktree_vars(THD*, SHOW_VAR *var, char*) {
+ var->type = SHOW_ARRAY;
+ if (range_lock_mgr)
+ {
+ auto status = range_lock_mgr->GetStatus();
+ rocksdb_locktree_escalation_count = status.escalation_count;
+ rocksdb_locktree_current_lock_memory = status.current_lock_memory;
+ var->value = reinterpret_cast<char *>(&rocksdb_locktree_status_variables);
+ }
+ else
+ var->value = reinterpret_cast<char *>(&rocksdb_empty_status_variables);
+}
+
static SHOW_VAR rocksdb_status_vars[] = {
DEF_STATUS_VAR(block_cache_miss),
DEF_STATUS_VAR(block_cache_hit),
@@ -14275,6 +14796,8 @@ static SHOW_VAR rocksdb_status_vars[] = {
SHOW_SCOPE_GLOBAL},
{"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
SHOW_FUNC, SHOW_SCOPE_GLOBAL},
+ {"rocksdb_locktree", reinterpret_cast<char *>(show_rocksdb_locktree_vars),
+ SHOW_FUNC, SHOW_SCOPE_GLOBAL},
{NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
/*
@@ -15135,6 +15658,23 @@ void rocksdb_set_delayed_write_rate(THD *thd MY_ATTRIBUTE((unused)),
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
+void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR*,
+ void* /*var_ptr*/, const void *save) {
+ const uint64_t new_val = *static_cast<const uint64_t *>(save);
+ if (rocksdb_max_lock_memory != new_val) {
+ if (range_lock_mgr->SetMaxLockMemory(new_val)) {
+ /* NO_LINT_DEBUG */
+ sql_print_warning("MyRocks: failed to set max_lock_memory");
+ push_warning_printf(thd, Sql_condition::SL_WARNING,
+ ER_ERROR_WHEN_EXECUTING_COMMAND,
+ "Cannot set max_lock_memory to size below currently used");
+ } else {
+ // Succeeded
+ rocksdb_max_lock_memory = new_val;
+ }
+ }
+}
+
void rocksdb_set_max_latest_deadlocks(
THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)),
void *var_ptr MY_ATTRIBUTE((unused)), const void *save) {
@@ -15142,7 +15682,13 @@ void rocksdb_set_max_latest_deadlocks(
const uint32_t new_val = *static_cast<const uint32_t *>(save);
if (rocksdb_max_latest_deadlocks != new_val) {
rocksdb_max_latest_deadlocks = new_val;
- rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+ if (range_lock_mgr) {
+ auto n= rocksdb_max_latest_deadlocks;
+ range_lock_mgr->SetRangeDeadlockInfoBufferSize(n);
+ }
+ else
+ rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+
}
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
@@ -15848,19 +16394,21 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) {
rocksdb::Iterator *rdb_tx_get_iterator(
Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
+ bool is_rev_cf,
bool skip_bloom_filter, const rocksdb::Slice &lower_bound_slice,
const rocksdb::Slice &upper_bound_slice, bool read_current,
bool create_snapshot) {
- return tx->get_iterator(column_family, skip_bloom_filter, lower_bound_slice,
+ return tx->get_iterator(column_family, is_rev_cf, skip_bloom_filter, lower_bound_slice,
upper_bound_slice, read_current, create_snapshot);
}
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool is_rev_cf,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current,
- bool create_snapshot) {
+ bool create_snapshot, bool use_locking_iter) {
if (commit_in_the_middle(thd)) {
DBUG_ASSERT(*snapshot == nullptr);
*snapshot = rdb->GetSnapshot();
@@ -15872,8 +16420,8 @@ rocksdb::Iterator *rdb_tx_get_iterator(
return rdb->NewIterator(read_opts, cf);
} else {
Rdb_transaction *tx = get_tx_from_thd(thd);
- return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound,
- eq_cond_upper_bound, read_current, create_snapshot);
+ return tx->get_iterator(cf, is_rev_cf, skip_bloom_filter, eq_cond_lower_bound,
+ eq_cond_upper_bound, read_current, create_snapshot, use_locking_iter);
}
}
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index d0baeefe942..dc7a39538b2 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -317,6 +317,13 @@ class ha_rocksdb : public my_core::handler {
const rocksdb::Slice &key,
rocksdb::PinnableSlice *value) const;
+ int set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice,
+ const key_range *const end_key,
+ bool *use_locking_iterator);
+
int get_row_by_rowid(uchar *const buf, const char *const rowid,
const uint rowid_size, const bool skip_lookup = false,
const bool skip_ttl_check = true)
@@ -997,6 +1004,8 @@ class ha_rocksdb : public my_core::handler {
/* Need to build decoder on next read operation */
bool m_need_build_decoder;
+
+ int iter_status_to_retval(rocksdb::Iterator *it, const Rdb_key_def &kd, int not_found_code);
};
/*
@@ -1144,16 +1153,20 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx);
rocksdb::Iterator *rdb_tx_get_iterator(
Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
+ bool is_rev_cf,
bool skip_bloom, const rocksdb::Slice &lower_bound_slice,
const rocksdb::Slice &upper_bound_slice, bool read_current = false,
bool create_snapshot = true);
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf,
+ bool is_rev_cf,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current = false,
- bool create_snapshot = true);
+ bool create_snapshot = true,
+ bool use_locking_iter= false);
rocksdb::Status rdb_tx_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
@@ -1226,4 +1239,6 @@ extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized;
extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
+extern std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
} // namespace myrocks
diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc
index f13b94b32f1..a5eae60b71f 100644
--- a/storage/rocksdb/nosql_access.cc
+++ b/storage/rocksdb/nosql_access.cc
@@ -675,10 +675,11 @@ class select_exec {
}
rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle *cf,
+ bool is_rev_cf,
bool use_bloom,
const rocksdb::Slice &lower_bound,
const rocksdb::Slice &upper_bound) {
- return rdb_tx_get_iterator(m_tx, cf, !use_bloom, lower_bound,
+ return rdb_tx_get_iterator(m_tx, cf, is_rev_cf, !use_bloom, lower_bound,
upper_bound);
}
@@ -1509,7 +1510,8 @@ bool INLINE_ATTR select_exec::setup_iterator(txn_wrapper *txn,
m_thd, *m_key_def, eq_slice, bound_len, m_lower_bound_buf.data(),
m_upper_bound_buf.data(), &m_lower_bound_slice, &m_upper_bound_slice);
rocksdb::Iterator *it = txn->get_iterator(
- m_key_def->get_cf(), use_bloom, m_lower_bound_slice, m_upper_bound_slice);
+ m_key_def->get_cf(), m_key_def->m_is_reverse_cf, use_bloom,
+ m_lower_bound_slice, m_upper_bound_slice);
if (it == nullptr) {
return true;
}
diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc
index 3d9bae5c6aa..d3f146e9688 100644
--- a/storage/rocksdb/rdb_i_s.cc
+++ b/storage/rocksdb/rdb_i_s.cc
@@ -1783,31 +1783,63 @@ static int rdb_i_s_lock_info_fill_table(
}
/* cf id -> rocksdb::KeyLockInfo */
- std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info =
- rdb->GetLockStatusData();
-
- for (const auto &lock : lock_info) {
- const uint32_t cf_id = lock.first;
- const auto &key_lock_info = lock.second;
- const auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(),
- key_lock_info.key.length(), FN_REFLEN);
-
- for (const auto &id : key_lock_info.ids) {
- tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
- true);
- tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
-
- tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
- key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
- tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
- key_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
-
- /* Tell MySQL about this row in the virtual table */
- ret = static_cast<int>(
- my_core::schema_table_store_record(thd, tables->table));
-
- if (ret != 0) {
- break;
+ if (range_lock_mgr) {
+ // Use Range Lock Manager's interface for obtaining more specific
+ // information about the acquired locks
+ auto lock_info = range_lock_mgr->GetRangeLockStatusData();
+
+ for (const auto &lock : lock_info) {
+ const uint32_t cf_id = lock.first;
+ const auto &range_lock_info = lock.second;
+
+ std::string key_hexstr = rdb_hexdump_range(range_lock_info.start,
+ range_lock_info.end);
+
+ for (const auto &id : range_lock_info.ids) {
+ tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
+ true);
+ tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
+
+ tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
+ key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
+ tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
+ range_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
+
+ /* Tell MySQL about this row in the virtual table */
+ ret = static_cast<int>(
+ my_core::schema_table_store_record(thd, tables->table));
+
+ if (ret != 0) {
+ break;
+ }
+ }
+ }
+ } else {
+ std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info =
+ rdb->GetLockStatusData();
+
+ for (const auto &lock : lock_info) {
+ const uint32_t cf_id = lock.first;
+ const auto &key_lock_info = lock.second;
+ auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(),
+ key_lock_info.key.length(), FN_REFLEN);
+ for (const auto &id : key_lock_info.ids) {
+ tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
+ true);
+ tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
+
+ tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
+ key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
+ tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
+ key_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
+
+ /* Tell MySQL about this row in the virtual table */
+ ret = static_cast<int>(
+ my_core::schema_table_store_record(thd, tables->table));
+
+ if (ret != 0) {
+ break;
+ }
}
}
}
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 978f5ab1023..5c0513a6cca 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -34,6 +34,7 @@ Rdb_iterator_base::Rdb_iterator_base(THD *thd,
m_tbl_def(tbl_def),
m_thd(thd),
m_scan_it(nullptr),
+ m_use_locking_iter(false),
m_scan_it_skips_bloom(false),
m_scan_it_snapshot(nullptr),
m_scan_it_lower_bound(nullptr),
@@ -81,7 +82,7 @@ int Rdb_iterator_base::read_before_key(const bool full_key_match,
return HA_EXIT_SUCCESS;
}
- return HA_ERR_END_OF_FILE;
+ return iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
@@ -96,12 +97,15 @@ int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
*/
rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice);
- return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE;
+ return is_valid_iterator(m_scan_it) ?
+ HA_EXIT_SUCCESS :
+ iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
void Rdb_iterator_base::release_scan_iterator() {
delete m_scan_it;
m_scan_it = nullptr;
+ m_use_locking_iter = false;
if (m_scan_it_snapshot) {
auto rdb = rdb_get_rocksdb_db();
@@ -135,6 +139,10 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
skip_bloom = false;
}
+ // Save the value of m_use_locking_iter because release_scan_iterator()
+ // will set it to false.
+ bool use_locking_iter= m_use_locking_iter;
+
/*
In some cases, setup_scan_iterator() is called multiple times from
the same query but bloom filter can not always be used.
@@ -162,9 +170,10 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
*/
if (!m_scan_it) {
m_scan_it = rdb_tx_get_iterator(
- m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice,
+ m_thd, m_kd->get_cf(), m_kd->m_is_reverse_cf, skip_bloom,
+ m_scan_it_lower_bound_slice,
m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current,
- !read_current);
+ !read_current, use_locking_iter);
m_scan_it_skips_bloom = skip_bloom;
}
}
@@ -206,6 +215,20 @@ int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag,
return Rdb_key_def::INDEX_ID_SIZE;
}
+int Rdb_iterator_base::iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code) {
+ if (it->Valid())
+ return HA_EXIT_SUCCESS;
+
+ rocksdb::Status s= it->status();
+ if (s.ok() || s.IsNotFound())
+ return not_found_code;
+
+ Rdb_transaction *tx = get_tx_from_thd(m_thd);
+ return rdb_tx_set_status_error(tx, s, *kd, m_tbl_def);
+}
+
int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
int rc = 0;
const auto &kd = *m_kd;
@@ -235,7 +258,7 @@ int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
}
if (!is_valid_iterator(m_scan_it)) {
- rc = HA_ERR_END_OF_FILE;
+ rc = iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
break;
}
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 164d32c4099..4810b4e90fd 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -50,6 +50,8 @@ class Rdb_iterator {
virtual rocksdb::Slice key() = 0;
virtual rocksdb::Slice value() = 0;
virtual void reset() = 0;
+
+ virtual void set_use_locking()=0;
};
class Rdb_iterator_base : public Rdb_iterator {
@@ -89,6 +91,7 @@ class Rdb_iterator_base : public Rdb_iterator {
void reset() override { release_scan_iterator(); }
+ void set_use_locking() override { m_use_locking_iter = true; }
protected:
friend class Rdb_iterator;
const std::shared_ptr<Rdb_key_def> m_kd;
@@ -103,6 +106,8 @@ class Rdb_iterator_base : public Rdb_iterator {
/* Iterator used for range scans and for full table/index scans */
rocksdb::Iterator *m_scan_it;
+ bool m_use_locking_iter;
+
/* Whether m_scan_it was created with skip_bloom=true */
bool m_scan_it_skips_bloom;
@@ -116,6 +121,10 @@ class Rdb_iterator_base : public Rdb_iterator {
uchar *m_prefix_buf;
rocksdb::Slice m_prefix_tuple;
+
+ int iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code);
};
class Rdb_iterator_partial : public Rdb_iterator_base {
diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc
new file mode 100644
index 00000000000..739f383a816
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.cc
@@ -0,0 +1,108 @@
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+
+/* This C++ file's header file */
+#include "./rdb_locking_iter.h"
+
+namespace myrocks {
+
+rocksdb::Iterator* GetLockingIterator(
+ rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ bool is_rev_cf,
+ ulonglong *counter) {
+ return new LockingIterator(trx, column_family, is_rev_cf, read_options,
+ counter);
+}
+
+/*
+ @brief
+ Seek to the first key K that is equal or greater than target,
+ locking the range [target; K].
+*/
+
+void LockingIterator::Seek(const rocksdb::Slice& target) {
+ iter_ = txn_->GetIterator(read_opts_, cfh_);
+ iter_->Seek(target);
+ ScanForward(target, false);
+}
+
+void LockingIterator::SeekForPrev(const rocksdb::Slice& target) {
+ iter_ = txn_->GetIterator(read_opts_, cfh_);
+ iter_->SeekForPrev(target);
+ ScanBackward(target, false);
+}
+
+/*
+ @brief
+ Move the iterator to the next key, locking the range between the current
+ and the next key.
+
+ @detail
+ Implementation is similar to Seek(next_key). Since we don't know what the
+ next_key is, we reach it by calling { Seek(current_key); Next(); }
+*/
+void LockingIterator::Next() {
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next");
+ assert(Valid());
+ // Save the current key value. We need it as the left endpoint
+ // of the range lock we're going to acquire
+ std::string current_key = iter_->key().ToString();
+
+ iter_->Next();
+ ScanForward(rocksdb::Slice(current_key), true);
+}
+
+/*
+ @brief
+ Move the iterator to the previous key, locking the range between the current
+ and the previous key.
+*/
+
+void LockingIterator::Prev() {
+ assert(Valid());
+
+ std::string current_key = iter_->key().ToString();
+ iter_->Prev();
+ ScanBackward(rocksdb::Slice(current_key), true);
+}
+
+
+/*
+ @detail
+ Ideally, this function should
+ - find the first key $first_key
+ - lock the range [-inf; $first_key]
+ - return, the iterator is positioned on $first_key
+
+ The problem here is that we cannot have "-infinity" bound.
+
+ Note: we don't have a practical use for this function - MyRocks always
+ searches within one index_name.table_name, which means we are only looking
+ at the keys with index_number as the prefix.
+*/
+
+void LockingIterator::SeekToFirst() {
+ DBUG_ASSERT(0);
+ status_ = rocksdb::Status::NotSupported("Not implemented");
+ valid_ = false;
+}
+
+/*
+ @detail
+ See SeekToFirst.
+*/
+
+void LockingIterator::SeekToLast() {
+ DBUG_ASSERT(0);
+ status_ = rocksdb::Status::NotSupported("Not implemented");
+ valid_ = false;
+}
+
+} // namespace myrocks
+
diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h
new file mode 100644
index 00000000000..5a9ed6c275d
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.h
@@ -0,0 +1,190 @@
+
+/* MySQL header files */
+#include "sql/handler.h" /* handler */
+#include "sql/debug_sync.h"
+#include "./rdb_threads.h" /* for thd_get_current_thd */
+
+/* MyRocks header files */
+#include "./ha_rocksdb.h"
+
+namespace myrocks {
+
+//////////////////////////////////////////////////////////////////////////////
+// Locking iterator
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// LockingIterator is an iterator that locks the rows before returning, as well
+// as scanned gaps between the rows.
+//
+// Example:
+// lock_iter= trx->GetLockingIterator();
+// lock_iter->Seek('abc');
+// lock_iter->Valid()==true && lock_iter->key() == 'bcd';
+//
+// After the above, the returned record 'bcd' is locked by transaction trx.
+// Also, the range between ['abc'..'bcd'] is empty and locked by trx.
+//
+// lock_iter->Next();
+// lock_iter->Valid()==true && lock_iter->key() == 'efg'
+//
+// Now, the range ['bcd'.. 'efg'] (bounds inclusive) is also locked, and there are no
+// records between 'bcd' and 'efg'.
+//
+class LockingIterator : public rocksdb::Iterator {
+
+ rocksdb::Transaction *txn_;
+ rocksdb::ColumnFamilyHandle* cfh_;
+ bool m_is_rev_cf;
+ rocksdb::ReadOptions read_opts_;
+ rocksdb::Iterator *iter_;
+ rocksdb::Status status_;
+
+ // note: an iterator that has reached EOF has status()==OK && valid_==false
+ bool valid_;
+
+ ulonglong *lock_count_;
+ public:
+ LockingIterator(rocksdb::Transaction *txn,
+ rocksdb::ColumnFamilyHandle *cfh,
+ bool is_rev_cf,
+ const rocksdb::ReadOptions& opts,
+ ulonglong *lock_count=nullptr
+ ) :
+ txn_(txn), cfh_(cfh), m_is_rev_cf(is_rev_cf), read_opts_(opts), iter_(nullptr),
+ status_(rocksdb::Status::InvalidArgument()), valid_(false),
+ lock_count_(lock_count) {}
+
+ ~LockingIterator() {
+ delete iter_;
+ }
+
+ virtual bool Valid() const override { return valid_; }
+
+ // Note: MyRocks doesn't ever call these:
+ virtual void SeekToFirst() override;
+ virtual void SeekToLast() override;
+
+ virtual void Seek(const rocksdb::Slice& target) override;
+
+ // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ virtual void SeekForPrev(const rocksdb::Slice& target) override;
+
+ virtual void Next() override;
+ virtual void Prev() override;
+
+ virtual rocksdb::Slice key() const override {
+ assert(Valid());
+ return iter_->key();
+ }
+
+ virtual rocksdb::Slice value() const override {
+ assert(Valid());
+ return iter_->value();
+ }
+
+ virtual rocksdb::Status status() const override {
+ return status_;
+ }
+
+ private:
+ template <bool forward> void Scan(const rocksdb::Slice& target,
+ bool call_next) {
+ if (!iter_->Valid()) {
+ status_ = iter_->status();
+ valid_ = false;
+ return;
+ }
+
+ while (1) {
+ /*
+ note: the underlying iterator checks iterator bounds, so we don't need
+ to check them here
+ */
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.locking_iter_scan");
+ auto end_key = iter_->key();
+ bool endp_arg= m_is_rev_cf;
+ if (forward) {
+ status_ = txn_->GetRangeLock(cfh_,
+ rocksdb::Endpoint(target, endp_arg),
+ rocksdb::Endpoint(end_key, endp_arg));
+ } else {
+ status_ = txn_->GetRangeLock(cfh_,
+ rocksdb::Endpoint(end_key, endp_arg),
+ rocksdb::Endpoint(target, endp_arg));
+ }
+
+ if (!status_.ok()) {
+ // Failed to get a lock (most likely lock wait timeout)
+ valid_ = false;
+ return;
+ }
+ if (lock_count_) (*lock_count_)++;
+ std::string end_key_copy= end_key.ToString();
+
+ // Ok, now we have a lock which is inhibiting modifications in the range.
+ // Somebody might have done external modifications, though:
+ // - removed the key we've found
+ // - added a key before that key.
+
+ // First, refresh the iterator:
+ delete iter_;
+ iter_ = txn_->GetIterator(read_opts_, cfh_);
+
+ // Then, try seeking to the same row
+ if (forward)
+ iter_->Seek(target);
+ else
+ iter_->SeekForPrev(target);
+
+ auto cmp= cfh_->GetComparator();
+
+ if (call_next && iter_->Valid() && !cmp->Compare(iter_->key(), target)) {
+ if (forward)
+ iter_->Next();
+ else
+ iter_->Prev();
+ }
+
+ if (iter_->Valid()) {
+ int inv = forward ? 1 : -1;
+ if (cmp->Compare(iter_->key(), rocksdb::Slice(end_key_copy))*inv <= 0) {
+ // Ok, the found key is within the range.
+ status_ = rocksdb::Status::OK();
+ valid_= true;
+ break;
+ } else {
+ // We've got a key but it is outside the range we've locked.
+ // Re-try the lock-and-read step.
+ continue;
+ }
+ } else {
+ // There's no row (within the iterator bounds perhaps). Exit now.
+ // (we might already have locked a range in this function but there's
+ // nothing we can do about it)
+ valid_ = false;
+ status_ = iter_->status();
+ break;
+ }
+ }
+ }
+
+ inline void ScanForward(const rocksdb::Slice& target, bool call_next) {
+ Scan<true>(target, call_next);
+ }
+
+ inline void ScanBackward(const rocksdb::Slice& target, bool call_next) {
+ Scan<false>(target, call_next);
+ }
+};
+
+rocksdb::Iterator*
+GetLockingIterator(rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ bool is_rev_cf,
+ ulonglong *counter);
+
+} // namespace myrocks
diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc
index b14a3c94ea9..0c59ac27553 100644
--- a/storage/rocksdb/rdb_utils.cc
+++ b/storage/rocksdb/rdb_utils.cc
@@ -254,6 +254,33 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
return str;
}
+/*
+ Print the range in hex, in "start_endpoint-end_endpoint" form
+*/
+
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& start,
+ const rocksdb::EndpointWithString& end) {
+ std::string res;
+ // Endpoints without an infinity suffix (":0") should be printed like point keys
+ if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) {
+ // This is a single-point range, show it like a key
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ } else {
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ if (start.inf_suffix)
+ res.append(":1");
+
+ res.append("-");
+
+ std::string key2 = rdb_hexdump(end.slice.c_str(), end.slice.length(),
+ FN_REFLEN);
+ if (end.inf_suffix)
+ key2.append(":1");
+ res.append(key2);
+ }
+ return res;
+}
+
/*
Attempt to access the database subdirectory to see if it exists
*/
diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h
index 74e735c5c2d..92a73f397af 100644
--- a/storage/rocksdb/rdb_utils.h
+++ b/storage/rocksdb/rdb_utils.h
@@ -29,6 +29,7 @@
/* RocksDB header files */
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
/* MyRocks header files */
#include "./rdb_global.h"
@@ -290,6 +291,8 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
const std::size_t maxsize = 0)
MY_ATTRIBUTE((__nonnull__));
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& left,
+ const rocksdb::EndpointWithString& right);
/*
Helper function to see if a database exists
*/
1
0
revision-id: 64a1f75b1f122633470d693bcd71b0a237d7b347 (percona-202102-55-g64a1f75b1f1)
parent(s): d1d0c156c629689b013de067b6fa01e4009484d5
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-05-17 18:01:23 +0300
message:
Apply the Partial Iterator fix from Manuel
---
storage/rocksdb/rdb_iterator.cc | 1 +
1 file changed, 1 insertion(+)
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 60dd0c4c6ab..978f5ab1023 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -637,6 +637,7 @@ int Rdb_iterator_partial::materialize_prefix() {
rocksdb_partial_index_rows_materialized += num_rows;
exit:
+ m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
rdb_tx_release_lock(tx, *m_kd, cur_prefix_key);
return rc;
}
1
0