[Commits] 9545837: MDEV-26189 Missing handling of unknown column in WHERE of recursive CTE
by IgorBabaev 21 Jul '21
revision-id: 9545837ad14c64e2c4491eaa1bd1c56bc2cbc589 (mariadb-10.2.31-1070-g9545837)
parent(s): 872422dcbbe3681a794935fb2cae422d9d5f4108
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-20 23:14:43 -0700
message:
MDEV-26189 Missing handling of unknown column in WHERE of recursive CTE
The SQL processor failed to catch references to unknown columns and other
semantic-analysis errors in the specification of a hanging recursive CTE.
This happened because the function
With_clause::prepare_unreferenced_elements() did not recognize a CTE as
hanging if the CTE was recursive.
Fixing this in the mentioned function exposed another problem: EXPLAIN
started to include the lines for the specifications of hanging recursive
CTEs in its output. That problem is also fixed by this patch.
Approved by Dmitry Shulga <dmitry.shulga(a)mariadb.com>
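For illustration, a minimal reproduction distilled from the test cases added below. The recursive CTE r is hanging (the main SELECT never references it), yet its specification refers to the non-existent column r.b; with this patch the statement fails with ER_BAD_FIELD_ERROR instead of being silently accepted:

create table t1 (a int);
insert into t1 values (3), (7), (1);

-- r is recursive but never used by the main query, i.e. a hanging CTE.
-- Column r.b does not exist; after the fix this statement (and its EXPLAIN)
-- reports: Unknown column 'r.b' in 'where clause'.
with recursive
  r as (select * from t1 union select s1.* from t1 as s1, r where s1.a = r.b)
select * from t1 as t;

drop table t1;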
---
mysql-test/r/cte_recursive.result | 46 +++++++++++++++++++++++++++++++++----
mysql-test/t/cte_recursive.test | 48 +++++++++++++++++++++++++++++++++++++--
sql/sql_cte.cc | 3 ++-
sql/sql_cte.h | 2 ++
sql/sql_select.cc | 12 ++++++----
5 files changed, 100 insertions(+), 11 deletions(-)
diff --git a/mysql-test/r/cte_recursive.result b/mysql-test/r/cte_recursive.result
index a4d821e..1b1fd8b 100644
--- a/mysql-test/r/cte_recursive.result
+++ b/mysql-test/r/cte_recursive.result
@@ -3689,7 +3689,7 @@ select * from t1 as t;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t ALL NULL NULL NULL NULL 4 100.00
Warnings:
-Note 1003 with recursive cte as (select `*` AS `*` from `test`.`t1` where `a` = 1 union select `a` + 1 AS `a+1` from `cte` where `a` < 3)select `test`.`t`.`a` AS `a` from `test`.`t1` `t`
+Note 1003 with recursive cte as (select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 1 union select `cte`.`a` + 1 AS `a+1` from `cte` where `cte`.`a` < 3)select `test`.`t`.`a` AS `a` from `test`.`t1` `t`
with recursive cte as
(select * from t1 where a=1 union select a+1 from cte where a<3)
select * from t1 as t;
@@ -3702,10 +3702,10 @@ create table t2 ( i1 int, i2 int);
insert into t2 values (1,1),(2,2);
explain
with recursive cte as
-( select * from t1 union select s1.* from t1 as s1, cte where s1.i1 = cte.i2 )
-select * from t1 as t;
+( select * from t2 union select s1.* from t2 as s1, cte where s1.i1 = cte.i2 )
+select * from t2 as t;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t ALL NULL NULL NULL NULL 4
+1 PRIMARY t ALL NULL NULL NULL NULL 2
drop table t1,t2;
#
# MDEV-22042: ANALYZE of query using stored function and recursive CTE
@@ -4481,5 +4481,43 @@ b
deallocate prepare stmt;
drop table t1,t2;
#
+# MDEV-26189: Unknown column reference within hanging recursive CTE
+#
+create table t1 (a int);
+insert into t1 values (3), (7), (1);
+with recursive
+r as (select * from t1 union select s1.* from t1 as s1, r where s1.a = r.b)
+select * from t1 as t;
+ERROR 42S22: Unknown column 'r.b' in 'where clause'
+explain with recursive
+r as (select * from t1 union select s1.* from t1 as s1, r where s1.a = r.b)
+select * from t1 as t;
+ERROR 42S22: Unknown column 'r.b' in 'where clause'
+create procedure sp1() with recursive
+r as (select * from t1 union select s1.* from t1 as s1, r where s1.a = r.b)
+select * from t1 as t;
+call sp1();
+ERROR 42S22: Unknown column 'r.b' in 'where clause'
+call sp1();
+ERROR 42S22: Unknown column 'r.b' in 'where clause'
+with recursive
+r as (select * from t1 union select s1.* from t1 as s1, r where s1.b = r.a)
+select * from t1 as t;
+ERROR 42S22: Unknown column 's1.b' in 'where clause'
+explain with recursive
+r as (select * from t1 union select s1.* from t1 as s1, r where s1.b = r.a)
+select * from t1 as t;
+ERROR 42S22: Unknown column 's1.b' in 'where clause'
+create procedure sp2() with recursive
+r as (select * from t1 union select s1.* from t1 as s1, r where s1.b = r.a)
+select * from t1 as t;
+call sp2();
+ERROR 42S22: Unknown column 's1.b' in 'where clause'
+call sp2();
+ERROR 42S22: Unknown column 's1.b' in 'where clause'
+drop procedure sp1;
+drop procedure sp2;
+drop table t1;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/cte_recursive.test b/mysql-test/t/cte_recursive.test
index 49f9c1f..cdd3a07 100644
--- a/mysql-test/t/cte_recursive.test
+++ b/mysql-test/t/cte_recursive.test
@@ -2556,8 +2556,8 @@ insert into t2 values (1,1),(2,2);
explain
with recursive cte as
- ( select * from t1 union select s1.* from t1 as s1, cte where s1.i1 = cte.i2 )
-select * from t1 as t;
+ ( select * from t2 union select s1.* from t2 as s1, cte where s1.i1 = cte.i2 )
+select * from t2 as t;
drop table t1,t2;
@@ -2841,5 +2841,49 @@ deallocate prepare stmt;
drop table t1,t2;
--echo #
+--echo # MDEV-26189: Unknown column reference within hanging recursive CTE
+--echo #
+
+create table t1 (a int);
+insert into t1 values (3), (7), (1);
+
+let $q1=
+with recursive
+ r as (select * from t1 union select s1.* from t1 as s1, r where s1.a = r.b)
+select * from t1 as t;
+
+--ERROR ER_BAD_FIELD_ERROR
+eval $q1;
+--ERROR ER_BAD_FIELD_ERROR
+eval explain $q1;
+
+eval create procedure sp1() $q1;
+--ERROR ER_BAD_FIELD_ERROR
+call sp1();
+--ERROR ER_BAD_FIELD_ERROR
+call sp1();
+
+let $q2=
+with recursive
+ r as (select * from t1 union select s1.* from t1 as s1, r where s1.b = r.a)
+select * from t1 as t;
+
+--ERROR ER_BAD_FIELD_ERROR
+eval $q2;
+--ERROR ER_BAD_FIELD_ERROR
+eval explain $q2;
+
+eval create procedure sp2() $q2;
+--ERROR ER_BAD_FIELD_ERROR
+call sp2();
+--ERROR ER_BAD_FIELD_ERROR
+call sp2();
+
+drop procedure sp1;
+drop procedure sp2;
+
+drop table t1;
+
+--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_cte.cc b/sql/sql_cte.cc
index b720eac..22a9984 100644
--- a/sql/sql_cte.cc
+++ b/sql/sql_cte.cc
@@ -911,7 +911,8 @@ bool With_clause::prepare_unreferenced_elements(THD *thd)
with_elem;
with_elem= with_elem->next)
{
- if (!with_elem->is_referenced() && with_elem->prepare_unreferenced(thd))
+ if ((with_elem->is_hanging_recursive() || !with_elem->is_referenced()) &&
+ with_elem->prepare_unreferenced(thd))
return true;
}
diff --git a/sql/sql_cte.h b/sql/sql_cte.h
index 5f30894..d484dcf 100644
--- a/sql/sql_cte.h
+++ b/sql/sql_cte.h
@@ -242,6 +242,8 @@ class With_element : public Sql_alloc
bool is_referenced() { return referenced; }
+ bool is_hanging_recursive() { return is_recursive && !rec_outer_references; }
+
void inc_references() { references++; }
bool rename_columns_of_derived_unit(THD *thd, st_select_lex_unit *unit);
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 2bb01ee..ff584e9 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -25276,8 +25276,10 @@ int JOIN::save_explain_data_intern(Explain_query *output,
if (!(tmp_unit->item && tmp_unit->item->eliminated) && // (1)
(!tmp_unit->derived ||
tmp_unit->derived->is_materialized_derived()) && // (2)
- !(tmp_unit->with_element &&
- (!tmp_unit->derived || !tmp_unit->derived->derived_result))) // (3)
+ (!tmp_unit->with_element ||
+ (tmp_unit->derived &&
+ tmp_unit->derived->derived_result &&
+ !tmp_unit->with_element->is_hanging_recursive()))) // (3)
{
explain->add_child(tmp_unit->first_select()->select_number);
}
@@ -25342,8 +25344,10 @@ static void select_describe(JOIN *join, bool need_tmp_table, bool need_order,
*/
if (!(unit->item && unit->item->eliminated) && // (1)
!(unit->derived && unit->derived->merged_for_insert) && // (2)
- !(unit->with_element &&
- (!unit->derived || !unit->derived->derived_result))) // (3)
+ (!unit->with_element ||
+ (unit->derived &&
+ unit->derived->derived_result &&
+ !unit->with_element->is_hanging_recursive()))) // (3)
{
if (mysql_explain_union(thd, unit, result))
DBUG_VOID_RETURN;
[Commits] 6f1628b: MDEV-25565 Crash on 2-nd execution of SP/PS for query calculating window functions
by IgorBabaev 20 Jul '21
revision-id: 6f1628b917d365ecfc9c4c9951011613f4212592 (mariadb-10.2.31-1048-g6f1628b)
parent(s): fb0b28932ce82903f2fcfb690a71bff52355507f
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-09 18:56:34 -0700
message:
MDEV-25565 Crash on 2-nd execution of SP/PS for query calculating window functions from view
The server crashed when executing a stored procedure whose only query
calculated window functions over a mergeable view defined as a select
from a non-mergeable view. The crash could be reproduced when the window
specifications of the window functions were identical and both contained
PARTITION BY and ORDER BY lists. The crash also happened on the second
execution of a prepared statement created for such a query.
If derived tables or CTEs are used instead of views, the problem still
manifests itself and crashes the server.
When optimizing the window specifications of window functions, the server
can substitute the partition list and the order list of one window
specification with the corresponding lists of another specification when
the lists are identical. This substitution is not permanent and must be
rolled back before the second execution. It was not rolled back, which
ultimately led to a crash when resolving column names on the second
execution of the SP/PS.
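A reduced scenario taken from the test cases added below: two window functions share an identical window specification (the same PARTITION BY and ORDER BY lists) and are evaluated over the mergeable view v1, which selects from the non-mergeable view v2. Before this patch the second CALL (or the second EXECUTE of an equivalent prepared statement) crashed the server:

create table t1 (a int);
insert into t1 values (3), (7), (1), (7), (1), (1), (3), (1), (5);
create view v2 as select a from t1 group by a;  -- non-mergeable (GROUP BY)
create view v1 as select * from v2;             -- mergeable view over v2

create procedure sp1()
  select v1.a,
         sum(v1.a) over (partition by v1.a order by v1.a) as k,
         avg(v1.a) over (partition by v1.a order by v1.a) as m
  from v1;

call sp1();  -- first execution returns the expected rows
call sp1();  -- second execution crashed the server before this fix

drop procedure sp1;
drop view v1, v2;
drop table t1;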
---
mysql-test/r/win.result | 287 ++++++++++++++++++++++++++++++++++++++++++++++++
mysql-test/t/win.test | 147 +++++++++++++++++++++++++
sql/sql_union.cc | 26 +++++
sql/sql_window.cc | 12 ++
sql/sql_window.h | 5 +-
5 files changed, 476 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/win.result b/mysql-test/r/win.result
index 8a31dcc..bc017ea 100644
--- a/mysql-test/r/win.result
+++ b/mysql-test/r/win.result
@@ -3911,5 +3911,292 @@ sum(i) over () IN ( SELECT 1 FROM t1 a)
0
DROP TABLE t1;
#
+# MDEV-25565: 2-nd call of SP with SELECT from view / derived table / CTE
+# returning the result of calculation of 2 window
+# functions that use the same window specification
+#
+create table t1 (a int);
+insert into t1 values (3), (7), (1), (7), (1), (1), (3), (1), (5);
+create view v2 as select a from t1 group by a;
+create view v1 as select * from v2;
+create procedure sp1() select v1.a,
+sum(v1.a) over (partition by v1.a order by v1.a) as k,
+avg(v1.a) over (partition by v1.a order by v1.a) as m
+from v1;
+call sp1();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp1();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "select v1.a,
+sum(v1.a) over (partition by v1.a order by v1.a) as k,
+avg(v1.a) over (partition by v1.a order by v1.a) as m
+from v1";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp2() select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from (select * from v2) as dt1
+) as dt;
+call sp2();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp2();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from (select * from v2) as dt1
+) as dt";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp3() select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from ( select * from (select * from t1 group by a) as dt2 ) as dt1
+) as dt;
+call sp3();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp3();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from ( select * from (select * from t1 group by a) as dt2 ) as dt1
+) as dt";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp4() with cte1 as (select * from (select * from t1 group by a) as dt2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte;
+call sp4();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp4();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "with cte1 as (select * from (select * from t1 group by a) as dt2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp5() with cte1 as (select * from v2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte;
+call sp5();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp5();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "with cte1 as (select * from v2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp6() with
+cte1 as (with cte2 as (select * from t1 group by a) select * from cte2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte;
+call sp6();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp6();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "with
+cte1 as (with cte2 as (select * from t1 group by a) select * from cte2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp7() with
+cte2 as (select * from v1),
+cte1 as (select * from cte2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte;
+call sp7();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+call sp7();
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+prepare stmt from "with
+cte2 as (select * from v1),
+cte1 as (select * from cte2),
+cte as
+( select cte1.a,
+sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+from cte1 )
+select * from cte";
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 1 1.0000
+3 3 3.0000
+5 5 5.0000
+7 7 7.0000
+deallocate prepare stmt;
+drop procedure sp1;
+drop procedure sp2;
+drop procedure sp3;
+drop procedure sp4;
+drop procedure sp5;
+drop procedure sp6;
+drop procedure sp7;
+drop view v1,v2;
+drop table t1;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/win.test b/mysql-test/t/win.test
index c07a81f..72e789d 100644
--- a/mysql-test/t/win.test
+++ b/mysql-test/t/win.test
@@ -2557,5 +2557,152 @@ SELECT sum(i) over () IN ( SELECT 1 FROM t1 a) FROM t1;
DROP TABLE t1;
--echo #
+--echo # MDEV-25565: 2-nd call of SP with SELECT from view / derived table / CTE
+--echo # returning the result of calculation of 2 window
+--echo # functions that use the same window specification
+--echo #
+
+create table t1 (a int);
+insert into t1 values (3), (7), (1), (7), (1), (1), (3), (1), (5);
+
+create view v2 as select a from t1 group by a;
+create view v1 as select * from v2;
+
+let $q1=
+select v1.a,
+ sum(v1.a) over (partition by v1.a order by v1.a) as k,
+ avg(v1.a) over (partition by v1.a order by v1.a) as m
+from v1;
+
+eval create procedure sp1() $q1;
+call sp1();
+call sp1();
+
+eval prepare stmt from "$q1";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q2=
+select * from
+ ( select dt1.a,
+ sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+ avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+ from (select * from v2) as dt1
+ ) as dt;
+
+eval create procedure sp2() $q2;
+call sp2();
+call sp2();
+
+eval prepare stmt from "$q2";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q3=
+select * from
+ ( select dt1.a,
+ sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+ avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+ from ( select * from (select * from t1 group by a) as dt2 ) as dt1
+ ) as dt;
+
+eval create procedure sp3() $q3;
+call sp3();
+call sp3();
+
+eval prepare stmt from "$q3";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q4=
+with cte1 as (select * from (select * from t1 group by a) as dt2),
+ cte as
+ ( select cte1.a,
+ sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+ avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+ from cte1 )
+select * from cte;
+
+eval create procedure sp4() $q4;
+call sp4();
+call sp4();
+
+eval prepare stmt from "$q4";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q5=
+with cte1 as (select * from v2),
+ cte as
+ ( select cte1.a,
+ sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+ avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+ from cte1 )
+select * from cte;
+
+eval create procedure sp5() $q5;
+call sp5();
+call sp5();
+
+eval prepare stmt from "$q5";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q6=
+with
+cte1 as (with cte2 as (select * from t1 group by a) select * from cte2),
+ cte as
+ ( select cte1.a,
+ sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+ avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+ from cte1 )
+select * from cte;
+
+eval create procedure sp6() $q6;
+call sp6();
+call sp6();
+
+eval prepare stmt from "$q6";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q7=
+with
+ cte2 as (select * from v1),
+ cte1 as (select * from cte2),
+ cte as
+ ( select cte1.a,
+ sum(cte1.a) over (partition by cte1.a order by cte1.a) as k,
+ avg(cte1.a) over (partition by cte1.a order by cte1.a) as m
+ from cte1 )
+select * from cte;
+
+eval create procedure sp7() $q7;
+call sp7();
+call sp7();
+
+eval prepare stmt from "$q7";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+
+drop procedure sp1;
+drop procedure sp2;
+drop procedure sp3;
+drop procedure sp4;
+drop procedure sp5;
+drop procedure sp6;
+drop procedure sp7;
+drop view v1,v2;
+drop table t1;
+
+--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index 7baedfb..f3c90b8 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -30,6 +30,7 @@
#include "filesort.h" // filesort_free_buffers
#include "sql_view.h"
#include "sql_cte.h"
+#include "item_windowfunc.h"
bool mysql_union(THD *thd, LEX *lex, select_result *result,
SELECT_LEX_UNIT *unit, ulong setup_tables_done_option)
@@ -1550,6 +1551,29 @@ static void cleanup_order(ORDER *order)
}
+static void cleanup_window_funcs(List<Item_window_func> &win_funcs)
+{
+ List_iterator_fast<Item_window_func> it(win_funcs);
+ Item_window_func *win_func;
+ while ((win_func= it++))
+ {
+ Window_spec *win_spec= win_func->window_spec;
+ if (!win_spec)
+ continue;
+ if (win_spec->save_partition_list)
+ {
+ win_spec->partition_list= win_spec->save_partition_list;
+ win_spec->save_partition_list= NULL;
+ }
+ if (win_spec->save_order_list)
+ {
+ win_spec->order_list= win_spec->save_order_list;
+ win_spec->save_order_list= NULL;
+ }
+ }
+}
+
+
bool st_select_lex::cleanup()
{
bool error= FALSE;
@@ -1558,6 +1582,8 @@ bool st_select_lex::cleanup()
cleanup_order(order_list.first);
cleanup_order(group_list.first);
+ cleanup_window_funcs(window_funcs);
+
if (join)
{
List_iterator<TABLE_LIST> ti(leaf_tables);
diff --git a/sql/sql_window.cc b/sql/sql_window.cc
index 612c6e6..3ef751b 100644
--- a/sql/sql_window.cc
+++ b/sql/sql_window.cc
@@ -479,9 +479,15 @@ int compare_window_funcs_by_window_specs(Item_window_func *win_func1,
Let's use only one of the lists.
*/
if (!win_spec1->name() && win_spec2->name())
+ {
+ win_spec1->save_partition_list= win_spec1->partition_list;
win_spec1->partition_list= win_spec2->partition_list;
+ }
else
+ {
+ win_spec2->save_partition_list= win_spec2->partition_list;
win_spec2->partition_list= win_spec1->partition_list;
+ }
cmp= compare_order_lists(win_spec1->order_list,
win_spec2->order_list);
@@ -494,9 +500,15 @@ int compare_window_funcs_by_window_specs(Item_window_func *win_func1,
Let's use only one of the lists.
*/
if (!win_spec1->name() && win_spec2->name())
+ {
+ win_spec1->save_order_list= win_spec2->order_list;
win_spec1->order_list= win_spec2->order_list;
+ }
else
+ {
+ win_spec1->save_order_list= win_spec2->order_list;
win_spec2->order_list= win_spec1->order_list;
+ }
cmp= compare_window_frames(win_spec1->window_frame,
win_spec2->window_frame);
diff --git a/sql/sql_window.h b/sql/sql_window.h
index e0c1563..417d0bc 100644
--- a/sql/sql_window.h
+++ b/sql/sql_window.h
@@ -99,8 +99,10 @@ class Window_spec : public Sql_alloc
LEX_STRING *window_ref;
SQL_I_List<ORDER> *partition_list;
+ SQL_I_List<ORDER> *save_partition_list;
SQL_I_List<ORDER> *order_list;
+ SQL_I_List<ORDER> *save_order_list;
Window_frame *window_frame;
@@ -111,7 +113,8 @@ class Window_spec : public Sql_alloc
SQL_I_List<ORDER> *ord_list,
Window_frame *win_frame)
: window_names_are_checked(false), window_ref(win_ref),
- partition_list(part_list), order_list(ord_list),
+ partition_list(part_list), save_partition_list(NULL),
+ order_list(ord_list), save_order_list(NULL),
window_frame(win_frame), referenced_win_spec(NULL) {}
virtual char *name() { return NULL; }
[Commits] 0eaeb28: MDEV-26025 Server crashes while executing query with CTE in PS/SP
by IgorBabaev 20 Jul '21
revision-id: 0eaeb28c4d1914b6fa648214c18d111fe86ef773 (mariadb-10.2.31-1062-g0eaeb28)
parent(s): f053349797a1dca5206a3b8d5ff33353f45430d8
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-20 00:07:31 -0700
message:
MDEV-26025 Server crashes while executing query with CTE in PS/SP
This bug appeared after the patch for bug MDEV-23886. Due to this bug,
executing a query that used the same CTE at least twice via a prepared
statement or a stored procedure caused a server crash.
This happened because the select created for any but the first usage of
a CTE was erroneously not included in all_selects_list.
This patch corrects the patch applied to fix bug MDEV-26108.
Approved by Oleksandr Byelkin <sanja(a)mariadb.com>
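A reduced reproduction from the test case added below: cte2 is used twice, directly in the FROM clause and indirectly through cte3, so a clone of its specification is created for the non-first usage. Before this patch the select of that clone was missing from all_selects_list, and executing the query as a prepared statement (or from a stored procedure) crashed the server:

create table t1 (a int, b int);
insert into t1 values (1,3), (3,2), (1,3), (4,1);

prepare stmt from "with
  cte1 as ( select a,b from t1 where a = 1 AND b = 3 ),
  cte2 as ( select a,b from cte1 ),
  cte3 as ( select a,b from cte2 )
select * from cte3, cte2";

execute stmt;  -- crashed before this fix
execute stmt;
deallocate prepare stmt;

drop table t1;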
---
mysql-test/r/cte_nonrecursive.result | 42 ++++++++++++++++++++++++++++++++++++
mysql-test/t/cte_nonrecursive.test | 27 +++++++++++++++++++++++
sql/sql_cte.cc | 15 ++++++++-----
3 files changed, 79 insertions(+), 5 deletions(-)
diff --git a/mysql-test/r/cte_nonrecursive.result b/mysql-test/r/cte_nonrecursive.result
index 5cc5a25..2504e55 100644
--- a/mysql-test/r/cte_nonrecursive.result
+++ b/mysql-test/r/cte_nonrecursive.result
@@ -2044,4 +2044,46 @@ select a from t1 union select a+1 as a from cte_r r where a < 10
) select * from cte_e;
ERROR 42S02: Table 'test.cte_r' doesn't exist
drop table t1;
+#
+# MDEV-26025: query with two usage of a CTE executing via PS /SP
+#
+create table t1 (a int, b int);
+insert into t1 value (1,3), (3,2), (1,3), (4,1);
+prepare stmt from "with
+cte1 as ( select a,b from t1 where a = 1 AND b = 3 ),
+cte2 as ( select a,b from cte1 ),
+cte3 as ( select a,b from cte2 )
+select * from cte3, cte2";
+execute stmt;
+a b a b
+1 3 1 3
+1 3 1 3
+1 3 1 3
+1 3 1 3
+execute stmt;
+a b a b
+1 3 1 3
+1 3 1 3
+1 3 1 3
+1 3 1 3
+deallocate prepare stmt;
+create procedure sp() with
+cte1 as ( select a,b from t1 where a = 1 AND b = 3 ),
+cte2 as ( select a,b from cte1 ),
+cte3 as ( select a,b from cte2 )
+select * from cte3, cte2;
+call sp();
+a b a b
+1 3 1 3
+1 3 1 3
+1 3 1 3
+1 3 1 3
+call sp();
+a b a b
+1 3 1 3
+1 3 1 3
+1 3 1 3
+1 3 1 3
+drop procedure sp;
+drop table t1;
# End of 10.2 tests
diff --git a/mysql-test/t/cte_nonrecursive.test b/mysql-test/t/cte_nonrecursive.test
index 68dbc0c..c20a0dc 100644
--- a/mysql-test/t/cte_nonrecursive.test
+++ b/mysql-test/t/cte_nonrecursive.test
@@ -1515,4 +1515,31 @@ with cte_e as (
drop table t1;
+--echo #
+--echo # MDEV-26025: query with two usage of a CTE executing via PS /SP
+--echo #
+
+create table t1 (a int, b int);
+insert into t1 value (1,3), (3,2), (1,3), (4,1);
+
+let $q=
+with
+ cte1 as ( select a,b from t1 where a = 1 AND b = 3 ),
+ cte2 as ( select a,b from cte1 ),
+ cte3 as ( select a,b from cte2 )
+select * from cte3, cte2;
+
+eval prepare stmt from "$q";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+eval create procedure sp() $q;
+
+call sp();
+call sp();
+
+drop procedure sp;
+drop table t1;
+
--echo # End of 10.2 tests
diff --git a/sql/sql_cte.cc b/sql/sql_cte.cc
index 702db8f..b720eac 100644
--- a/sql/sql_cte.cc
+++ b/sql/sql_cte.cc
@@ -1012,6 +1012,7 @@ st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
bool parse_status= false;
st_select_lex *with_select;
+ st_select_lex *last_clone_select;
char save_end= unparsed_spec.str[unparsed_spec.length];
unparsed_spec.str[unparsed_spec.length]= '\0';
@@ -1099,11 +1100,6 @@ st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
lex->unit.include_down(with_table->select_lex);
lex->unit.set_slave(with_select);
lex->unit.cloned_from= spec;
- old_lex->all_selects_list=
- (st_select_lex*) (lex->all_selects_list->
- insert_chain_before(
- (st_select_lex_node **) &(old_lex->all_selects_list),
- with_select));
/*
Now all references to the CTE defined outside of the cloned specification
@@ -1119,6 +1115,15 @@ st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
goto err;
}
+ last_clone_select= lex->all_selects_list;
+ while (last_clone_select->next_select_in_list())
+ last_clone_select= last_clone_select->next_select_in_list();
+ old_lex->all_selects_list=
+ (st_select_lex*) (lex->all_selects_list->
+ insert_chain_before(
+ (st_select_lex_node **) &(old_lex->all_selects_list),
+ last_clone_select));
+
lex->sphead= NULL; // in order not to delete lex->sphead
lex_end(lex);
err:
[Commits] 3ded690f323: Range Locking: get tests to pass: select_count_for_update, skip_locked_nowait.
by psergey 19 Jul '21
revision-id: 3ded690f3236c48665971a628dc419f523ffe0b9 (percona-202103-66-g3ded690f323)
parent(s): af2c310e95343e4455c1cc3dcf893651d27e6c06
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-07-19 13:55:18 +0300
message:
Range Locking: get tests to pass: select_count_for_update, skip_locked_nowait.
---
mysql-test/suite/rocksdb/r/select_count_for_update.result | 4 ++--
mysql-test/suite/rocksdb/t/select_count_for_update.test | 14 ++++++++++++++
mysql-test/suite/rocksdb/t/skip_locked_nowait.test | 3 +++
3 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/select_count_for_update.result b/mysql-test/suite/rocksdb/r/select_count_for_update.result
index 1107aa2f6cb..6672d43eb43 100644
--- a/mysql-test/suite/rocksdb/r/select_count_for_update.result
+++ b/mysql-test/suite/rocksdb/r/select_count_for_update.result
@@ -35,9 +35,9 @@ SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
COUNT(*)
3
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE;
-ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE;
-ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX
connection con1;
COMMIT;
SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
diff --git a/mysql-test/suite/rocksdb/t/select_count_for_update.test b/mysql-test/suite/rocksdb/t/select_count_for_update.test
index 2c6f5d474a1..aa7059dfc7e 100644
--- a/mysql-test/suite/rocksdb/t/select_count_for_update.test
+++ b/mysql-test/suite/rocksdb/t/select_count_for_update.test
@@ -52,9 +52,23 @@ SET lock_wait_timeout = 1;
SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
# ... but not with LOCK IN SHARE MODE / FOR UPDATE
+let $uses_range_locking=`select @@rocksdb_use_range_locking`;
+
+if ($uses_range_locking == "0") {
+--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/
+}
+if ($uses_range_locking == "1") {
+--replace_regex /test.t1.sk/$FAILING_INDEX/
+}
--error ER_LOCK_WAIT_TIMEOUT
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE;
+if ($uses_range_locking == "0") {
+--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/
+}
+if ($uses_range_locking == "1") {
+--replace_regex /test.t1.sk/$FAILING_INDEX/
+}
--error ER_LOCK_WAIT_TIMEOUT
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE;
diff --git a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
index bfa36714816..3b8bcb033c0 100644
--- a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
+++ b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
@@ -2,5 +2,8 @@
# wl#8919 Implement NOWAIT and SKIP LOCKED
#
+# Range locking cannot support SKIP LOCKED? (TODO: but can support NOWAIT)
+--source suite/rocksdb/include/not_range_locking.inc
+
--let $engine=ROCKSDB
--source include/skip_locked_nowait.inc
[Commits] af2c310e953: This adds a global my.cnf parameter, rocksdb_use_range_locking.
by psergey 19 Jul '21
revision-id: af2c310e95343e4455c1cc3dcf893651d27e6c06 (percona-202103-65-gaf2c310e953)
parent(s): cd90913f526a8166a9230442375bdaa862f133f3
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-07-19 12:45:16 +0300
message:
This adds a global my.cnf parameter, rocksdb_use_range_locking.
(Cherry-picked on top of the pushed iterator patches)
When it is ON, MyRocks will (see the sketch after this list):
- initialize RocksDB to use the range-locking lock manager;
- for all DML operations (including SELECT .. FOR UPDATE), lock the
  scanned range before reading/modifying rows;
- skip snapshot checking (it cannot be done for ranges); instead, MyRocks
  reads and modifies the latest committed data, just like InnoDB does
  (in the code, grep for (start|end)_ignore_snapshot);
- for queries that do not have a finite range to scan, like
    UPDATE t1 .... ORDER BY t1.key LIMIT n
  use a "Locking iterator" which reads rows, locks the range, and
  re-reads the rows. See class LockingIterator.
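To illustrate the behaviour described above, a short sketch drawn from the range_locking test results added below; it assumes the server was started with rocksdb_use_range_locking=1 in my.cnf:

show variables like 'rocksdb_use_range_locking';
-- Variable_name               Value
-- rocksdb_use_range_locking   ON

-- connection con1: a locking read places a range lock on pk between 5 and 25
begin;
select * from t1 where pk between 5 and 25 for update;

-- connection con2: a point write inside the locked range is rejected with
-- "Lock wait timeout exceeded ... Timeout on index: test.t1.PRIMARY"
insert into t1 values (15,15);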
---
mysql-test/suite/rocksdb/combinations | 3 +
.../suite/rocksdb/include/have_range_locking.inc | 3 +
.../suite/rocksdb/include/not_range_locking.inc | 5 +
.../rocksdb/include/select_from_is_rowlocks.inc | 66 +++
.../suite/rocksdb/r/hermitage-range_locking.result | 652 +++++++++++++++++++++
...issue243_transactionStatus-range_locking.result | 182 ++++++
.../r/level_repeatable_read-range_locking.result | 106 ++++
mysql-test/suite/rocksdb/r/range_locking.result | 522 +++++++++++++++++
.../r/range_locking_deadlock_tracking.result | 453 ++++++++++++++
.../rocksdb/r/range_locking_escalation.result | 27 +
.../rocksdb/r/range_locking_refresh_iter.result | 50 ++
.../suite/rocksdb/r/range_locking_rev_cf.result | 482 +++++++++++++++
.../rocksdb/r/range_locking_seek_for_update.result | 279 +++++++++
.../rocksdb/r/range_locking_shared_locks.result | 251 ++++++++
mysql-test/suite/rocksdb/r/rocksdb.result | 3 +
.../suite/rocksdb/r/rocksdb_read_free_rpl.result | 2 +-
.../rocksdb/r/rocksdb_timeout_rollback.result | 3 +
mysql-test/suite/rocksdb/r/unique_sec.result | 4 +
.../suite/rocksdb/r/unique_sec_rev_cf.result | 4 +
mysql-test/suite/rocksdb/t/deadlock_tracking.test | 7 +-
.../t/drop_cf_before_show_deadlock_info.test | 4 +
.../suite/rocksdb/t/hermitage-range_locking.test | 15 +
mysql-test/suite/rocksdb/t/hermitage.inc | 14 +-
mysql-test/suite/rocksdb/t/hermitage.test | 3 +
mysql-test/suite/rocksdb/t/i_s_deadlock.test | 4 +
mysql-test/suite/rocksdb/t/issue111.test | 4 +
.../issue243_transactionStatus-range_locking.test | 10 +
.../rocksdb/t/issue243_transactionStatus.test | 4 +
.../t/level_repeatable_read-range_locking.test | 9 +
.../suite/rocksdb/t/level_repeatable_read.test | 3 +
mysql-test/suite/rocksdb/t/lock_info.test | 3 +
mysql-test/suite/rocksdb/t/locking_issues.test | 3 +
mysql-test/suite/rocksdb/t/max_row_locks.test | 1 +
mysql-test/suite/rocksdb/t/range_locking.inc | 544 +++++++++++++++++
mysql-test/suite/rocksdb/t/range_locking.test | 6 +
.../rocksdb/t/range_locking_deadlock_tracking.test | 196 +++++++
.../rocksdb/t/range_locking_escalation-master.opt | 1 +
.../suite/rocksdb/t/range_locking_escalation.test | 39 ++
.../rocksdb/t/range_locking_refresh_iter.test | 70 +++
.../suite/rocksdb/t/range_locking_rev_cf.test | 12 +
.../rocksdb/t/range_locking_seek_for_update.test | 288 +++++++++
.../rocksdb/t/range_locking_shared_locks.test | 202 +++++++
mysql-test/suite/rocksdb/t/rocksdb.test | 3 +
.../suite/rocksdb/t/rocksdb_concurrent_delete.test | 4 +
mysql-test/suite/rocksdb/t/rocksdb_locks.test | 3 +
.../suite/rocksdb/t/rocksdb_read_free_rpl.test | 2 +-
.../suite/rocksdb/t/rocksdb_timeout_rollback.test | 2 +
mysql-test/suite/rocksdb/t/rpl_row_not_found.inc | 2 +
.../suite/rocksdb/t/select_lock_in_share_mode.test | 3 +
mysql-test/suite/rocksdb/t/unique_check.test | 5 +
mysql-test/suite/rocksdb/t/unique_sec.inc | 10 +-
mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test | 1 +
mysql-test/suite/rocksdb/t/varbinary_format.test | 4 +
mysql-test/suite/rocksdb/t/varchar_format.test | 2 +
.../r/rocksdb_max_lock_memory_basic.result | 7 +
.../r/rocksdb_use_range_locking_basic.result | 7 +
.../t/rocksdb_max_lock_memory_basic.test | 5 +
.../t/rocksdb_use_range_locking_basic.test | 5 +
storage/rocksdb/CMakeLists.txt | 1 +
storage/rocksdb/get_rocksdb_files.sh | 2 +-
storage/rocksdb/ha_rocksdb.cc | 637 ++++++++++++++++++--
storage/rocksdb/ha_rocksdb.h | 18 +-
storage/rocksdb/nosql_access.cc | 6 +-
storage/rocksdb/rdb_i_s.cc | 82 ++-
storage/rocksdb/rdb_iterator.cc | 33 +-
storage/rocksdb/rdb_iterator.h | 9 +
storage/rocksdb/rdb_locking_iter.cc | 108 ++++
storage/rocksdb/rdb_locking_iter.h | 190 ++++++
storage/rocksdb/rdb_utils.cc | 27 +
storage/rocksdb/rdb_utils.h | 3 +
70 files changed, 5635 insertions(+), 85 deletions(-)
diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations
index acf2f49a0c3..5e3b56932c6 100644
--- a/mysql-test/suite/rocksdb/combinations
+++ b/mysql-test/suite/rocksdb/combinations
@@ -7,3 +7,6 @@ rocksdb_write_policy=write_prepared
[write_unprepared]
rocksdb_write_policy=write_unprepared
rocksdb_write_batch_flush_threshold=1
+
+[range_locking]
+rocksdb_use_range_locking=1
diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc
new file mode 100644
index 00000000000..a8600daea77
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc
@@ -0,0 +1,3 @@
+if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) {
+ --skip Test requires range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc
new file mode 100644
index 00000000000..62c26b134bc
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc
@@ -0,0 +1,5 @@
+--let $_use_range_locking= `select @@rocksdb_use_range_locking`
+if ($_use_range_locking == 1)
+{
+ --skip Test doesn't support range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
new file mode 100644
index 00000000000..e5f54d68914
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
@@ -0,0 +1,66 @@
+--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+#
+# An include to print contents of I_S.ROCKSB_LOCKS
+#
+# Implicit "parameters"
+# - Currently it prints locks on t1.PRIMARY
+#
+# Explicit "parameter" variables:
+# - $TRX1_ID - print this transaction as "TRX1"
+# - $TRX2_ID - print this transaction as "TRX2"
+#
+# - $select_from_is_rowlocks_current_trx_only
+# - $order_by_rowkey
+
+--disable_query_log
+set @cf_id=(select column_family from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx
+ where thread_id=connection_id());
+set @indexnr= (select lower(lpad(hex(index_number),8,'0')) from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+set @indexnr_next= (select lower(lpad(hex(index_number+1),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+let $extra_where = where 1;
+
+if ($select_from_is_rowlocks_current_trx_only)
+{
+ let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id);
+}
+
+# If TRX1_ID is not specified, get the current transaction:
+let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id");
+if ($TRX1_ID)
+{
+ let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID");
+}
+
+if ($TRX2_ID)
+{
+ let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID");
+}
+
+if ($order_by_rowkey)
+{
+ let $extra_order_by = ORDER BY 3,2;
+}
+
+if (!$order_by_rowkey)
+{
+ --sorted_result
+}
+
+eval select
+ replace(column_family_id, @cf_id, "\$cf_id") as COLUMN_FAMILY_ID,
+ $transaction_col as TRANSACTION_ID,
+ replace(
+ replace(`key`, @indexnr, '\${indexnr}'),
+ @indexnr_next, '\${indexnr+1}'
+ ) as `KEY`,
+ mode
+from information_schema.rocksdb_locks $extra_where $extra_order_by;
+
+--enable_query_log
diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
new file mode 100644
index 00000000000..3938fa38b6c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
@@ -0,0 +1,652 @@
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 11
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 12
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+1 12
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
new file mode 100644
index 00000000000..b48535c5ee6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
@@ -0,0 +1,182 @@
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (
+id INT,
+val1 INT,
+val2 INT,
+PRIMARY KEY (id)
+) ENGINE=rocksdb;
+INSERT INTO t1 VALUES(1,1,1),(2,1,2);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 1 2
+UPDATE t1 SET val1=2 WHERE id=2;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t1 VALUES(20,1,1),(30,30,30);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 1 1
+30 30 30
+UPDATE t1 SET val1=20, val2=20 WHERE id=20;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 20 20
+30 30 30
+DELETE FROM t1 WHERE id=30;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+---SNAPSHOT, ACTIVE NUM sec
+MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION
+SHOW ENGINE rocksdb TRANSACTION STATUS
+lock count 4, write count 4
+insert count 2, update count 1, delete count 1
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+START TRANSACTION;
+INSERT INTO t1 VALUES(40,40,40);
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+COMMIT;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=1;
+DROP TABLE t1;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+UNIQUE KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
new file mode 100644
index 00000000000..0592b099238
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
@@ -0,0 +1,106 @@
+DROP TABLE IF EXISTS t1;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb;
+START TRANSACTION;
+SELECT a FROM t1;
+a
+connection con2;
+BEGIN;
+INSERT INTO t1 (a) VALUES(1);
+connection con1;
+SELECT a FROM t1;
+a
+connection con2;
+INSERT INTO t1 (a) VALUES (2);
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+100 FROM t1;
+SELECT a FROM t1;
+a
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+200 FROM t1;
+SELECT a FROM t1;
+a
+201
+202
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection default;
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb;
+INSERT INTO t2 (a) VALUES (1);
+COMMIT;
+connection con1;
+BEGIN;
+SELECT a from t2;
+a
+1
+INSERT INTO t2 (a) VALUES (1), (3);
+ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY'
+connection con2;
+INSERT INTO t2 (a) VALUES (2);
+COMMIT;
+connection con1;
+SELECT a from t2;
+a
+1
+COMMIT;
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t1;
+DROP TABLE t2;
+CREATE TABLE t3 (
+pk int unsigned PRIMARY KEY,
+count int unsigned DEFAULT '0'
+) ENGINE=ROCKSDB;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+BEGIN;
+SELECT * FROM t3;
+pk count
+connection con2;
+BEGIN;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+connection con1;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+SELECT count FROM t3;
+count
+1
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t3;
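A minimal sketch of what the expectations above verify (assuming two sessions at
REPEATABLE READ, as in the test): a transaction keeps reading from its own snapshot
until it commits, even after another session has committed new rows.

  -- session 1
  SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
  BEGIN;
  SELECT a FROM t1;                -- takes a snapshot; nothing visible yet

  -- session 2
  BEGIN;
  INSERT INTO t1 (a) VALUES (1);
  COMMIT;

  -- session 1
  SELECT a FROM t1;                -- still empty: read from the snapshot
  COMMIT;
  SELECT a FROM t1;                -- the committed row is now visible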
diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result
new file mode 100644
index 00000000000..603c0b99f09
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking.result
@@ -0,0 +1,522 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'default',
+unique key(a) comment 'default'
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X
+rollback;
+begin;
+# Before MDEV-21314 the following would show a range lock on 2-9 and also
+# a point lock on 10; now only the range lock is taken.
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2
+3 2223
+4 2223
+5 2223
+6 6
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Another no-snapshot-checking test, this time for single-statement
+# transaction
+#
+create table t1 (
+pk int,
+a int,
+name varchar(16),
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+connect con1,localhost,root,,;
+connection con1;
+select get_lock('row1', 100);
+get_lock('row1', 100)
+1
+connection default;
+# The following will read the first row (1,1,'row1'), and stop.
+update t1 set a=a+100 where get_lock(name, 1000)=1;
+connection con1;
+update t1 set a=5 where pk=2;
+select release_lock('row1');
+release_lock('row1')
+1
+connection default;
+# Look at the row with pk=2:
+# 2, 105, row2 - means the UPDATE was reading current data (correct)
+# 2, 102, row2 - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+pk a name
+1 101 row1
+2 105 row2
+# Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+release_lock(name)
+1
+1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, wait until we see con1 reach state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'default'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
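A minimal sketch of the core pattern these expectations exercise (assuming the server
runs with rocksdb_use_range_locking=ON, as shown at the top of the file): a locking
range read blocks writes into that range, and the acquired locks can be inspected.

  -- session 1
  begin;
  select * from t1 where pk between 5 and 25 for update;  -- takes a range lock on [5, 25]

  -- session 2
  insert into t1 values (15,15);  -- expected to time out on test.t1.PRIMARY

  -- session 1
  select * from information_schema.rocksdb_locks;         -- shows the range lock endpoints
  rollback;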
diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
new file mode 100644
index 00000000000..00fd1788dfd
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
@@ -0,0 +1,453 @@
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+# Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+create table t (i int primary key) engine=rocksdb;
+insert into t values (1), (2), (3);
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #1
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #2
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 10;
+Deadlock #3
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 1;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set rocksdb_deadlock_detect_depth = 2;
+# Range locking code will report deadlocks, because it doesn't honor
+# rocksdb_deadlock_detect_depth:
+Deadlock #4
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+begin;
+select * from t where i=3 for update;
+i
+3
+select * from t where i=2 for update;
+select * from t where i=3 for update;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+deadlocks
+true
+rollback;
+i
+3
+rollback;
+i
+2
+rollback;
+set global rocksdb_max_latest_deadlocks = 5;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #6
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+begin;
+update t1 set value=value+200 where id=3;
+update t1 set value=value+100 where id=3;
+update t1 set value=value+200 where id=1;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select * from t1;
+id value
+1 101
+2 102
+3 103
+4 4
+5 5
+drop table t1;
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 0;
+# Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
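A short sketch of the deadlock-tracking knobs exercised above (all statements are taken
from the test itself):

  set global rocksdb_deadlock_detect = on;
  set global rocksdb_max_latest_deadlocks = 10;  -- number of recent deadlock paths to keep
  -- ... two transactions then deadlock on rows i=1 and i=2 ...
  show engine rocksdb transaction status;        -- LATEST DETECTED DEADLOCKS lists the paths
  set global rocksdb_max_latest_deadlocks = 0;   -- clears the deadlock buffer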
diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
new file mode 100644
index 00000000000..dd19d728ef2
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
@@ -0,0 +1,27 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+show variables like 'rocksdb_max_lock_memory';
+Variable_name Value
+rocksdb_max_lock_memory 1024
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 0
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select
+A.a + B.a*10 + C.a*100 + D.a*1000,
+12345
+from t0 A, t0 B, t0 C, t0 D;
+select count(*) from t1;
+count(*)
+10000
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 127
+drop table t0,t1;
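The escalation counter above can be reproduced roughly as follows (assumption: the test
is started with a very small rocksdb_max_lock_memory, here 1024 bytes, so a bulk insert
forces the lock tree to escalate point locks into ranges):

  show status like 'rocksdb_locktree_escalation_count';  -- 0 before the load
  insert into t1
  select A.a + B.a*10 + C.a*100 + D.a*1000, 12345
  from t0 A, t0 B, t0 C, t0 D;                           -- 10000 point locks under a 1KB budget
  show status like 'rocksdb_locktree_escalation_count';  -- non-zero once escalation happened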
diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
new file mode 100644
index 00000000000..1067087e816
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
@@ -0,0 +1,50 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+set debug_sync='RESET';
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+set debug_sync='now WAIT_FOR con1_stopped';
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+set debug_sync='now SIGNAL con1_cont';
+select * from t1 where pk<100;
+pk a
+0 100
+1 101
+2 102
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+40 5100
+41 5100
+43 5100
+44 5100
+45 5100
+46 146
+47 147
+48 148
+49 149
+commit;
+set debug_sync='RESET';
+drop table t1, ten, one_k;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
new file mode 100644
index 00000000000..5e1c2cf98a5
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
@@ -0,0 +1,482 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1',
+unique key(a) comment ''
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X
+rollback;
+begin;
+# Before MDEV-21314 the following would show a range lock on 2-9 and also
+# a point lock on 10; now only the range lock is taken.
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+6 6
+5 2223
+4 2223
+3 2223
+2 2
+1 1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, wait until we see con1 reach state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
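These expectations mirror range_locking.result; the main visible differences are that
full scans return rows in reverse key order and that lock ranges are reported with their
endpoints in storage key order, so for the reverse-ordered column family 'rev:cf1' the
endpoints appear swapped. For example, the same DELETE yields:

  default CF:  $cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X
  rev:cf1:     $cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X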
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
new file mode 100644
index 00000000000..514916eaa22
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
@@ -0,0 +1,279 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+# Make another connection to get the lock tree out of STO mode
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+# Now, we will just see locks on 10=0xA and 11=0xB:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+#
+# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+#
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3
+select * from t1 where pk>=500 order by pk limit 3 for update;
+pk a
+500 500
+501 501
+502 502
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X
+rollback;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+explain
+select * from t1 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3
+select * from t1 order by pk limit 3 for update;
+pk a
+0 0
+1 1
+2 2
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}8000000b X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# Concurrent tests: let one thread do SeekForUpdate and the other
+# interfere by committing modifications
+#
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+select * from t1 where pk<10;
+pk a
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+pk a
+# Test what happens when another transaction commits a row
+# right before the range we are about to lock (nothing should happen)
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 3 for update;
+connect con1,localhost,root,,;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+11 11
+12 12
+rollback;
+delete from t1 where pk=3;
+#
+# Now, repeat the test but let the other transaction insert the row into
+# the range we are locking
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X
+rollback;
+delete from t1 where pk=8;
+#
+# Repeat the third time, this time deleting the row that SeekForUpdate saw
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+rollback;
+#
+# Repeat the above test, but let the read fail with an ER_LOCK_WAIT_TIMEOUT
+# error. MyRocks code should now be prepared for data reads causing this
+# error
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+#
+# Backward scan test
+#
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+insert into t1 values
+(1001, 1001),
+(1005, 1005),
+(1007, 1007),
+(1010, 1010);
+begin;
+select * from t1 order by pk desc limit 2 for update;
+pk a
+1010 1010
+1007 1007
+# The below will lock from pk=1007 (0x3ef) till the end of the table:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X
+rollback;
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+pk a
+1005 1005
+1001 1001
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X
+connection con1;
+rollback;
+connection default;
+rollback;
+#
+# Backward scan test 2: error condition
+#
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+pk a
+1010 1010
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+pk a
+1007 1007
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# A test: full table scan doesn't lock gaps
+#
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 values (10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+connection con1;
+begin;
+select * from t1 for update;
+pk a
+10 10
+20 20
+30 30
+connection con2;
+insert into t1 values (5,5);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
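A minimal sketch of the SeekForUpdate behaviour these expectations check: a forward
locking read with LIMIT locks only the range it actually scanned, as the lock dumps
above show.

  begin;
  select * from t1 where pk>=500 order by pk limit 3 for update;
  -- expected: besides locks already held, a range covering only [500, 502]:
  --   $cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X
  rollback;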
diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
new file mode 100644
index 00000000000..580108de6f6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
@@ -0,0 +1,251 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from t0;
+# A basic test for shared locks
+begin;
+select * from t1 where pk=3 for update;
+pk a
+3 3
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX2_ID ${indexnr}80000005 S
+rollback;
+# Now, TRX2_ID should be gone:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+connection default;
+# Get a read lock on pk=3 (where we have a write lock).
+# The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+pk a
+3 3
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+# Get a write lock on pk=5 (where we have a read lock).
+# The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+pk a
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 X
+connection default;
+rollback;
+#
+# Test if a read lock inhibits write locks
+#
+begin;
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+select * from t1 where pk=8 for update;
+pk a
+8 8
+connection con1;
+begin;
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+select * from t1 where pk between 0 and 4 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+delete from t1 where pk=2;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+# Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+# But this should still prevent us from acquiring a write lock on that value:
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection default;
+rollback;
+drop table t1;
+create table t1 (
+pk int not null primary key,
+a int not null,
+key(a)
+) engine=rocksdb;
+insert into t1
+select
+A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+pk a
+900 900
+connection default;
+begin;
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` between 2 and 5)
+select * from t1 where a between 2 and 5 lock in share mode;
+pk a
+2 2
+3 3
+4 4
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X
+$cf_id $TRX1_ID ${indexnr}80000002 S
+$cf_id $TRX1_ID ${indexnr}80000003 S
+$cf_id $TRX1_ID ${indexnr}80000004 S
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX1_ID ${indexnr}80000006 S
+$cf_id $TRX2_ID ${indexnr}80000384 X
+rollback;
+disconnect con1;
+drop table t0,t1;
+#
+# Test shared point locks and lock escalation
+#
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 0
+connect con1,localhost,root,,;
+connection con1;
+begin;
+# CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+select * from t1 where pk=2500 lock in share mode;
+pk a
+connection default;
+begin;
+# DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+# DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+pk a
+# DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+connection con1;
+# CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000 X
+$cf_id $TRX2_ID ${indexnr}80000001 X
+$cf_id $TRX2_ID ${indexnr}80000002 X
+$cf_id $TRX2_ID ${indexnr}80000003 X
+$cf_id $TRX2_ID ${indexnr}80000004 X
+$cf_id $TRX2_ID ${indexnr}80000005 X
+$cf_id $TRX2_ID ${indexnr}80000006 X
+$cf_id $TRX2_ID ${indexnr}80000007 X
+$cf_id $TRX2_ID ${indexnr}80000008 X
+$cf_id $TRX2_ID ${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0 X
+$cf_id $TRX1_ID ${indexnr}800007d1 X
+$cf_id $TRX1_ID ${indexnr}800007d2 X
+$cf_id $TRX1_ID ${indexnr}800007d3 X
+$cf_id $TRX1_ID ${indexnr}800007d4 X
+$cf_id $TRX1_ID ${indexnr}800007d5 X
+$cf_id $TRX1_ID ${indexnr}800007d6 X
+$cf_id $TRX1_ID ${indexnr}800007d7 X
+$cf_id $TRX1_ID ${indexnr}800007d8 X
+$cf_id $TRX1_ID ${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 8792
+set @save_mlm= @@rocksdb_max_lock_memory;
+# Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+variable_value
+from
+performance_schema.global_status
+where
+variable_name='rocksdb_locktree_current_lock_memory');
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+$cf_id $TRX1_ID ${indexnr}80000bb8 X
+$cf_id $TRX1_ID ${indexnr}80000bb9 X
+$cf_id $TRX1_ID ${indexnr}80000bba X
+$cf_id $TRX1_ID ${indexnr}80000bbb X
+$cf_id $TRX1_ID ${indexnr}80000bbc X
+$cf_id $TRX1_ID ${indexnr}80000bbd X
+$cf_id $TRX1_ID ${indexnr}80000bbe X
+$cf_id $TRX1_ID ${indexnr}80000bbf X
+$cf_id $TRX1_ID ${indexnr}80000bc0 X
+$cf_id $TRX1_ID ${indexnr}80000bc1 X
+connection con1;
+rollback;
+connection default;
+rollback;
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+drop table t0, t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index c343f66c97c..b1e6878e2d9 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -985,6 +985,7 @@ rocksdb_max_background_jobs 2
rocksdb_max_bottom_pri_background_compactions 0
rocksdb_max_compaction_history 64
rocksdb_max_latest_deadlocks 5
+rocksdb_max_lock_memory 1073741824
rocksdb_max_log_file_size 0
rocksdb_max_manifest_file_size 1073741824
rocksdb_max_manual_compactions 10
@@ -1054,6 +1055,8 @@ rocksdb_use_default_sk_cf OFF
rocksdb_use_direct_io_for_flush_and_compaction OFF
rocksdb_use_direct_reads OFF
rocksdb_use_fsync OFF
+rocksdb_use_range_lock_manager_as_point OFF
+rocksdb_use_range_locking OFF
rocksdb_validate_tables 1
rocksdb_verify_row_debug_checksums OFF
rocksdb_wal_bytes_per_sync 0
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
index 46e846afda0..b418bfa9336 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
@@ -72,7 +72,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
include/sync_slave_sql_with_master.inc
[connection slave]
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
read_free
false
select * from t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
index 1e253a9974b..08a0a2f5942 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
@@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF
begin work;
insert into t1 values (9);
insert into t1 values (10);
+# Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
+a
update t1 set a = a + 1 where a = 2;
begin work;
insert into t1 values (11);
diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result
index 1da78db24b1..d4ef2e0ff2e 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
index d6d06f6ece5..0e71e6481aa 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
index 42e46bb0f28..55e6502c079 100644
--- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test
+++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
@@ -1,3 +1,9 @@
+# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE;
+# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks.
+# A part of this test that works with range locking is in
+# range_locking_deadlock_tracking.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
@@ -137,7 +143,6 @@ rollback;
connection default;
--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
show engine rocksdb transaction status;
-
echo Deadlock #6;
connection con1;
create table t1 (id int primary key, value int) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
index f7eb8151f40..05ae30f2ddd 100644
--- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
+++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
@@ -3,6 +3,10 @@
--source include/have_rocksdb.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because range locking
+# does not provide info in rocksdb_deadlock.
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_query_log
call mtr.add_suppression("Column family '[a-z_]+' not found");
--enable_query_log
diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
new file mode 100644
index 00000000000..55203af9cf8
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
@@ -0,0 +1,15 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+
+# Hermitage is an attempt to test transaction isolation levels.
+# https://github.com/ept/hermitage
+
+let $trx_isolation = READ COMMITTED;
+--source hermitage.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source hermitage.inc
diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc
index 90f7d482533..83815a70459 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.inc
+++ b/mysql-test/suite/rocksdb/t/hermitage.inc
@@ -108,6 +108,8 @@ select * from test where value % 3 = 0;
commit;
--source hermitage_init.inc
+let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`;
+let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`;
connection con1;
update test set value = value + 10;
connection con2;
@@ -117,13 +119,13 @@ send delete from test where value = 20;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 2 => 30
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -147,13 +149,13 @@ send update test set value = 12 where id = 1;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 1 => 12
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -200,12 +202,12 @@ update test set value = 12 where id = 1;
update test set value = 18 where id = 2;
commit;
connection con1;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
delete from test where value = 20; # doesn't delete anything
select * from test where id = 2; # shows 2 => 18
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
delete from test where value = 20;
diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test
index e4138e8d89f..51f3f286a0e 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.test
+++ b/mysql-test/suite/rocksdb/t/hermitage.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See hermitage-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
# Hermitage is an attempt to test transaction isolation levels.
# https://github.com/ept/hermitage
diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
index e0479d6a337..82fa9fc6bbd 100644
--- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test
+++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that
+# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test
index 671ea4708d6..3657e977a70 100644
--- a/mysql-test/suite/rocksdb/t/issue111.test
+++ b/mysql-test/suite/rocksdb/t/issue111.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# The testcase here assumes key tracking is present
+# (and range locking uses an InnoDB-like approach, "DMLs use Read Committed")
+--source suite/rocksdb/include/not_range_locking.inc
+
connect (con2,localhost,root,,);
connection default;
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
new file mode 100644
index 00000000000..465fb9099da
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
@@ -0,0 +1,10 @@
+#
+# A range-locking variant of issue243_transactionStatus.test
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $forced_range_locking=1;
+--source issue243_transactionStatus.test
+
+
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
index 1e2f0b41226..5c1948ebe81 100644
--- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+if (!$forced_range_locking) {
+--source suite/rocksdb/include/not_range_locking.inc
+}
+
--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
new file mode 100644
index 00000000000..6c42c7be12c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
@@ -0,0 +1,9 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source transaction_isolation.inc
+
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
index cf29073f69e..b81dcf31ab1 100644
--- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See level_repeatable_read-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
let $trx_isolation = REPEATABLE READ;
--source transaction_isolation.inc
diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test
index 1b624cf38c0..a277c1b8d8d 100644
--- a/mysql-test/suite/rocksdb/t/lock_info.test
+++ b/mysql-test/suite/rocksdb/t/lock_info.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test)
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_warnings
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test
index 035046ae368..95a6676f78a 100644
--- a/mysql-test/suite/rocksdb/t/locking_issues.test
+++ b/mysql-test/suite/rocksdb/t/locking_issues.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# A lot of tests below assume point locking, not range.
+--source suite/rocksdb/include/not_range_locking.inc
+
let $isolation_level = REPEATABLE READ;
--source suite/rocksdb/include/locking_issues_case1_1.inc
diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test
index 4b07f3d8492..d4b2604f1e3 100644
--- a/mysql-test/suite/rocksdb/t/max_row_locks.test
+++ b/mysql-test/suite/rocksdb/t/max_row_locks.test
@@ -1,4 +1,5 @@
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2));
--disable_query_log
diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc
new file mode 100644
index 00000000000..4f1db4399cb
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.inc
@@ -0,0 +1,544 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+
+eval create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+--echo ### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (15,15);
+
+connection con1;
+rollback;
+
+--echo ## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+
+connection con2;
+begin;
+# This must not block
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test that locks are not released when a statement inside
+--echo ## a transaction is rolled back
+eval
+create table t2 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf',
+ unique key(a) comment '$sk_cf'
+) engine=rocksdb;
+
+insert into t2 values (1,1),(2,2);
+
+begin;
+insert into t2 values (3,3);
+--error ER_DUP_ENTRY
+insert into t2 values (10,2);
+
+connection con2;
+begin;
+# This must time out:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t2 where pk=3 for update;
+
+rollback;
+connection con1;
+rollback;
+drop table t2;
+
+# cleanup
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+
+--echo #
+--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode
+--echo #
+
+connect (con1,localhost,root,,);
+connection con1;
+eval create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+begin;
+select * from t1 where pk=10 for update;
+
+#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+delete from t1 where pk between 25 and 40;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+begin;
+--echo # The following will show a range lock on 2-9 and also a point lock on 10.
+--echo # This is how things currently work (after MDEV-21314, this will no longer be the case).
+select * from t1 where pk between 2 and 9 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+
+--echo #
+--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ primary key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+select * from t1 where kp1=2 for update;
+
+connection default;
+--echo # The lock on kp1=2 should inhibit the following INSERT:
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values ( 2,5,9999);
+rollback;
+
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+
+--echo #
+--echo # Test that locks on ranges on non-unique secondary keys inhibit
+--echo # modifications of the contents of these ranges
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where kp1=2 for update;
+select * from t1 where kp1=2 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (2, 9, 9999);
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where kp1=2 and kp2=5;
+
+# Update that "moves a row away" from the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=333 where kp1=2 and kp2=3;
+
+# Update that "moves a row into" the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=2 where kp1=1 and kp2=8;
+
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # Transaction isolation test
+--echo #
+
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+
+--echo # Examine the result:
+--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+--echo # (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same test as above, but check the range scan
+--echo #
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+
+--echo # Examine the result:
+--echo # pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same as above, but test SELECT FOR UPDATE.
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+
+--echo # TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+select * from t1 where pk=2 for update;
+select * from t1 where pk=2;
+
+commit;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+if (!$PK_USES_REVERSE_CF) {
+--echo #
+--echo # Another no-snapshot-checking test, this time for single-statement
+--echo # transaction
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ name varchar(16),
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+
+connect (con1,localhost,root,,);
+connection con1;
+select get_lock('row1', 100);
+
+connection default;
+
+--echo # The following will read the first row (1,1,'row1'), and stop.
+
+send update t1 set a=a+100 where get_lock(name, 1000)=1;
+
+# Wait till the default connection has stopped:
+connection con1;
+
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock"
+ AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1";
+--source include/wait_condition.inc
+
+# Update the second row
+update t1 set a=5 where pk=2;
+
+select release_lock('row1');
+
+connection default;
+reap;
+
+--echo # Look at the row with pk=2:
+--echo # 2, 105, row2 - means the UPDATE was reading current data (correct)
+--echo # 2, 102, row2 - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+
+--echo # Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+}
+
+--echo #
+--echo # Check that I_S.processlist.state is set correctly now.
+--echo #
+eval
+create table t1(
+ pk int,
+ a int,
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+begin;
+select * from t1 where pk=2 for update;
+
+--connect (con1,localhost,root,,)
+begin;
+set rocksdb_lock_wait_timeout=300;
+send select * from t1 where pk=2 for update;
+
+connection default;
+--echo # Now, wait until we see con1 in state="Waiting for row lock"
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock"
+ AND INFO = "select * from t1 where pk=2 for update";
+--source include/wait_condition.inc
+
+rollback;
+connection con1;
+--reap
+rollback;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST
+--echo #
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+eval
+create table t1 (
+ pk1 int,
+ pk2 int,
+ a int,
+ primary key(pk1, pk2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a, B.a, A.a*10+B.a
+from
+ t0 A, t0 B;
+
+
+# Get a lock in another connection so that the primary transaction is not using
+# STO optimization, and its locks can be seen in I_S.rocksdb_locks
+--connect (con1,localhost,root,,)
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+
+connection default;
+begin;
+--echo # Should use ref access w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+--echo #
+
+begin;
+--echo # Should use range access with 2 keyparts and w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+
+connection default;
+drop table t0, t1;
+
+--echo #
+--echo # A bug: range locking was not used when scan started at table start or end
+--echo #
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+
+create table t1 (
+ pk int not null,
+ a int,
+ primary key(pk)
+) engine=rocksdb;
+
+insert into t1 select a*2,a*2 from t10;
+
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+
+connection default;
+drop table t0,t10,t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test
new file mode 100644
index 00000000000..5c599238a0a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.test
@@ -0,0 +1,6 @@
+
+--let pk_cf=default
+--let sk_cf=default
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
new file mode 100644
index 00000000000..2a5966b65c3
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
@@ -0,0 +1,196 @@
+--source suite/rocksdb/include/have_range_locking.inc
+
+#
+# This is deadlock_tracking.test, variant for running with Range Locking:
+# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests
+# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print
+# deadlock information.
+#
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+--echo # Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+let $engine = rocksdb;
+
+--source include/count_sessions.inc
+connect (con1,localhost,root,,);
+let $con1= `SELECT CONNECTION_ID()`;
+
+connect (con2,localhost,root,,);
+let $con2= `SELECT CONNECTION_ID()`;
+
+connect (con3,localhost,root,,);
+let $con3= `SELECT CONNECTION_ID()`;
+
+connection default;
+eval create table t (i int primary key) engine=$engine;
+insert into t values (1), (2), (3);
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #1;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #2;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 10;
+
+echo Deadlock #3;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 1;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+connection con3;
+set rocksdb_deadlock_detect_depth = 2;
+
+--echo # Range locking code will report deadlocks, because it doesn't honor
+--echo # rocksdb_deadlock_detect_depth:
+echo Deadlock #4;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 for update;
+
+connection con1;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+send select * from t where i=3 for update;
+
+connection con3;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con2 and waiting_key != "";
+--source include/wait_condition.inc
+
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 for update;
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+rollback;
+
+connection con2;
+reap;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection default;
+set global rocksdb_max_latest_deadlocks = 5;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+--disable_testcase BUG#0000
+echo Deadlock #5;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 lock in share mode;
+
+connection con1;
+select * from t where i=100 for update;
+select * from t where i=101 for update;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+select * from t where i=3 lock in share mode;
+select * from t where i=200 for update;
+select * from t where i=201 for update;
+
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 lock in share mode;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection con3;
+rollback;
+
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--enable_testcase
+echo Deadlock #6;
+connection con1;
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+
+connection con2;
+begin;
+update t1 set value=value+200 where id=3;
+
+connection con1;
+send update t1 set value=value+100 where id=3;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+--error ER_LOCK_DEADLOCK
+update t1 set value=value+200 where id=1;
+
+# con2 tx is automatically rolled back
+connection con1;
+reap;
+select * from t1;
+drop table t1;
+
+connection default;
+
+disconnect con1;
+disconnect con2;
+disconnect con3;
+
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 0;
+--echo # Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--source include/wait_until_count_sessions.inc
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
new file mode 100644
index 00000000000..d0087e2a77b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
@@ -0,0 +1 @@
+--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
new file mode 100644
index 00000000000..5a6e9fa6616
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
@@ -0,0 +1,39 @@
+#
+# Range Locking - Lock Escalation Tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+show variables like 'rocksdb_max_lock_memory';
+show status like 'rocksdb_locktree_escalation_count';
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+#begin;
+#insert into t1 values (1000111,100011);
+#connect (con1,localhost,root,,);
+#connection con1;
+
+insert into t1
+select
+ A.a + B.a*10 + C.a*100 + D.a*1000,
+ 12345
+from t0 A, t0 B, t0 C, t0 D;
+
+select count(*) from t1;
+
+#connection default;
+#disconnect con1;
+show status like 'rocksdb_locktree_escalation_count';
+
+drop table t0,t1;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
new file mode 100644
index 00000000000..9bbb1b9b392
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
@@ -0,0 +1,70 @@
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--source include/have_debug_sync.inc
+
+select @@rocksdb_use_range_locking;
+
+--disable_warnings
+set debug_sync='RESET';
+--enable_warnings
+#
+# Testcase for iterator snapshot refresh
+#
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+# Ok, now the table has these PK ranges:
+# 0..9 40..49 100...1000
+# and all rows have pk=a
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+send
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+
+# The query is now stuck at the start of the second range.
+
+
+## con2>
+connection con2;
+set debug_sync='now WAIT_FOR con1_stopped';
+
+# Make some changes to check if the iterator is reading current data or
+# snapshot
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+set debug_sync='now SIGNAL con1_cont';
+
+connection con1;
+#--error ER_GET_ERRMSG
+reap;
+select * from t1 where pk<100;
+
+commit;
+disconnect con1;
+disconnect con2;
+connection default;
+set debug_sync='RESET';
+
+drop table t1, ten, one_k;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
new file mode 100644
index 00000000000..8b993764235
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
@@ -0,0 +1,12 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--let pk_cf=rev:cf1
+--let PK_USES_REVERSE_CF=1
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
new file mode 100644
index 00000000000..c1f0fe312e0
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
@@ -0,0 +1,288 @@
+#
+# Range Locking : tests for SeekForUpdate feature
+#
+
+--source include/have_rocksdb.inc
+--source include/have_debug_sync.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+show variables like 'rocksdb_use_range_locking';
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+--echo # Make another connection to get the lock tree out of the STO-mode
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Now, we will just see locks on 10=0xA and 11=0xB:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo #
+--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+--echo #
+--replace_column 10 #
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+select * from t1 where pk>=500 order by pk limit 3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+
+begin;
+select * from t1 where pk=11 for update;
+explain
+select * from t1 order by pk limit 3 for update;
+select * from t1 order by pk limit 3 for update;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+
+
+--echo #
+--echo # Concurrent tests: let one thread do SeekForUpdate and the other
+--echo # interfere by committing modifications
+--echo #
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+select * from t1 where pk<10;
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+
+
+--echo # Test what happens when another transaction commits a row
+--echo # right before the range we are about to lock (nothing)
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send select * from t1 where pk >=5 order by pk limit 3 for update;
+
+connect (con1,localhost,root,,);
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+rollback;
+
+delete from t1 where pk=3;
+
+--echo #
+--echo # Now, repeat the test but let the other transaction insert the row into
+--echo # the range we are locking
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+delete from t1 where pk=8;
+
+--echo #
+--echo # Repeat a third time, this time deleting the row that SeekForUpdate saw
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+rollback;
+
+--echo #
+--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+--echo # error. MyRocks code should now be prepared for data reads causing this
+--echo # error
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+--error ER_LOCK_WAIT_TIMEOUT
+reap;
+
+rollback;
+
+connection con1;
+rollback;
+connection default;
+
+--echo #
+--echo # Backward scan test
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+insert into t1 values
+ (1001, 1001),
+ (1005, 1005),
+ (1007, 1007),
+ (1010, 1010);
+
+begin;
+select * from t1 order by pk desc limit 2 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+
+--echo # The below will lock from pk=1007 (0x3ef) till the end of the table:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+
+connection default;
+rollback;
+
+--echo #
+--echo # Backward scan test 2: error condition
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # A test: full table scan doesn't lock gaps
+--echo #
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 values (10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+
+select * from t1 for update;
+
+connection con2;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (5,5);
+
+connection con1;
+rollback;
+
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
new file mode 100644
index 00000000000..c6e4e457897
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
@@ -0,0 +1,202 @@
+#
+# Test for shared lock support for range locking
+#
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+select @@rocksdb_use_range_locking;
+
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+
+insert into t1 select a,a from t0;
+
+--echo # A basic test for shared locks
+
+begin;
+select * from t1 where pk=3 for update;
+select * from t1 where pk=5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+--echo # Now, TRX2_ID should be gone:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+
+--echo # Get a read lock on pk=3 (where we have a write lock).
+--echo # The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo # Get a write lock on pk=5 (where we have a read lock).
+--echo # The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+rollback;
+
+--echo #
+--echo # Test if a read lock inhibits write locks
+--echo #
+
+begin;
+select * from t1 where pk=2 lock in share mode;
+select * from t1 where pk=8 for update;
+
+connection con1;
+begin;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 0 and 4 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where pk=2;
+
+--echo # Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+
+--echo # But this should still prevent us from acquiring a write lock on that value:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+rollback;
+connection default;
+rollback;
+
+drop table t1;
+create table t1 (
+ pk int not null primary key,
+ a int not null,
+ key(a)
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+ t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+select * from t1 where a between 2 and 5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+disconnect con1;
+
+drop table t0,t1;
+
+--echo #
+--echo # Test shared point locks and lock escalation
+--echo #
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+
+show status like 'rocksdb_locktree_current_lock_memory';
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+--echo # CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+select * from t1 where pk=2500 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--echo # DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+--echo # DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+
+--echo # DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+
+connection con1;
+--echo # CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+
+let $order_by_rowkey=1;
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+set @save_mlm= @@rocksdb_max_lock_memory;
+
+--echo # Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+ variable_value
+ from
+ performance_schema.global_status
+ where
+ variable_name='rocksdb_locktree_current_lock_memory');
+
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+connection default;
+rollback;
+
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+
+drop table t0, t1;
+
+
diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test
index c063d8c7ccb..0544214b8c9 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb.test
@@ -2,6 +2,9 @@
--source suite/rocksdb/include/have_write_committed.inc
--source include/count_sessions.inc
+# Does SHOW WARNINGS and SHOW STATUS, whose output changes in Range Locking mode
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# RocksDB Storage Engine tests
#
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
index 47818bfdbe1..3aa51b7be80 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
@@ -27,6 +27,10 @@
# In all cases, RR gets snapshot conflict errors if non-first rows get
# deleted by another transaction after scanning.
+# The tests do not work with range locking, as it locks the rows it is
+# about to read first.
+--source suite/rocksdb/include/not_range_locking.inc
+
--source include/have_rocksdb.inc
--source include/have_debug_sync.inc
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
index ff092773737..8b3975723df 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
@@ -5,6 +5,9 @@
#
--source include/have_debug.inc
+# Range locking requests locks before doing snapshot checking.
+--source suite/rocksdb/include/not_range_locking.inc
+
--enable_connect_log
create table t1 (pk int not null primary key) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
index 92981e19a43..cc5c1a90436 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
@@ -62,7 +62,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
--source include/sync_slave_sql_with_master.inc
--source include/rpl_connection_slave.inc
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
select * from t1;
--echo
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
index 694594efd70..1273a2b6f70 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
@@ -46,6 +46,8 @@ begin work;
insert into t1 values (9);
insert into t1 values (10);
+--echo # Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
update t1 set a = a + 1 where a = 2;
connection con1;
diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
index 5a78979f048..63b72ce5c5a 100644
--- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
+++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
@@ -3,6 +3,8 @@
--source include/have_debug.inc
--source include/have_debug_sync.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
connection master;
--disable_warnings
drop table if exists t1;
diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
index 23ce6d45234..cf9d53ff88a 100644
--- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
+++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range locking only supports exclusive locks currently.
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# SELECT .. LOCK IN SHARE MODE
#
diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test
index 47ca74d0e5e..9814d89448d 100644
--- a/mysql-test/suite/rocksdb/t/unique_check.test
+++ b/mysql-test/suite/rocksdb/t/unique_check.test
@@ -2,6 +2,11 @@
--source include/have_debug_sync.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because lock tree waits do not set
+# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for
+# details.
+--source suite/rocksdb/include/not_range_locking.inc
+
# For GitHub issue#167 -- Unique key check doesn't work
connect (con1, localhost, root,,);
diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc
index ce0bb1e39a9..508816e6ace 100644
--- a/mysql-test/suite/rocksdb/t/unique_sec.inc
+++ b/mysql-test/suite/rocksdb/t/unique_sec.inc
@@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38;
UPDATE t1 SET id5=34 WHERE id1=38;
--echo # NULL values are unique
+--echo # (Note: the following UPDATE reads through the whole table without
+--echo # finding anything to update. With point locking, this is fine,
+--echo # but with range locking it will time out while waiting on a row lock
+--echo # that the other transaction is holding)
+if (`select @@rocksdb_use_range_locking=0`) {
UPDATE t1 SET id5=NULL WHERE value1 > 37;
-
+}
+if (`select @@rocksdb_use_range_locking=1`) {
+-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37;
+}
connection con1;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test b/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test
index d6a8e3d5a1b..8d2e64e5890 100644
--- a/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test
+++ b/mysql-test/suite/rocksdb/t/unique_sec_rev_cf.test
@@ -3,3 +3,4 @@
let ddl= $MYSQL_TMP_DIR/unique_sec_rev_cf.sql;
--exec sed s/##CF##/" COMMENT 'rev:cf'"/g suite/rocksdb/t/unique_sec.inc > $ddl
--source $ddl
+--remove_file $ddl
diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test
index fbebfeac85a..0d8a35a1321 100644
--- a/mysql-test/suite/rocksdb/t/varbinary_format.test
+++ b/mysql-test/suite/rocksdb/t/varbinary_format.test
@@ -1,6 +1,10 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+# The test uses SELECT .. FOR UPDATE and examines which locks it acquires.
+# Range Locking will use different locks than point locking.
+--source suite/rocksdb/include/not_range_locking.inc
+
# Create a table with a varbinary key with the current format and validate
# that it sorts correctly
CREATE TABLE t1(
diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test
index 3ea1a1a60b3..985b2c0c8e7 100644
--- a/mysql-test/suite/rocksdb/t/varchar_format.test
+++ b/mysql-test/suite/rocksdb/t/varchar_format.test
@@ -1,6 +1,8 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
####################
# Create a table with a varchar key with the current format and validate
# that it sorts correctly
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt
index 135a6af62df..5c25f89b6e9 100644
--- a/storage/rocksdb/CMakeLists.txt
+++ b/storage/rocksdb/CMakeLists.txt
@@ -126,6 +126,7 @@ SET(ROCKSDB_SOURCES
logger.h
rdb_datadic.cc rdb_datadic.h
rdb_iterator.cc rdb_iterator.h
+ rdb_locking_iter.cc rdb_locking_iter.h
rdb_cf_options.cc rdb_cf_options.h
rdb_cf_manager.cc rdb_cf_manager.h
rdb_converter.cc rdb_converter.h
diff --git a/storage/rocksdb/get_rocksdb_files.sh b/storage/rocksdb/get_rocksdb_files.sh
index e0fa66b54fa..07fee252a1e 100755
--- a/storage/rocksdb/get_rocksdb_files.sh
+++ b/storage/rocksdb/get_rocksdb_files.sh
@@ -4,7 +4,7 @@ MKFILE=`mktemp`
# include rocksdb make file relative to the path of this script
echo "include rocksdb/src.mk
all:
- @echo \$(LIB_SOURCES)" > $MKFILE
+ @echo \$(LIB_SOURCES) \$(RANGE_TREE_SOURCES)" > $MKFILE
for f in `make --makefile $MKFILE`
do
echo ../../rocksdb/$f
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index d97a7c237f3..b30f694f3d3 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -104,6 +104,9 @@
#include "./ObjectFactory.h"
#endif
+#include "./rdb_locking_iter.h"
+
+
// Internal MySQL APIs not exposed in any header.
extern "C" {
/**
@@ -634,6 +637,10 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var,
static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var,
void *var_ptr, const void *save);
+static void rocksdb_set_max_lock_memory(THD *thd,
+ struct SYS_VAR *var,
+ void *var_ptr, const void *save);
+
static void rdb_set_collation_exception_list(const char *exception_list);
static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var,
void *var_ptr,
@@ -786,6 +793,16 @@ static unsigned long long // NOLINT(runtime/int)
static bool rocksdb_skip_locks_if_skip_unique_check = false;
static bool rocksdb_alter_column_default_inplace = false;
+// Range Locking: how much memory can be used for the lock data structure
+// (which holds the locks acquired by all clients).
+static ulonglong rocksdb_max_lock_memory;
+
+static bool rocksdb_use_range_locking = 0;
+static bool rocksdb_use_range_lock_manager_as_point = 0;
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr_used_as_point;
+
std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
@@ -1484,6 +1501,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
nullptr, rocksdb_set_max_latest_deadlocks,
rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
+static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, rocksdb_max_lock_memory,
+ PLUGIN_VAR_RQCMDARG,
+ "Range-locking mode: Maximum amount of memory "
+ "that locks from all transactions can use at a time",
+ nullptr, rocksdb_set_max_lock_memory,
+ /*initial*/1073741824, 0, UINT64_MAX, 0);
+
static MYSQL_SYSVAR_ENUM(
info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
"Filter level for info logs to be written mysqld error log. "
@@ -2333,6 +2357,19 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan,
rocksdb_update_table_stats_use_table_scan,
rocksdb_table_stats_use_table_scan);
+static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Locking",
+ nullptr, nullptr,
+ rocksdb_use_range_locking);
+
+static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point,
+ rocksdb_use_range_lock_manager_as_point,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Lock Manager as point",
+ nullptr, nullptr,
+ rocksdb_use_range_lock_manager_as_point);
+
static MYSQL_SYSVAR_BOOL(
large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
"Support large index prefix length of 3072 bytes. If off, the maximum "
@@ -2648,7 +2685,9 @@ static struct SYS_VAR *rocksdb_system_variables[] = {
MYSQL_SYSVAR(manual_compaction_threads),
MYSQL_SYSVAR(manual_compaction_bottommost_level),
MYSQL_SYSVAR(rollback_on_timeout),
-
+ MYSQL_SYSVAR(use_range_locking),
+ MYSQL_SYSVAR(use_range_lock_manager_as_point),
+ MYSQL_SYSVAR(max_lock_memory),
MYSQL_SYSVAR(enable_insert_with_update_caching),
MYSQL_SYSVAR(trace_block_cache_access),
MYSQL_SYSVAR(trace_queries),
@@ -2907,8 +2946,41 @@ class Rdb_transaction {
virtual rocksdb::Status do_pop_savepoint() = 0;
virtual void do_rollback_to_savepoint() = 0;
+ private:
+ /*
+ If true, the current statement should not use a snapshot for reading.
+ Note that in a multi-statement transaction, the snapshot may have been
+ allocated by another statement.
+ */
+ bool m_stmt_ignores_snapshot = false;
+
+  /* Snapshot-ignore mode saves m_read_opts.snapshot here: */
+ const rocksdb::Snapshot *m_saved_snapshot= nullptr;
+
public:
+
+ void start_ignore_snapshot() {
+ // note: this may be called several times for the same statement
+ if (!m_stmt_ignores_snapshot) {
+ m_saved_snapshot = m_read_opts.snapshot;
+ m_read_opts.snapshot = nullptr;
+ m_stmt_ignores_snapshot= true;
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+ }
+
+ void end_ignore_snapshot_if_needed() {
+ if (m_stmt_ignores_snapshot) {
+ m_stmt_ignores_snapshot = false;
+ m_read_opts.snapshot = m_saved_snapshot;
+ m_saved_snapshot = nullptr;
+ }
+ }
+ bool in_snapshot_ignore_mode() const { return m_stmt_ignores_snapshot; }
+
rocksdb::ReadOptions m_read_opts;
+
const char *m_mysql_log_file_name;
my_off_t m_mysql_log_offset;
const char *m_mysql_gtid;
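To make the snapshot-ignore machinery above easier to follow, the intended call sequence, assembled from the later hunks in this patch (a sketch, not additional code), is roughly:

    // a DML statement starts under range locking
    // (Rdb_transaction_impl::start_stmt / start_autocommit_stmt):
    tx->start_ignore_snapshot();          // stash m_read_opts.snapshot, read latest data

    // the statement ends (rocksdb_prepare / rocksdb_commit / rocksdb_rollback):
    tx->end_ignore_snapshot_if_needed();  // restore the stashed snapshot, if any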
@@ -3089,6 +3161,19 @@ class Rdb_transaction {
virtual void release_lock(const Rdb_key_def &key_descr,
const std::string &rowkey, bool force = false) = 0;
+ virtual
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start,
+ const rocksdb::Endpoint &end) = 0;
+
+ rocksdb::Status lock_singlepoint_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Slice &point) {
+ // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs.
+ // But here we are locking just one point so this is not necessary.
+ rocksdb::Endpoint endp(point, false);
+ return lock_range(cf, endp, endp);
+ }
+
virtual bool prepare() = 0;
bool commit_or_rollback() {
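Both new locking entry points are exercised by the ha_rocksdb.cc hunks below; roughly (sketch only, all names come from this patch):

    // range scan: lock the [start_endp, end_endp] range computed by set_range_lock()
    rocksdb::Status s = tx->lock_range(kd.get_cf(), start_endp, end_endp);

    // single index entry, e.g. a secondary-key entry being written or deleted
    rocksdb::Status s2 = tx->lock_singlepoint_range(kd.get_cf(), key_slice);
    if (!s2.ok())
      return tx->set_status_error(table->in_use, s2, kd, m_tbl_def, m_table_handler);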
@@ -3147,10 +3232,17 @@ class Rdb_transaction {
m_is_delayed_snapshot = false;
}
+ void locking_iter_created() {
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+
virtual void acquire_snapshot(bool acquire_now) = 0;
virtual void release_snapshot() = 0;
- bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
+ bool has_snapshot() const {
+ return m_read_opts.snapshot != nullptr || m_saved_snapshot;
+ }
private:
// The Rdb_sst_info structures we are currently loading. In a partitioned
@@ -3704,7 +3796,9 @@ class Rdb_transaction {
virtual rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *column_family) = 0;
+ rocksdb::ColumnFamilyHandle *column_family,
+ bool is_rev_cf,
+ bool use_locking_iterator=false) = 0;
virtual void multi_get(rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -3713,10 +3807,12 @@ class Rdb_transaction {
const bool sorted_input) const = 0;
rocksdb::Iterator *get_iterator(
- rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
+ rocksdb::ColumnFamilyHandle *const column_family, bool is_rev_cf,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
- bool create_snapshot = true) {
+ bool create_snapshot = true,
+ bool use_locking_iterator=false) {
// Make sure we are not doing both read_current (which implies we don't
// want a snapshot) and create_snapshot which makes sure we create
// a snapshot
@@ -3746,12 +3842,14 @@ class Rdb_transaction {
if (read_current) {
options.snapshot = nullptr;
}
- return get_iterator(options, column_family);
+ return get_iterator(options, column_family, is_rev_cf,
+ use_locking_iterator);
}
virtual bool is_tx_started() const = 0;
virtual void start_tx() = 0;
- virtual void start_stmt() = 0;
+ virtual void start_stmt(bool is_dml_statement) = 0;
+ virtual void start_autocommit_stmt(bool /*is_dml_statement*/){}
virtual void set_name() = 0;
protected:
@@ -3949,6 +4047,13 @@ class Rdb_transaction_impl : public Rdb_transaction {
virtual bool is_writebatch_trx() const override { return false; }
+ // Lock the range between two specified endpoints
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start_endp,
+ const rocksdb::Endpoint &end_endp) override {
+ ++m_row_lock_count;
+ return m_rocksdb_tx->GetRangeLock(cf, start_endp, end_endp);
+ }
private:
void release_tx(void) {
// We are done with the current active transaction object. Preserve it
@@ -4055,7 +4160,7 @@ class Rdb_transaction_impl : public Rdb_transaction {
}
void acquire_snapshot(bool acquire_now) override {
- if (m_read_opts.snapshot == nullptr) {
+ if (m_read_opts.snapshot == nullptr && !in_snapshot_ignore_mode()) {
const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
m_thd->get_explicit_snapshot());
if (thd_ss) {
@@ -4211,9 +4316,17 @@ class Rdb_transaction_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const column_family) override {
+ rocksdb::ColumnFamilyHandle *const column_family,
+ bool is_rev_cf,
+ bool use_locking_iterator) override {
global_stats.queries[QUERIES_RANGE].inc();
- return m_rocksdb_tx->GetIterator(options, column_family);
+ if (use_locking_iterator) {
+ locking_iter_created();
+ return GetLockingIterator(m_rocksdb_tx, options, column_family,
+ is_rev_cf, &m_row_lock_count);
+ }
+ else
+ return m_rocksdb_tx->GetIterator(options, column_family);
}
const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
@@ -4287,17 +4400,35 @@ class Rdb_transaction_impl : public Rdb_transaction {
/*
Start a statement inside a multi-statement transaction.
- @todo: are we sure this is called once (and not several times) per
- statement start?
+ @note: If a statement uses N tables, this function will be called N times,
+    once for each TABLE object that is used.
For hooking to start of statement that is its own transaction, see
ha_rocksdb::external_lock().
*/
- void start_stmt() override {
+ void start_stmt(bool is_dml_statement) override {
+
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ /*
+ In Range Locking mode, RocksDB does not do "key tracking".
+ Use InnoDB-like concurrency mode: make the DML statements always read
+ the latest data (instead of using transaction's snapshot).
+ This "downgrades" the transaction isolation to READ-COMMITTED on the
+ master, but in return the actions can be replayed on the slave.
+ */
+ start_ignore_snapshot();
+ }
+
// Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
acquire_snapshot(false);
}
+ void start_autocommit_stmt(bool is_dml_statement) override {
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ start_ignore_snapshot();
+ }
+ }
+
/*
This must be called when last statement is rolled back, but the transaction
continues
@@ -4426,6 +4557,12 @@ class Rdb_writebatch_impl : public Rdb_transaction {
// Nothing to do here since we don't hold any row locks.
}
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const,
+ const rocksdb::Endpoint&,
+ const rocksdb::Endpoint&) override {
+ return rocksdb::Status::OK();
+ }
+
void rollback() override {
on_rollback();
m_write_count = 0;
@@ -4525,7 +4662,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const /* column_family */) override {
+ rocksdb::ColumnFamilyHandle *const /* column_family */,
+ bool /*is_rev_cf*/,
+ bool /*use_locking_iterator*/) override {
const auto it = rdb->NewIterator(options);
return m_batch->NewIteratorWithBase(it);
}
@@ -4543,9 +4682,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
set_initial_savepoint();
}
+ void start_stmt(bool /*is_dml_statement*/) override {}
void set_name() override {}
- void start_stmt() override {}
void rollback_stmt() override {
if (m_batch) rollback_to_stmt_savepoint();
@@ -4869,6 +5008,7 @@ static int rocksdb_prepare(handlerton *const hton MY_ATTRIBUTE((__unused__)),
DEBUG_SYNC(thd, "rocksdb.prepared");
} else {
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
return HA_EXIT_SUCCESS;
@@ -5096,6 +5236,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- For a COMMIT statement that finishes a multi-statement transaction
- For a statement that has its own transaction
*/
+ tx->end_ignore_snapshot_if_needed();
if (tx->commit()) {
DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
}
@@ -5105,6 +5246,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
*/
tx->set_tx_failed(false);
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
@@ -5142,6 +5284,7 @@ static int rocksdb_rollback(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- a statement inside a transaction is rolled back
*/
+ tx->end_ignore_snapshot_if_needed();
tx->rollback_stmt();
tx->set_tx_failed(true);
}
@@ -5246,8 +5389,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
"=========================================\n";
}
+ template<class PathStruct>
static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
- const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
+ const PathStruct &txn, const GL_INDEX_ID &gl_index_id) {
Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
txn_data.trx_id = txn.m_txn_id;
@@ -5272,23 +5416,49 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
? cfh->GetName()
: "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id);
- txn_data.waiting_key =
- rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
+ txn_data.waiting_key = format_wait_key(txn);
txn_data.exclusive_lock = txn.m_exclusive;
return txn_data;
}
+ // Get the key to use to find the index number (and then, index name)
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::DeadlockInfo& info) {
+ return info.m_waiting_key;
+ }
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::RangeDeadlockInfo& info) {
+ // Range locks do not span across indexes, so take the left bound
+ return info.m_start.slice;
+ }
+
+ // Print the locked key (or range) in hex
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static std::string format_wait_key(const rocksdb::DeadlockInfo& info) {
+ return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length());
+ }
+ static std::string format_wait_key(const rocksdb::RangeDeadlockInfo& info) {
+ return rdb_hexdump_range(info.m_start, info.m_end);
+ }
+
+ // Get deadlock path info. A templated function so one can use it with both
+ // point and range locking.
+ template<class PathStruct>
static Rdb_deadlock_info get_dl_path_trx_info(
- const rocksdb::DeadlockPath &path_entry) {
+ const PathStruct &path_entry) {
Rdb_deadlock_info deadlock_info;
for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
const auto &txn = *it;
+ auto waiting_key = get_key_for_indexnr(txn);
const GL_INDEX_ID gl_index_id = {
txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
- txn.m_waiting_key.c_str()))};
+ waiting_key.c_str()))};
deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
}
DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
@@ -5313,7 +5483,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
/* Calculate the duration the snapshot has existed */
int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
- if (snapshot_timestamp != 0) {
+ if (snapshot_timestamp != 0 && tx->has_snapshot()) {
int64_t curr_time;
rdb->GetEnv()->GetCurrentTime(&curr_time);
@@ -5332,8 +5502,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
- void populate_deadlock_buffer() {
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ template<class PathStruct>
+ void populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) {
m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
for (const auto &path_entry : dlock_buffer) {
@@ -5373,12 +5543,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
+ void populate_deadlock_buffer() {
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ }
+ }
+
std::vector<Rdb_deadlock_info> get_deadlock_info() {
std::vector<Rdb_deadlock_info> deadlock_info;
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
- for (const auto &path_entry : dlock_buffer) {
- if (!path_entry.limit_exceeded) {
- deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
+ }
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
}
}
return deadlock_info;
@@ -5794,9 +5984,13 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */,
return ret_val;
}
+/*
+  @param is_dml_statement If true, we are in a DML statement
+*/
static inline void rocksdb_register_tx(
handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd,
- Rdb_transaction *const tx) {
+ Rdb_transaction *const tx,
+ bool is_dml_stmt) {
DBUG_ASSERT(tx != nullptr);
trans_register_ha(thd, false, rocksdb_hton, NULL);
@@ -5811,8 +6005,10 @@ static inline void rocksdb_register_tx(
}
}
if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
- tx->start_stmt();
+ tx->start_stmt(is_dml_stmt);
trans_register_ha(thd, true, rocksdb_hton, NULL);
+ } else {
+ tx->start_autocommit_stmt(is_dml_stmt);
}
}
@@ -5897,7 +6093,7 @@ static int rocksdb_start_tx_and_assign_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
return HA_EXIT_SUCCESS;
@@ -5954,7 +6150,7 @@ static int rocksdb_start_tx_with_shared_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
// case: an explicit snapshot was not assigned to this transaction
@@ -6633,6 +6829,25 @@ static int rocksdb_init_internal(void *const p) {
tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
tx_db_options.write_policy =
static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
+
+ if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) {
+ //rdb_log_status_error(
+ // status, "Can't have both range_locking and range_lock_manager_as_point");
+ //DBUG_RETURN(HA_EXIT_FAILURE);
+ rocksdb_use_range_lock_manager_as_point= 0;
+ }
+
+
+ if (rocksdb_use_range_locking) {
+ range_lock_mgr.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr;
+ }
+ if (rocksdb_use_range_lock_manager_as_point) {
+ range_lock_mgr_used_as_point.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr_used_as_point;
+ }
status =
check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
@@ -6667,6 +6882,15 @@ static int rocksdb_init_internal(void *const p) {
DBUG_RETURN(HA_EXIT_FAILURE);
}
+ if (range_lock_mgr)
+ {
+ range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory);
+ sql_print_information("RocksDB: USING NEW RANGE LOCKING");
+ sql_print_information("RocksDB: Max lock memory=%llu", rocksdb_max_lock_memory);
+ }
+ else
+ sql_print_information("RocksDB: USING POINT LOCKING");
+
cf_manager.init(std::move(cf_options_map), &cf_handles);
// NO_LINT_DEBUG
@@ -9208,6 +9432,15 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
}
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
+
+ bool use_locking_iter= false;
+
+ if ((rc = set_range_lock(tx, kd, find_flag, slice, end_range,
+ &use_locking_iter)))
+ DBUG_RETURN(rc);
+ if (use_locking_iter)
+ m_iterator->set_use_locking();
+
const bool is_new_snapshot = !tx->has_snapshot();
// Loop as long as we get a deadlock error AND we end up creating the
@@ -9256,6 +9489,234 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
DBUG_RETURN(rc);
}
+
+/*
+ @brief
+ Compute the range lock endpoints and set the range lock, if necessary
+
+  @param use_locking_iterator OUT If true, locks are not set and LockingIterator
+                                  should be used instead
+
+ @detail
+  If the scanned range has no endpoint on the side we're scanning towards,
+  don't set a range lock: it would be too coarse. Indicate that the
+  LockingIterator should be used instead.
+
+ @return
+ 0 Ok
+ Other Error acquiring the lock (wait timeout, deadlock, etc)
+*/
+
+int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice_arg,
+ const key_range *const end_key,
+ bool *use_locking_iterator)
+{
+ rocksdb::Slice end_slice;
+ uchar end_slice_buf[MAX_KEY_LENGTH];
+ bool start_has_inf_suffix = false, end_has_inf_suffix = false;
+ rocksdb::Slice slice(slice_arg);
+ *use_locking_iterator= false;
+
+ if (m_lock_rows == RDB_LOCK_NONE || !rocksdb_use_range_locking) {
+ return 0;
+ }
+ bool big_range= false;
+
+ /*
+ The 'slice' has the left endpoint of the range to lock.
+ Figure out the right endpoint.
+ */
+
+ if (find_flag == HA_READ_KEY_EXACT) {
+ if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) {
+ // This is a full table/index scan
+ start_has_inf_suffix= false;
+ big_range = true;
+ } else {
+ /*
+ This is "key_part= const" interval. We need to lock this range:
+ (lookup_value, -inf) < key < (lookup_value, +inf)
+ */
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ end_slice= slice;
+ }
+ }
+ else if (find_flag == HA_READ_PREFIX_LAST) {
+ if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) {
+ /* Reverse-ordered full index scan */
+ start_has_inf_suffix= true;
+ big_range = true;
+ } else {
+ /*
+ We get here for queries like:
+
+ select * from t1 where pk1=const order by pk1 desc for update
+
+ assuming this uses an index on (pk1, ...)
+ We get end_key=nullptr.
+
+ The range to lock is the same as with HA_READ_KEY_EXACT above.
+ */
+ end_slice= slice;
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ }
+ }
+ else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) {
+ /*
+ We get here for queries like:
+
+ select * from t1 where pk1=const1 and pk2 between const2 and const3
+ order by pk1 desc
+ for update
+
+ assuming this uses an index on (pk1, pk2).
+ The slice has the right endpoint: {const1, const3}
+ the end_key has the left endpoint: {const1, const2}.
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ // Pack the left endpoint and make "slice" point to it
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ }
+ else if (find_flag == HA_READ_BEFORE_KEY) {
+ /*
+ We get here for queries like
+ select * from t1
+ where pk <1007 order by pk desc limit 2 for update
+ select * from t1
+ where pk >=800 and pk <1007 order by pk desc limit 2 for update
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ if (end_key) {
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+
+ end_has_inf_suffix= false;
+ big_range= false;
+ } else {
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+
+ big_range= true;
+ }
+ }
+ else if (end_key) {
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else
+ DBUG_ASSERT(0);
+
+ // Known end range bounds: HA_READ_AFTER_KEY, HA_READ_BEFORE_KEY
+ if (end_key->flag == HA_READ_AFTER_KEY) {
+ // this is "key_part <= const".
+ end_has_inf_suffix= true;
+ } else if (end_key->flag == HA_READ_BEFORE_KEY) {
+ // this is "key_part < const", non-inclusive.
+ end_has_inf_suffix= false;
+ } else
+ DBUG_ASSERT(0);
+
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size= kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key,
+ end_key->keypart_map);
+
+ end_slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ }
+ else
+ {
+ big_range= true;
+#if 0
+ // The below is code to handle this without LockingIterator:
+ // No end key
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else
+ DBUG_ASSERT(0);
+
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+ end_has_inf_suffix= true;
+#endif
+ }
+
+ if (big_range)
+ {
+ *use_locking_iterator= true;
+ return 0;
+ }
+
+ rocksdb::Endpoint start_endp;
+ rocksdb::Endpoint end_endp;
+
+ if (kd.m_is_reverse_cf) {
+ // Flip the endpoints
+ start_endp =rocksdb::Endpoint(end_slice, !end_has_inf_suffix);
+ end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix);
+ } else {
+ start_endp= rocksdb::Endpoint(slice, start_has_inf_suffix);
+ end_endp= rocksdb::Endpoint(end_slice, end_has_inf_suffix);
+ }
+
+ /*
+ RocksDB's iterator is reading the snapshot of the data that was taken at
+ the time the iterator was created.
+
+ After we've got a lock on the range, we'll need to refresh the iterator
+ to read the latest contents. (If we use the iterator created before the
+ lock_range() call, we may miss the changes that were made/committed after
+ the iterator was created but before the lock_range() call was made).
+
+    RocksDB has an Iterator::Refresh() method, but alas, it is not implemented
+    for the iterator returned by the Transaction object (the Transaction returns
+    a BaseDeltaIterator, which allows one to see the transaction's own changes).
+
+    Our solution is to release the iterator and create a new one.
+    We release it here; it will be re-created as soon as there's a need to
+    read records.
+ */
+ //release_scan_iterator();
+ m_iterator->reset();
+
+ auto s= tx->lock_range(kd.get_cf(), start_endp, end_endp);
+ if (!s.ok()) {
+ return (tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ return 0;
+}
+
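To illustrate the endpoint computation above with the most common case: for a "key_part = const" lookup (the HA_READ_KEY_EXACT branch) on a forward-ordered CF, the function ends up acquiring the lock below (a worked example, not extra code in the patch):

    // slice = {index_number, const}; lock the range (const,-inf) .. (const,+inf)
    rocksdb::Endpoint start_endp(slice, false /* no +inf suffix */);
    rocksdb::Endpoint end_endp(slice, true /* +inf suffix */);
    tx->lock_range(kd.get_cf(), start_endp, end_endp);

For a reverse-ordered CF (kd.m_is_reverse_cf) the two endpoints are swapped and their suffix flags negated, as in the code above.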
/*
See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
index navigation commands are converted into RocksDB lookup commands.
@@ -9770,7 +10231,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
}
}
- if (rc) {
+ if (rc != HA_EXIT_SUCCESS) {
break;
}
@@ -9781,7 +10242,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
table->m_status = 0;
rc = 0;
} else if (active_index == table->s->primary_key) {
- if (m_lock_rows != RDB_LOCK_NONE) {
+ if (m_lock_rows != RDB_LOCK_NONE && !rocksdb_use_range_locking) {
DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
/* We need to put a lock and re-read */
bool skip_row = false;
@@ -10850,6 +11311,15 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
old_key_slice = rocksdb::Slice(
reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
+ /* Range locking: lock the index tuple being deleted */
+ if (rocksdb_use_range_locking) {
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+ }
+
row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
old_key_slice);
@@ -10902,6 +11372,14 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
if (bulk_load_sk && row_info.old_data == nullptr) {
rc = bulk_load_key(row_info.tx, kd, new_key_slice, new_value_slice, true);
} else {
+ /* Range locking: lock the index tuple being inserted */
+ if (rocksdb_use_range_locking) {
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+ }
row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
new_value_slice);
}
@@ -11327,6 +11805,19 @@ int ha_rocksdb::delete_row(const uchar *const buf) {
nullptr, false, hidden_pk_id);
rocksdb::Slice secondary_key_slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
+
+ /*
+    With point locking, deleting a secondary key entry doesn't need any locks.
+    With range locking, a lock must be acquired.
+ */
+ if (rocksdb_use_range_locking) {
+ auto s= tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice);
+ if (!s.ok()) {
+ DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ }
+
tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
secondary_key_slice);
bytes_written += secondary_key_slice.size();
@@ -11900,7 +12391,7 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
}
}
tx->m_n_mysql_tables_in_use++;
- rocksdb_register_tx(rocksdb_hton, thd, tx);
+ rocksdb_register_tx(rocksdb_hton, thd, tx, (lock_type == F_WRLCK));
tx->io_perf_start(&m_io_perf);
}
@@ -11928,7 +12419,7 @@ int ha_rocksdb::start_stmt(THD *const thd,
Rdb_transaction *const tx = get_or_create_tx(thd);
read_thd_vars(thd);
- rocksdb_register_tx(ht, thd, tx);
+ rocksdb_register_tx(ht, thd, tx, (lock_type == F_WRLCK));
tx->io_perf_start(&m_io_perf);
DBUG_RETURN(HA_EXIT_SUCCESS);
@@ -14324,6 +14815,36 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)),
return 0;
}
+//
+// Lock Tree Status variables
+//
+static longlong rocksdb_locktree_escalation_count=1234;
+static longlong rocksdb_locktree_current_lock_memory=0;
+
+static SHOW_VAR rocksdb_locktree_status_variables[] = {
+ DEF_STATUS_VAR_FUNC("escalation_count",
+ &rocksdb_locktree_escalation_count, SHOW_LONGLONG),
+ DEF_STATUS_VAR_FUNC("current_lock_memory",
+ &rocksdb_locktree_current_lock_memory, SHOW_LONGLONG),
+ // end of the array marker
+ {NullS, NullS, SHOW_LONG}};
+
+static SHOW_VAR rocksdb_empty_status_variables[] = {
+ {NullS, NullS, SHOW_LONG}};
+
+static void show_rocksdb_locktree_vars(THD*, SHOW_VAR *var, char*) {
+ var->type = SHOW_ARRAY;
+ if (range_lock_mgr)
+ {
+ auto status = range_lock_mgr->GetStatus();
+ rocksdb_locktree_escalation_count = status.escalation_count;
+ rocksdb_locktree_current_lock_memory = status.current_lock_memory;
+ var->value = reinterpret_cast<char *>(&rocksdb_locktree_status_variables);
+ }
+ else
+ var->value = reinterpret_cast<char *>(&rocksdb_empty_status_variables);
+}
+
static SHOW_VAR rocksdb_status_vars[] = {
DEF_STATUS_VAR(block_cache_miss),
DEF_STATUS_VAR(block_cache_hit),
@@ -14449,6 +14970,8 @@ static SHOW_VAR rocksdb_status_vars[] = {
SHOW_SCOPE_GLOBAL},
{"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
SHOW_FUNC, SHOW_SCOPE_GLOBAL},
+ {"rocksdb_locktree", reinterpret_cast<char *>(show_rocksdb_locktree_vars),
+ SHOW_FUNC, SHOW_SCOPE_GLOBAL},
{NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
/*
@@ -15326,6 +15849,23 @@ void rocksdb_set_delayed_write_rate(THD *thd MY_ATTRIBUTE((unused)),
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
+void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR*,
+ void* /*var_ptr*/, const void *save) {
+ const uint64_t new_val = *static_cast<const uint64_t *>(save);
+ if (rocksdb_max_lock_memory != new_val) {
+ if (range_lock_mgr->SetMaxLockMemory(new_val)) {
+ /* NO_LINT_DEBUG */
+ sql_print_warning("MyRocks: failed to set max_lock_memory");
+ push_warning_printf(thd, Sql_condition::SL_WARNING,
+ ER_ERROR_WHEN_EXECUTING_COMMAND,
+ "Cannot set max_lock_memory to size below currently used");
+ } else {
+ // Succeeded
+ rocksdb_max_lock_memory = new_val;
+ }
+ }
+}
+
void rocksdb_set_max_latest_deadlocks(
THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)),
void *var_ptr MY_ATTRIBUTE((unused)), const void *save) {
@@ -15333,7 +15873,13 @@ void rocksdb_set_max_latest_deadlocks(
const uint32_t new_val = *static_cast<const uint32_t *>(save);
if (rocksdb_max_latest_deadlocks != new_val) {
rocksdb_max_latest_deadlocks = new_val;
- rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+ if (range_lock_mgr) {
+ auto n= rocksdb_max_latest_deadlocks;
+ range_lock_mgr->SetRangeDeadlockInfoBufferSize(n);
+ }
+ else
+ rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+
}
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
@@ -16046,11 +16592,22 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) {
}
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
+ bool is_rev_cf,
+ bool skip_bloom_filter, const rocksdb::Slice &lower_bound_slice,
+ const rocksdb::Slice &upper_bound_slice, bool read_current,
+ bool create_snapshot) {
+ return tx->get_iterator(column_family, is_rev_cf, skip_bloom_filter, lower_bound_slice,
+ upper_bound_slice, read_current, create_snapshot);
+}
+
+
+rocksdb::Iterator *rdb_tx_get_iterator(
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool is_rev_cf, bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current,
- bool create_snapshot) {
+ bool create_snapshot, bool use_locking_iter) {
if (commit_in_the_middle(thd)) {
DBUG_ASSERT(snapshot && *snapshot == nullptr);
if (snapshot) {
@@ -16065,8 +16622,8 @@ rocksdb::Iterator *rdb_tx_get_iterator(
}
} else {
Rdb_transaction *tx = get_tx_from_thd(thd);
- return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound,
- eq_cond_upper_bound, read_current, create_snapshot);
+ return tx->get_iterator(cf, is_rev_cf, skip_bloom_filter, eq_cond_lower_bound,
+ eq_cond_upper_bound, read_current, create_snapshot, use_locking_iter);
}
}
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index b5d35b448c1..f4aa151ea26 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -313,6 +313,13 @@ class ha_rocksdb : public my_core::handler {
const Rdb_key_def &kd,
const rocksdb::Slice &key) const;
+ int set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice,
+ const key_range *const end_key,
+ bool *use_locking_iterator);
+
int get_row_by_rowid(uchar *const buf, const char *const rowid,
const uint rowid_size, bool *skip_row = nullptr,
const bool skip_lookup = false,
@@ -958,6 +965,8 @@ class ha_rocksdb : public my_core::handler {
/* Need to build decoder on next read operation */
bool m_need_build_decoder;
+
+ int iter_status_to_retval(rocksdb::Iterator *it, const Rdb_key_def &kd, int not_found_code);
};
/*
@@ -1095,11 +1104,14 @@ Rdb_transaction *get_tx_from_thd(THD *const thd);
const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx);
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf,
+ bool is_rev_cf,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current = false,
- bool create_snapshot = true);
+ bool create_snapshot = true,
+ bool use_locking_iter= false);
rocksdb::Status rdb_tx_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
@@ -1172,4 +1184,6 @@ extern std::atomic<uint64_t> rocksdb_partial_index_groups_materialized;
extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
+extern std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
} // namespace myrocks
diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc
index eacae4b9315..0ad2349d8fb 100644
--- a/storage/rocksdb/nosql_access.cc
+++ b/storage/rocksdb/nosql_access.cc
@@ -676,10 +676,11 @@ class select_exec {
}
rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle *cf,
+ bool is_rev_cf,
bool use_bloom,
const rocksdb::Slice &lower_bound,
const rocksdb::Slice &upper_bound) {
- return rdb_tx_get_iterator(m_thd, cf, !use_bloom, lower_bound,
+ return rdb_tx_get_iterator(m_thd, cf, is_rev_cf, !use_bloom, lower_bound, //psergey-mergey-todo: or m_tx ?
upper_bound, nullptr);
}
@@ -1523,7 +1524,8 @@ bool INLINE_ATTR select_exec::setup_iterator(txn_wrapper *txn,
m_thd, *m_key_def, eq_slice, bound_len, m_lower_bound_buf.data(),
m_upper_bound_buf.data(), &m_lower_bound_slice, &m_upper_bound_slice);
rocksdb::Iterator *it = txn->get_iterator(
- m_key_def->get_cf(), use_bloom, m_lower_bound_slice, m_upper_bound_slice);
+ m_key_def->get_cf(), m_key_def->m_is_reverse_cf, use_bloom,
+ m_lower_bound_slice, m_upper_bound_slice);
if (it == nullptr) {
return true;
}
diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc
index cd015414fb1..45fb886f8ee 100644
--- a/storage/rocksdb/rdb_i_s.cc
+++ b/storage/rocksdb/rdb_i_s.cc
@@ -1771,31 +1771,63 @@ static int rdb_i_s_lock_info_fill_table(
}
/* cf id -> rocksdb::KeyLockInfo */
- std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info =
- rdb->GetLockStatusData();
-
- for (const auto &lock : lock_info) {
- const uint32_t cf_id = lock.first;
- const auto &key_lock_info = lock.second;
- const auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(),
- key_lock_info.key.length(), FN_REFLEN);
-
- for (const auto &id : key_lock_info.ids) {
- tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
- true);
- tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
-
- tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
- key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
- tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
- key_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
-
- /* Tell MySQL about this row in the virtual table */
- ret = static_cast<int>(
- my_core::schema_table_store_record(thd, tables->table));
-
- if (ret != 0) {
- break;
+ if (range_lock_mgr) {
+ // Use Range Lock Manager's interface for obtaining more specific
+ // information about the acquired locks
+ auto lock_info = range_lock_mgr->GetRangeLockStatusData();
+
+ for (const auto &lock : lock_info) {
+ const uint32_t cf_id = lock.first;
+ const auto &range_lock_info = lock.second;
+
+ std::string key_hexstr = rdb_hexdump_range(range_lock_info.start,
+ range_lock_info.end);
+
+ for (const auto &id : range_lock_info.ids) {
+ tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
+ true);
+ tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
+
+ tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
+ key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
+ tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
+ range_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
+
+ /* Tell MySQL about this row in the virtual table */
+ ret = static_cast<int>(
+ my_core::schema_table_store_record(thd, tables->table));
+
+ if (ret != 0) {
+ break;
+ }
+ }
+ }
+ } else {
+ std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info =
+ rdb->GetLockStatusData();
+
+ for (const auto &lock : lock_info) {
+ const uint32_t cf_id = lock.first;
+ const auto &key_lock_info = lock.second;
+ auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(),
+ key_lock_info.key.length(), FN_REFLEN);
+ for (const auto &id : key_lock_info.ids) {
+ tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
+ true);
+ tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
+
+ tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
+ key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
+ tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
+ key_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
+
+ /* Tell MySQL about this row in the virtual table */
+ ret = static_cast<int>(
+ my_core::schema_table_store_record(thd, tables->table));
+
+ if (ret != 0) {
+ break;
+ }
}
}
}
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index b66d539e2af..761bccfa966 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -36,6 +36,7 @@ Rdb_iterator_base::Rdb_iterator_base(THD *thd,
m_tbl_def(tbl_def),
m_thd(thd),
m_scan_it(nullptr),
+ m_use_locking_iter(false),
m_scan_it_skips_bloom(false),
m_scan_it_snapshot(nullptr),
m_scan_it_lower_bound(nullptr),
@@ -83,7 +84,7 @@ int Rdb_iterator_base::read_before_key(const bool full_key_match,
return HA_EXIT_SUCCESS;
}
- return HA_ERR_END_OF_FILE;
+ return iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
@@ -98,12 +99,15 @@ int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
*/
rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice);
- return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE;
+ return is_valid_iterator(m_scan_it) ?
+ HA_EXIT_SUCCESS :
+ iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
void Rdb_iterator_base::release_scan_iterator() {
delete m_scan_it;
m_scan_it = nullptr;
+ m_use_locking_iter = false;
if (m_scan_it_snapshot) {
auto rdb = rdb_get_rocksdb_db();
@@ -137,6 +141,10 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
skip_bloom = false;
}
+ // Save the value of m_use_locking_iter because release_scan_iterator()
+ // will set it to false.
+ bool use_locking_iter= m_use_locking_iter;
+
/*
In some cases, setup_scan_iterator() is called multiple times from
the same query but bloom filter can not always be used.
@@ -164,9 +172,10 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
*/
if (!m_scan_it) {
m_scan_it = rdb_tx_get_iterator(
- m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice,
+ m_thd, m_kd->get_cf(), m_kd->m_is_reverse_cf, skip_bloom,
+ m_scan_it_lower_bound_slice,
m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current,
- !read_current);
+ !read_current, use_locking_iter);
m_scan_it_skips_bloom = skip_bloom;
}
}
@@ -208,6 +217,20 @@ int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag,
return Rdb_key_def::INDEX_NUMBER_SIZE;
}
+int Rdb_iterator_base::iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code) {
+ if (it->Valid())
+ return HA_EXIT_SUCCESS;
+
+ rocksdb::Status s= it->status();
+ if (s.ok() || s.IsNotFound())
+ return not_found_code;
+
+ Rdb_transaction *tx = get_tx_from_thd(m_thd);
+ return rdb_tx_set_status_error(tx, s, *kd, m_tbl_def);
+}
+
int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
int rc = 0;
const auto &kd = *m_kd;
@@ -237,7 +260,7 @@ int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
}
if (!is_valid_iterator(m_scan_it)) {
- rc = HA_ERR_END_OF_FILE;
+ rc = iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
break;
}
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 13f53b62a6f..34d2b53f35d 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -53,6 +53,8 @@ class Rdb_iterator {
virtual rocksdb::Slice key() = 0;
virtual rocksdb::Slice value() = 0;
virtual void reset() = 0;
+
+ virtual void set_use_locking()=0;
};
class Rdb_iterator_base : public Rdb_iterator {
@@ -93,6 +95,7 @@ class Rdb_iterator_base : public Rdb_iterator {
void reset() override { release_scan_iterator(); }
+ void set_use_locking() override { m_use_locking_iter = true; }
protected:
friend class Rdb_iterator;
const std::shared_ptr<Rdb_key_def> m_kd;
@@ -107,6 +110,8 @@ class Rdb_iterator_base : public Rdb_iterator {
/* Iterator used for range scans and for full table/index scans */
rocksdb::Iterator *m_scan_it;
+ bool m_use_locking_iter;
+
/* Whether m_scan_it was created with skip_bloom=true */
bool m_scan_it_skips_bloom;
@@ -120,6 +125,10 @@ class Rdb_iterator_base : public Rdb_iterator {
uchar *m_prefix_buf;
rocksdb::Slice m_prefix_tuple;
+
+ int iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code);
};
class Rdb_iterator_partial : public Rdb_iterator_base {
diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc
new file mode 100644
index 00000000000..739f383a816
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.cc
@@ -0,0 +1,108 @@
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+
+/* This C++ file's header file */
+#include "./rdb_locking_iter.h"
+
+namespace myrocks {
+
+rocksdb::Iterator* GetLockingIterator(
+ rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ bool is_rev_cf,
+ ulonglong *counter) {
+ return new LockingIterator(trx, column_family, is_rev_cf, read_options,
+ counter);
+}
+
+/*
+ @brief
+ Seek to the first key K that is equal or greater than target,
+ locking the range [target; K].
+*/
+
+void LockingIterator::Seek(const rocksdb::Slice& target) {
+ iter_ = txn_->GetIterator(read_opts_, cfh_);
+ iter_->Seek(target);
+ ScanForward(target, false);
+}
+
+void LockingIterator::SeekForPrev(const rocksdb::Slice& target) {
+ iter_ = txn_->GetIterator(read_opts_, cfh_);
+ iter_->SeekForPrev(target);
+ ScanBackward(target, false);
+}
+
+/*
+ @brief
+ Move the iterator to the next key, locking the range between the current
+ and the next key.
+
+ @detail
+ Implementation is similar to Seek(next_key). Since we don't know what the
+ next_key is, we reach it by calling { Seek(current_key); Next(); }
+*/
+void LockingIterator::Next() {
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next");
+ assert(Valid());
+ // Save the current key value. We need it as the left endpoint
+ // of the range lock we're going to acquire
+ std::string current_key = iter_->key().ToString();
+
+ iter_->Next();
+ ScanForward(rocksdb::Slice(current_key), true);
+}
+
+/*
+ @brief
+ Move the iterator to the previous key, locking the range between the current
+ and the previous key.
+*/
+
+void LockingIterator::Prev() {
+ assert(Valid());
+
+ std::string current_key = iter_->key().ToString();
+ iter_->Prev();
+ ScanBackward(rocksdb::Slice(current_key), true);
+}
+
+
+/*
+ @detail
+ Ideally, this function should
+ - find the first key $first_key
+ - lock the range [-inf; $first_key]
+ - return, the iterator is positioned on $first_key
+
+ The problem here is that we cannot have "-infinity" bound.
+
+  Note: we don't have a practical use for this function - MyRocks always
+  searches within one index of one table, which means we only look at keys
+  that have the index_number as their prefix.
+*/
+
+void LockingIterator::SeekToFirst() {
+ DBUG_ASSERT(0);
+ status_ = rocksdb::Status::NotSupported("Not implemented");
+ valid_ = false;
+}
+
+/*
+ @detail
+ See SeekToFirst.
+*/
+
+void LockingIterator::SeekToLast() {
+ DBUG_ASSERT(0);
+ status_ = rocksdb::Status::NotSupported("Not implemented");
+ valid_ = false;
+}
+
+} // namespace myrocks
+
diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h
new file mode 100644
index 00000000000..5a9ed6c275d
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.h
@@ -0,0 +1,190 @@
+
+/* MySQL header files */
+#include "sql/handler.h" /* handler */
+#include "sql/debug_sync.h"
+#include "./rdb_threads.h" /* for thd_get_current_thd */
+
+/* MyRocks header files */
+#include "./ha_rocksdb.h"
+
+namespace myrocks {
+
+//////////////////////////////////////////////////////////////////////////////
+// Locking iterator
+//////////////////////////////////////////////////////////////////////////////
+
+//
+// LockingIterator is an iterator that locks the rows it returns, as well as
+// the scanned gaps between them.
+//
+// Example:
+// lock_iter= trx->GetLockingIterator();
+// lock_iter->Seek('abc');
+// lock_iter->Valid()==true && lock_iter->key() == 'bcd';
+//
+// After the above, the returned record 'bcd' is locked by transaction trx.
+// Also, the range ['abc'..'bcd'] is locked by trx and contains no other rows.
+//
+// lock_iter->Next();
+// lock_iter->Valid()==true && lock_iter->key() == 'efg'
+//
+// Now, the range ['bcd'..'efg'] (bounds inclusive) is also locked, and there are no
+// records between 'bcd' and 'efg'.
+//
+class LockingIterator : public rocksdb::Iterator {
+
+ rocksdb::Transaction *txn_;
+ rocksdb::ColumnFamilyHandle* cfh_;
+ bool m_is_rev_cf;
+ rocksdb::ReadOptions read_opts_;
+ rocksdb::Iterator *iter_;
+ rocksdb::Status status_;
+
+ // note: an iterator that has reached EOF has status()==OK && valid_==false
+ bool valid_;
+
+ ulonglong *lock_count_;
+ public:
+ LockingIterator(rocksdb::Transaction *txn,
+ rocksdb::ColumnFamilyHandle *cfh,
+ bool is_rev_cf,
+ const rocksdb::ReadOptions& opts,
+ ulonglong *lock_count=nullptr
+ ) :
+ txn_(txn), cfh_(cfh), m_is_rev_cf(is_rev_cf), read_opts_(opts), iter_(nullptr),
+ status_(rocksdb::Status::InvalidArgument()), valid_(false),
+ lock_count_(lock_count) {}
+
+ ~LockingIterator() {
+ delete iter_;
+ }
+
+ virtual bool Valid() const override { return valid_; }
+
+ // Note: MyRocks doesn't ever call these:
+ virtual void SeekToFirst() override;
+ virtual void SeekToLast() override;
+
+ virtual void Seek(const rocksdb::Slice& target) override;
+
+ // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ virtual void SeekForPrev(const rocksdb::Slice& target) override;
+
+ virtual void Next() override;
+ virtual void Prev() override;
+
+ virtual rocksdb::Slice key() const override {
+ assert(Valid());
+ return iter_->key();
+ }
+
+ virtual rocksdb::Slice value() const override {
+ assert(Valid());
+ return iter_->value();
+ }
+
+ virtual rocksdb::Status status() const override {
+ return status_;
+ }
+
+ private:
+ template <bool forward> void Scan(const rocksdb::Slice& target,
+ bool call_next) {
+ if (!iter_->Valid()) {
+ status_ = iter_->status();
+ valid_ = false;
+ return;
+ }
+
+ while (1) {
+ /*
+ note: the underlying iterator checks iterator bounds, so we don't need
+ to check them here
+ */
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.locking_iter_scan");
+ auto end_key = iter_->key();
+ bool endp_arg= m_is_rev_cf;
+ if (forward) {
+ status_ = txn_->GetRangeLock(cfh_,
+ rocksdb::Endpoint(target, endp_arg),
+ rocksdb::Endpoint(end_key, endp_arg));
+ } else {
+ status_ = txn_->GetRangeLock(cfh_,
+ rocksdb::Endpoint(end_key, endp_arg),
+ rocksdb::Endpoint(target, endp_arg));
+ }
+
+ if (!status_.ok()) {
+ // Failed to get a lock (most likely lock wait timeout)
+ valid_ = false;
+ return;
+ }
+ if (lock_count_) (*lock_count_)++;
+ std::string end_key_copy= end_key.ToString();
+
+ // Ok, now we have a lock that inhibits modifications in the range.
+ // However, before the lock was granted somebody might have:
+ // - removed the key we've found, or
+ // - added a key before that key.
+
+ // First, refresh the iterator:
+ delete iter_;
+ iter_ = txn_->GetIterator(read_opts_, cfh_);
+
+ // Then, try seeking to the same row
+ if (forward)
+ iter_->Seek(target);
+ else
+ iter_->SeekForPrev(target);
+
+ auto cmp= cfh_->GetComparator();
+
+ if (call_next && iter_->Valid() && !cmp->Compare(iter_->key(), target)) {
+ if (forward)
+ iter_->Next();
+ else
+ iter_->Prev();
+ }
+
+ if (iter_->Valid()) {
+ int inv = forward ? 1 : -1;
+ if (cmp->Compare(iter_->key(), rocksdb::Slice(end_key_copy))*inv <= 0) {
+ // Ok, the found key is within the range.
+ status_ = rocksdb::Status::OK();
+ valid_= true;
+ break;
+ } else {
+ // We've got a key but it is outside the range we've locked.
+ // Re-try the lock-and-read step.
+ continue;
+ }
+ } else {
+ // There's no row (perhaps because of the iterator bounds). Exit now.
+ // (we might already have locked a range in this function but there's
+ // nothing we can do about it)
+ valid_ = false;
+ status_ = iter_->status();
+ break;
+ }
+ }
+ }
+
+ inline void ScanForward(const rocksdb::Slice& target, bool call_next) {
+ Scan<true>(target, call_next);
+ }
+
+ inline void ScanBackward(const rocksdb::Slice& target, bool call_next) {
+ Scan<false>(target, call_next);
+ }
+};
+
+rocksdb::Iterator*
+GetLockingIterator(rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ bool is_rev_cf,
+ ulonglong *counter);
+
+} // namespace myrocks
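
The trickiest part of the class is the Scan<>() retry loop above. As a reading
aid, here is a condensed, hypothetical restatement of that control flow over a
plain std::map, with the RocksDB specifics (reverse column families, endpoint
flags, lock counting, iterator bounds) stripped out; the names below are
illustrative and do not appear in the patch.

#include <map>
#include <optional>
#include <string>

// Stand-in for the rocksdb::Transaction + Iterator pair used by
// LockingIterator; illustrative only.
struct Store {
  std::map<std::string, std::string> rows;

  // Real code calls rocksdb::Transaction::GetRangeLock(), which may fail on
  // a lock wait timeout. Here it always succeeds.
  bool lock_range(const std::string & /*from*/, const std::string & /*to*/) {
    return true;
  }

  // First key >= target, as a freshly opened iterator would see it.
  std::optional<std::string> seek(const std::string &target) {
    auto it = rows.lower_bound(target);
    if (it == rows.end()) return std::nullopt;
    return it->first;
  }
};

// Condensed control flow of the forward Scan<>(): return the first key >=
// target with the gap [target, key] locked, or nullopt on EOF/lock failure.
std::optional<std::string> locking_seek(Store &s, const std::string &target) {
  std::optional<std::string> key = s.seek(target);
  while (key) {
    if (!s.lock_range(target, *key))
      return std::nullopt;                 // e.g. lock wait timeout
    // The lock now blocks changes, but 'key' was chosen before it was taken:
    // a smaller key may have been inserted, or 'key' itself deleted.
    std::optional<std::string> recheck = s.seek(target);
    if (recheck && *recheck <= *key)
      return recheck;                      // still inside the locked range
    key = recheck;                         // locked range is stale, retry
  }
  return std::nullopt;                     // EOF
}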
diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc
index a723ac9e806..88695aa0539 100644
--- a/storage/rocksdb/rdb_utils.cc
+++ b/storage/rocksdb/rdb_utils.cc
@@ -259,6 +259,33 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
return str;
}
+/*
+ Print the range in hex, in "start_endpoint-end_endpoint" form
+*/
+
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& start,
+ const rocksdb::EndpointWithString& end) {
+ std::string res;
+ // Ranges with equal endpoints and no infinity suffix should look like point keys
+ if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) {
+ // This is a single-point range, show it like a key
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ } else {
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ if (start.inf_suffix)
+ res.append(":1");
+
+ res.append("-");
+
+ std::string key2 = rdb_hexdump(end.slice.c_str(), end.slice.length(),
+ FN_REFLEN);
+ if (end.inf_suffix)
+ key2.append(":1");
+ res.append(key2);
+ }
+ return res;
+}
+
/*
Attempt to access the database subdirectory to see if it exists
*/
diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h
index f49f102a08c..39f2096dd35 100644
--- a/storage/rocksdb/rdb_utils.h
+++ b/storage/rocksdb/rdb_utils.h
@@ -29,6 +29,7 @@
/* RocksDB header files */
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
/* MyRocks header files */
#include "./rdb_global.h"
@@ -290,6 +291,8 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
const std::size_t maxsize = 0)
MY_ATTRIBUTE((__nonnull__));
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& left,
+ const rocksdb::EndpointWithString& right);
/*
Helper function to see if a database exists
*/
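
As a worked example of the format the new helper produces (the exact hex
rendering of the key bytes comes from rdb_hexdump(), which this patch does not
change, so the literals below are only approximate, and the calling code is
made up):

#include <string>
#include "rocksdb/utilities/transaction_db.h"  // rocksdb::EndpointWithString
#include "./rdb_utils.h"

void hexdump_range_example() {
  rocksdb::EndpointWithString start, end;
  start.slice = std::string("\x00\x01\x02", 3);
  start.inf_suffix = false;
  end.slice = std::string("\x00\x01\xff", 3);
  end.inf_suffix = true;

  // Different endpoints: "<start hex>-<end hex>", with ":1" appended to an
  // endpoint that carries the infinity suffix, e.g. "000102-0001ff:1".
  std::string range_str = myrocks::rdb_hexdump_range(start, end);

  // Equal endpoints with no suffixes collapse to a single key dump,
  // e.g. "000102".
  end = start;
  std::string point_str = myrocks::rdb_hexdump_range(start, end);
  (void)range_str;
  (void)point_str;
}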
[Commits] 7563d4c: MDEV-26135 Assertion failure when executing PS with a hanging recursive CTE
by IgorBabaev 17 Jul '21
by IgorBabaev 17 Jul '21
17 Jul '21
revision-id: 7563d4c9270ebb789ca645266d34dc83fb6bd784 (mariadb-10.2.31-1059-g7563d4c)
parent(s): c47e4aab62c65e1a1d431f9888ba1bc6b9841687
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-16 22:46:50 -0700
message:
MDEV-26135 Assertion failure when executing PS with a hanging recursive CTE
The bug affected the execution in PREPARE mode of queries whose WITH clauses
contained so-called hanging recursive CTEs. A CTE is hanging if it is not
used in the query. Preparing a statement from a query with a hanging CTE
caused a memory leak in the server, and executing the prepared statement led
to an assertion failure in a server built in debug mode. This happened
because the units specifying recursive CTEs were erroneously not cleaned up
when those CTEs were hanging.
The patch enforces cleanup of hanging recursive CTEs in the same way as
other hanging CTEs.
Approved by dmitry.shulga(a)mariadb.com
---
mysql-test/r/cte_recursive.result | 27 +++++++++++++++++++++++++++
mysql-test/t/cte_recursive.test | 21 +++++++++++++++++++++
sql/sql_union.cc | 6 ++++--
3 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/mysql-test/r/cte_recursive.result b/mysql-test/r/cte_recursive.result
index 3e92652..a4d821e 100644
--- a/mysql-test/r/cte_recursive.result
+++ b/mysql-test/r/cte_recursive.result
@@ -4454,5 +4454,32 @@ deallocate prepare stmt;
drop table folks;
set big_tables=@save_big_tables;
#
+# MDEV-26135: execution of PS for query with hanging recursive CTE
+#
+create table t1 (a int);
+insert into t1 values (5), (7);
+create table t2 (b int);
+insert into t2 values (3), (7), (1);
+with recursive r as (select a from t1 union select a+1 from r where a < 10)
+select * from t2;
+b
+3
+7
+1
+prepare stmt from "with recursive r as (select a from t1 union select a+1 from r where a < 10)
+select * from t2";
+execute stmt;
+b
+3
+7
+1
+execute stmt;
+b
+3
+7
+1
+deallocate prepare stmt;
+drop table t1,t2;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/cte_recursive.test b/mysql-test/t/cte_recursive.test
index 849e76b..49f9c1f 100644
--- a/mysql-test/t/cte_recursive.test
+++ b/mysql-test/t/cte_recursive.test
@@ -2820,5 +2820,26 @@ drop table folks;
set big_tables=@save_big_tables;
--echo #
+--echo # MDEV-26135: execution of PS for query with hanging recursive CTE
+--echo #
+
+create table t1 (a int);
+insert into t1 values (5), (7);
+create table t2 (b int);
+insert into t2 values (3), (7), (1);
+
+let $q=
+with recursive r as (select a from t1 union select a+1 from r where a < 10)
+select * from t2;
+
+eval $q;
+eval prepare stmt from "$q";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+drop table t1,t2;
+
+--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index 7baedfb..e5648e6 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -1382,7 +1382,8 @@ bool st_select_lex_unit::cleanup()
{
DBUG_RETURN(FALSE);
}
- if (with_element && with_element->is_recursive && union_result)
+ if (with_element && with_element->is_recursive && union_result &&
+ with_element->rec_outer_references)
{
select_union_recursive *result= with_element->rec_result;
if (++result->cleanup_count == with_element->rec_outer_references)
@@ -1584,7 +1585,8 @@ bool st_select_lex::cleanup()
for (SELECT_LEX_UNIT *lex_unit= first_inner_unit(); lex_unit ;
lex_unit= lex_unit->next_unit())
{
- if (lex_unit->with_element && lex_unit->with_element->is_recursive)
+ if (lex_unit->with_element && lex_unit->with_element->is_recursive &&
+ lex_unit->with_element->rec_outer_references)
continue;
error= (bool) ((uint) error | (uint) lex_unit->cleanup());
}
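
Both hunks key off rec_outer_references: the recursive-CTE branch defers the
real cleanup until the last outer reference has been cleaned up, and for a
hanging CTE that count is zero, so the cleanup never ran. With the added
guards, a recursive CTE with zero outer references now falls through to the
ordinary cleanup path. A generic sketch of the corrected pattern (illustrative
names, not server code):

// Illustrative only: reference-counted cleanup with a special case for
// objects that nobody references (the "hanging" case).
struct SharedUnit {
  int outer_references = 0;   // users that must call cleanup() first
  int cleanup_count = 0;
  bool cleaned = false;
  void real_cleanup() { cleaned = true; }
};

void cleanup(SharedUnit &u) {
  if (u.outer_references == 0) {                // hanging: clean up right away
    u.real_cleanup();
    return;
  }
  if (++u.cleanup_count == u.outer_references)  // the last reference cleans up
    u.real_cleanup();
}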
[Commits] 7929388: MDEV-26135 Assertion failure when executing PS with a hanging recursive CTE
by IgorBabaev 17 Jul '21
revision-id: 7929388ed155a65f0693a98590b943bed0421aec (mariadb-10.2.31-1059-g7929388)
parent(s): c47e4aab62c65e1a1d431f9888ba1bc6b9841687
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-16 22:44:35 -0700
message:
MDEV-26135 Assertion failure when executing PS with a hanging recursive CTE
The bug affected the execution in PREPARE mode of queries whose WITH clauses
contained so-called hanging recursive CTEs. A CTE is hanging if it is not
used in the query. Preparing a statement from a query with a hanging CTE
caused a memory leak in the server, and executing the prepared statement led
to an assertion failure in a server built in debug mode. This happened
because the units specifying recursive CTEs were erroneously not cleaned up
when those CTEs were hanging.
The patch enforces cleanup of hanging recursive CTEs in the same way as
other hanging CTEs.
Approved by dmitry.shulga(a)mariadb.com
---
mysql-test/r/cte_recursive.result | 27 +++++++++++++++++++++++++++
mysql-test/t/cte_recursive.test | 21 +++++++++++++++++++++
sql/sql_union.cc | 6 ++++--
3 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/mysql-test/r/cte_recursive.result b/mysql-test/r/cte_recursive.result
index 3e92652..a4d821e 100644
--- a/mysql-test/r/cte_recursive.result
+++ b/mysql-test/r/cte_recursive.result
@@ -4454,5 +4454,32 @@ deallocate prepare stmt;
drop table folks;
set big_tables=@save_big_tables;
#
+# MDEV-26135: execution of PS for query with hanging recursive CTE
+#
+create table t1 (a int);
+insert into t1 values (5), (7);
+create table t2 (b int);
+insert into t2 values (3), (7), (1);
+with recursive r as (select a from t1 union select a+1 from r where a < 10)
+select * from t2;
+b
+3
+7
+1
+prepare stmt from "with recursive r as (select a from t1 union select a+1 from r where a < 10)
+select * from t2";
+execute stmt;
+b
+3
+7
+1
+execute stmt;
+b
+3
+7
+1
+deallocate prepare stmt;
+drop table t1,t2;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/cte_recursive.test b/mysql-test/t/cte_recursive.test
index 849e76b..49f9c1f 100644
--- a/mysql-test/t/cte_recursive.test
+++ b/mysql-test/t/cte_recursive.test
@@ -2820,5 +2820,26 @@ drop table folks;
set big_tables=@save_big_tables;
--echo #
+--echo # MDEV-26135: execution of PS for query with hanging recursive CTE
+--echo #
+
+create table t1 (a int);
+insert into t1 values (5), (7);
+create table t2 (b int);
+insert into t2 values (3), (7), (1);
+
+let $q=
+with recursive r as (select a from t1 union select a+1 from r where a < 10)
+select * from t2;
+
+eval $q;
+eval prepare stmt from "$q";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+drop table t1,t2;
+
+--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index 7baedfb..e5648e6 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -1382,7 +1382,8 @@ bool st_select_lex_unit::cleanup()
{
DBUG_RETURN(FALSE);
}
- if (with_element && with_element->is_recursive && union_result)
+ if (with_element && with_element->is_recursive && union_result &&
+ with_element->rec_outer_references)
{
select_union_recursive *result= with_element->rec_result;
if (++result->cleanup_count == with_element->rec_outer_references)
@@ -1584,7 +1585,8 @@ bool st_select_lex::cleanup()
for (SELECT_LEX_UNIT *lex_unit= first_inner_unit(); lex_unit ;
lex_unit= lex_unit->next_unit())
{
- if (lex_unit->with_element && lex_unit->with_element->is_recursive)
+ if (lex_unit->with_element && lex_unit->with_element->is_recursive &&
+ lex_unit->with_element->rec_outer_references)
continue;
error= (bool) ((uint) error | (uint) lex_unit->cleanup());
}
[Commits] b326f3d: Update Xpand to: Change Xpand Version to 6.0.1, part 2
13 Jul '21
revision-id: b326f3d672bc5a7775af83e97ababc6117640af2 (mariadb-10.5.2-1205-gb326f3d672b)
parent(s): e9a2dca4d3073d77ca13f77dd3efa779a077152d
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-07-13 21:09:41 +0300
message:
Update Xpand to: Change Xpand Version to 6.0.1, part 2
---
storage/xpand | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/storage/xpand b/storage/xpand
index ea285dff95c..e578dbf7ed7 160000
--- a/storage/xpand
+++ b/storage/xpand
@@ -1 +1 @@
-Subproject commit ea285dff95c0c192223bea3a089d1e0a54228c1d
+Subproject commit e578dbf7ed7d18c62ca18c64ea4c6bedbf6b0a11
[Commits] efb075c: MDEV-25565 Preliminary commit
09 Jul '21
revision-id: efb075c752ad37402fb754a53523e07fd24e8188 (mariadb-10.2.31-1002-gefb075c)
parent(s): 4352c77c5a3ac89acc5fd90a38f806d0ec500aa4
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-09 12:00:02 -0700
message:
MDEV-25565 Preliminary commit
Some new test cases are to be added after rebase.
---
mysql-test/r/win.result | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
mysql-test/t/win.test | 70 ++++++++++++++++++++++++++++++
sql/sql_union.cc | 26 ++++++++++++
sql/sql_window.cc | 12 ++++++
sql/sql_window.h | 5 ++-
5 files changed, 223 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/win.result b/mysql-test/r/win.result
index 8a31dcc..432c12b 100644
--- a/mysql-test/r/win.result
+++ b/mysql-test/r/win.result
@@ -3911,5 +3911,116 @@ sum(i) over () IN ( SELECT 1 FROM t1 a)
0
DROP TABLE t1;
#
+# MDEV-25565: 2-nd call of SP with SELECT joining a view / derived table
+and returning the result of calculation of 2 window
+functions that use the same window specification
+#
+create table t1 (a int);
+insert into t1 values (3), (7), (1), (7), (1), (1), (3), (1), (5);
+create table t2 (b int);
+insert into t2 values (1), (4), (9), (8), (2), (9), (7), (1);
+create view v2 as select a from t1 group by a;
+create view v1 as select * from v2;
+create procedure sp1() select v1.a,
+sum(v1.a) over (partition by v1.a order by v1.a) as k,
+avg(v1.a) over (partition by v1.a order by v1.a) as m
+from v1, t2 where t2.b = v1.a;
+call sp1();
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+call sp1();
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+prepare stmt from "select v1.a,
+sum(v1.a) over (partition by v1.a order by v1.a) as k,
+avg(v1.a) over (partition by v1.a order by v1.a) as m
+from v1, t2 where t2.b = v1.a";
+execute stmt;
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp2() select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from (select * from v2) as dt1, t2 where t2.b=dt1.a ) as dt;
+call sp2();
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+call sp2();
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+prepare stmt from "select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from (select * from v2) as dt1, t2 where t2.b=dt1.a ) as dt";
+execute stmt;
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+deallocate prepare stmt;
+create procedure sp3() select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from ( select * from (select * from t1 group by a) as dt2 ) as dt1,
+t2
+where t2.b=dt1.a ) as dt;
+call sp3();
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+call sp3();
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+prepare stmt from "select * from
+( select dt1.a,
+sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+from ( select * from (select * from t1 group by a) as dt2 ) as dt1,
+t2
+where t2.b=dt1.a ) as dt";
+execute stmt;
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+execute stmt;
+a k m
+1 2 1.0000
+1 2 1.0000
+7 7 7.0000
+deallocate prepare stmt;
+drop procedure sp1;
+drop procedure sp2;
+drop procedure sp3;
+drop view v1,v2;
+drop table t1,t2;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/win.test b/mysql-test/t/win.test
index c07a81f..e42c8c9 100644
--- a/mysql-test/t/win.test
+++ b/mysql-test/t/win.test
@@ -2557,5 +2557,75 @@ SELECT sum(i) over () IN ( SELECT 1 FROM t1 a) FROM t1;
DROP TABLE t1;
--echo #
+--echo # MDEV-25565: 2-nd call of SP with SELECT joining a view / derived table
+--echo and returning the result of calculation of 2 window
+--echo functions that use the same window specification
+--echo #
+
+create table t1 (a int);
+insert into t1 values (3), (7), (1), (7), (1), (1), (3), (1), (5);
+create table t2 (b int);
+insert into t2 values (1), (4), (9), (8), (2), (9), (7), (1);
+
+create view v2 as select a from t1 group by a;
+create view v1 as select * from v2;
+
+let $q1=
+select v1.a,
+ sum(v1.a) over (partition by v1.a order by v1.a) as k,
+ avg(v1.a) over (partition by v1.a order by v1.a) as m
+from v1, t2 where t2.b = v1.a;
+
+eval create procedure sp1() $q1;
+call sp1();
+call sp1();
+
+eval prepare stmt from "$q1";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+
+let $q2=
+select * from
+ ( select dt1.a,
+ sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+ avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+ from (select * from v2) as dt1, t2 where t2.b=dt1.a ) as dt;
+
+eval create procedure sp2() $q2;
+call sp2();
+call sp2();
+
+eval prepare stmt from "$q2";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+let $q3=
+select * from
+ ( select dt1.a,
+ sum(dt1.a) over (partition by dt1.a order by dt1.a) as k,
+ avg(dt1.a) over (partition by dt1.a order by dt1.a) as m
+ from ( select * from (select * from t1 group by a) as dt2 ) as dt1,
+ t2
+ where t2.b=dt1.a ) as dt;
+
+eval create procedure sp3() $q3;
+call sp3();
+call sp3();
+
+eval prepare stmt from "$q3";
+execute stmt;
+execute stmt;
+deallocate prepare stmt;
+
+drop procedure sp1;
+drop procedure sp2;
+drop procedure sp3;
+drop view v1,v2;
+drop table t1,t2;
+
+--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_union.cc b/sql/sql_union.cc
index 7baedfb..f3c90b8 100644
--- a/sql/sql_union.cc
+++ b/sql/sql_union.cc
@@ -30,6 +30,7 @@
#include "filesort.h" // filesort_free_buffers
#include "sql_view.h"
#include "sql_cte.h"
+#include "item_windowfunc.h"
bool mysql_union(THD *thd, LEX *lex, select_result *result,
SELECT_LEX_UNIT *unit, ulong setup_tables_done_option)
@@ -1550,6 +1551,29 @@ static void cleanup_order(ORDER *order)
}
+static void cleanup_window_funcs(List<Item_window_func> &win_funcs)
+{
+ List_iterator_fast<Item_window_func> it(win_funcs);
+ Item_window_func *win_func;
+ while ((win_func= it++))
+ {
+ Window_spec *win_spec= win_func->window_spec;
+ if (!win_spec)
+ continue;
+ if (win_spec->save_partition_list)
+ {
+ win_spec->partition_list= win_spec->save_partition_list;
+ win_spec->save_partition_list= NULL;
+ }
+ if (win_spec->save_order_list)
+ {
+ win_spec->order_list= win_spec->save_order_list;
+ win_spec->save_order_list= NULL;
+ }
+ }
+}
+
+
bool st_select_lex::cleanup()
{
bool error= FALSE;
@@ -1558,6 +1582,8 @@ bool st_select_lex::cleanup()
cleanup_order(order_list.first);
cleanup_order(group_list.first);
+ cleanup_window_funcs(window_funcs);
+
if (join)
{
List_iterator<TABLE_LIST> ti(leaf_tables);
diff --git a/sql/sql_window.cc b/sql/sql_window.cc
index 612c6e6..3ef751b 100644
--- a/sql/sql_window.cc
+++ b/sql/sql_window.cc
@@ -479,9 +479,15 @@ int compare_window_funcs_by_window_specs(Item_window_func *win_func1,
Let's use only one of the lists.
*/
if (!win_spec1->name() && win_spec2->name())
+ {
+ win_spec1->save_partition_list= win_spec1->partition_list;
win_spec1->partition_list= win_spec2->partition_list;
+ }
else
+ {
+ win_spec2->save_partition_list= win_spec2->partition_list;
win_spec2->partition_list= win_spec1->partition_list;
+ }
cmp= compare_order_lists(win_spec1->order_list,
win_spec2->order_list);
@@ -494,9 +500,15 @@ int compare_window_funcs_by_window_specs(Item_window_func *win_func1,
Let's use only one of the lists.
*/
if (!win_spec1->name() && win_spec2->name())
+ {
+ win_spec1->save_order_list= win_spec1->order_list;
win_spec1->order_list= win_spec2->order_list;
+ }
else
+ {
+ win_spec2->save_order_list= win_spec2->order_list;
win_spec2->order_list= win_spec1->order_list;
+ }
cmp= compare_window_frames(win_spec1->window_frame,
win_spec2->window_frame);
diff --git a/sql/sql_window.h b/sql/sql_window.h
index e0c1563..417d0bc 100644
--- a/sql/sql_window.h
+++ b/sql/sql_window.h
@@ -99,8 +99,10 @@ class Window_spec : public Sql_alloc
LEX_STRING *window_ref;
SQL_I_List<ORDER> *partition_list;
+ SQL_I_List<ORDER> *save_partition_list;
SQL_I_List<ORDER> *order_list;
+ SQL_I_List<ORDER> *save_order_list;
Window_frame *window_frame;
@@ -111,7 +113,8 @@ class Window_spec : public Sql_alloc
SQL_I_List<ORDER> *ord_list,
Window_frame *win_frame)
: window_names_are_checked(false), window_ref(win_ref),
- partition_list(part_list), order_list(ord_list),
+ partition_list(part_list), save_partition_list(NULL),
+ order_list(ord_list), save_order_list(NULL),
window_frame(win_frame), referenced_win_spec(NULL) {}
virtual char *name() { return NULL; }
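
The new save_partition_list / save_order_list members implement a plain
save-and-restore of shared pointers: while two equivalent window specs share
one list for the duration of a single execution, cleanup_window_funcs() puts
the original pointers back so that the next execution of the stored procedure
or prepared statement starts from the parsed, unshared state. A generic sketch
of that idea (illustrative names, not MariaDB code):

// Illustrative only: a window spec reduced to the two pointers involved.
struct Spec {
  int *order_list = nullptr;        // stands in for SQL_I_List<ORDER>*
  int *save_order_list = nullptr;
};

// At optimization time: make s1 reuse s2's list, remembering s1's own list
// (mirrors what compare_window_funcs_by_window_specs() does above).
void share_order_list(Spec &s1, const Spec &s2) {
  s1.save_order_list = s1.order_list;
  s1.order_list = s2.order_list;
}

// At cleanup time: undo the sharing so a re-execution starts from the
// original, unshared lists (mirrors cleanup_window_funcs() above).
void restore_order_list(Spec &s) {
  if (s.save_order_list) {
    s.order_list = s.save_order_list;
    s.save_order_list = nullptr;
  }
}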
[Commits] 78735dc: MDEV-26108 Crash with query referencing twice CTE that uses embedded recursive CTE
by IgorBabaev 09 Jul '21
revision-id: 78735dcaf757cd71c8f0ff3d21071b0f89018150 (mariadb-10.4.20-31-g78735dc)
parent(s): e56fe393104960eb62043c3777ce7d21de9362f4
author: Igor Babaev
committer: Igor Babaev
timestamp: 2021-07-08 17:47:17 -0700
message:
MDEV-26108 Crash with query referencing twice CTE that uses embedded recursive CTE
This bug could affect queries that had at least two references to a CTE that
used an embedded recursive CTE.
Starting from version 10.4, some code in With_element::clone_parsed_spec()
that assumed a certain order of selects after parsing the specification of
a CTE was no longer valid. This could lead to global select lists in which
some selects were missing. If a missing select happened to belong to the
recursive part of a recursive CTE, some recursive table references were not
set up as references to materialized derived tables, and this caused a crash
of the server.
Approved by Oleksandr Byelkin <sanja(a)mariadb.com>
---
mysql-test/main/cte_nonrecursive.result | 2 +-
mysql-test/main/cte_recursive.result | 19 +++++++++++++++++++
mysql-test/main/cte_recursive.test | 17 +++++++++++++++++
sql/sql_cte.cc | 10 +++++++---
4 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/mysql-test/main/cte_nonrecursive.result b/mysql-test/main/cte_nonrecursive.result
index 040afdf..4cd466a 100644
--- a/mysql-test/main/cte_nonrecursive.result
+++ b/mysql-test/main/cte_nonrecursive.result
@@ -1126,7 +1126,7 @@ NULL UNION RESULT <union4,5> ALL NULL NULL NULL NULL NULL NULL
NULL UNION RESULT <union11,12> ALL NULL NULL NULL NULL NULL NULL
NULL UNION RESULT <union1,6> ALL NULL NULL NULL NULL NULL NULL
Warnings:
-Note 1003 with cte_e as (with cte_o as (with cte_i as (select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` < 7)select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` > 1)select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` < 3 and `test`.`t1`.`a` > 1 and `test`.`t1`.`a` < 7 and `test`.`t1`.`a` > 1 union select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` > 4 and `test`.`t1`.`a` > 1 and `test`.`t1`.`a` < 7 and `test`.`t1`.`a` > 1)select `cte_e1`.`a` AS `a` from `cte_e` `cte_e1` where `cte_e1`.`a` > 1 union select `cte_e2`.`a` AS `a` from `cte_e` `cte_e2`
+Note 1003 with cte_e as (with cte_o as (with cte_i as (/* select#2 */ select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` < 7)/* select#3 */ select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` > 1)/* select#4 */ select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` < 3 and `test`.`t1`.`a` > 1 and `test`.`t1`.`a` < 7 and `test`.`t1`.`a` > 1 union /* select#5 */ select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` > 4 and `test`.`t1`.`a` > 1 and `test`.`t1`.`a` < 7 and `test`.`t1`.`a` > 1)/* select#1 */ select `cte_e1`.`a` AS `a` from `cte_e` `cte_e1` where `cte_e1`.`a` > 1 union /* select#6 */ select `cte_e2`.`a` AS `a` from `cte_e` `cte_e2`
drop table t1;
#
# MDEV-13753: embedded CTE in a VIEW created in prepared statement
diff --git a/mysql-test/main/cte_recursive.result b/mysql-test/main/cte_recursive.result
index 6f30de3..74b450f 100644
--- a/mysql-test/main/cte_recursive.result
+++ b/mysql-test/main/cte_recursive.result
@@ -4791,3 +4791,22 @@ a
NULL
DROP TABLE t1;
# End of 10.3 tests
+#
+# MDEV-26108: Recursive CTE embedded into another CTE which is used twice
+#
+create table t1 (a int);
+insert into t1 values (5), (7);
+with cte_e as (
+with recursive cte_r as (
+select a from t1 union select a+1 as a from cte_r r where a < 10
+) select * from cte_r
+) select * from cte_e s1, cte_e s2 where s1.a=s2.a;
+a a
+5 5
+7 7
+6 6
+8 8
+9 9
+10 10
+drop table t1;
+# End of 10.4 tests
diff --git a/mysql-test/main/cte_recursive.test b/mysql-test/main/cte_recursive.test
index c3537e5..3b140b3 100644
--- a/mysql-test/main/cte_recursive.test
+++ b/mysql-test/main/cte_recursive.test
@@ -3087,3 +3087,20 @@ SELECT * FROM cte;
DROP TABLE t1;
--echo # End of 10.3 tests
+
+--echo #
+--echo # MDEV-26108: Recursive CTE embedded into another CTE which is used twice
+--echo #
+
+create table t1 (a int);
+insert into t1 values (5), (7);
+
+with cte_e as (
+ with recursive cte_r as (
+ select a from t1 union select a+1 as a from cte_r r where a < 10
+ ) select * from cte_r
+) select * from cte_e s1, cte_e s2 where s1.a=s2.a;
+
+drop table t1;
+
+--echo # End of 10.4 tests
diff --git a/sql/sql_cte.cc b/sql/sql_cte.cc
index dfcb4e1..c5dcc15 100644
--- a/sql/sql_cte.cc
+++ b/sql/sql_cte.cc
@@ -1038,6 +1038,7 @@ st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
bool parse_status= false;
st_select_lex *with_select;
+ st_select_lex *last_clone_select;
char save_end= unparsed_spec.str[unparsed_spec.length];
((char*) &unparsed_spec.str[unparsed_spec.length])[0]= '\0';
@@ -1124,11 +1125,14 @@ st_select_lex_unit *With_element::clone_parsed_spec(LEX *old_lex,
lex->unit.include_down(with_table->select_lex);
lex->unit.set_slave(with_select);
lex->unit.cloned_from= spec;
+ last_clone_select= lex->all_selects_list;
+ while (last_clone_select->next_select_in_list())
+ last_clone_select= last_clone_select->next_select_in_list();
old_lex->all_selects_list=
(st_select_lex*) (lex->all_selects_list->
- insert_chain_before(
- (st_select_lex_node **) &(old_lex->all_selects_list),
- with_select));
+ insert_chain_before(
+ (st_select_lex_node **) &(old_lex->all_selects_list),
+ last_clone_select));
/*
Now all references to the CTE defined outside of the cloned specification
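
In list terms, the fix splices the entire cloned chain of selects into the
outer all_selects_list instead of only the part that starts at with_select, by
first walking to the chain's tail. A generic sketch of that step (plain
singly-linked nodes, not the st_select_lex_node API):

#include <cassert>

// Generic singly-linked node; illustrative only.
struct Node {
  Node *next = nullptr;
};

// Splice the whole chain starting at 'chain_head' in front of '*list',
// walking to the chain's tail first -- the step the patch adds.
void splice_whole_chain(Node **list, Node *chain_head) {
  assert(chain_head != nullptr);
  Node *tail = chain_head;
  while (tail->next)            // find the last cloned select
    tail = tail->next;
  tail->next = *list;           // attach the old list after the whole chain
  *list = chain_head;           // the outer list now begins with the clones
}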