[Commits] 8b62c2f5071: MDEV-27188: Suppress optimizer output when executing prepare
by psergey 07 Mar '22
by psergey 07 Mar '22
07 Mar '22
revision-id: 8b62c2f507113e988bceece8c29c074d1587260a (mariadb-10.7.1-3-g8b62c2f5071)
parent(s): 06988bdcaa2d1af2c178c199b7f65dbafda45a2c
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-03-07 16:59:15 +0300
message:
MDEV-27188: Suppress optimizer output when executing prepare
- Do not write anything into Optimizer Trace at Prepare phase
- When the query gets an error at Prepare phase, make sure there
is no trace written, either. This is important as we need to
produce the same trace for "mtr --ps-protocol" and regular mtr run.
- For other kinds of errors, trace is still produced as it might be
valuable.
---
mysql-test/main/opt_trace,ps.rdiff | 92 ---
mysql-test/main/opt_trace.result | 621 +++------------------
mysql-test/main/opt_trace_index_merge.result | 11 +-
.../main/opt_trace_index_merge_innodb.result | 11 +-
mysql-test/main/opt_trace_security.result | 38 +-
sql/my_json_writer.h | 8 +
sql/opt_subselect.cc | 13 +-
sql/opt_trace.cc | 60 ++
sql/opt_trace.h | 37 +-
sql/opt_trace_context.h | 6 +
sql/sql_derived.cc | 30 +-
sql/sql_parse.cc | 4 +
sql/sql_prepare.cc | 11 -
sql/sql_select.cc | 17 +-
14 files changed, 235 insertions(+), 724 deletions(-)
diff --git a/mysql-test/main/opt_trace,ps.rdiff b/mysql-test/main/opt_trace,ps.rdiff
deleted file mode 100644
index 3e2218de673..00000000000
--- a/mysql-test/main/opt_trace,ps.rdiff
+++ /dev/null
@@ -1,92 +0,0 @@
---- /Users/shulga/projects/mariadb/server-10.6/mysql-test/main/opt_trace.result 2021-07-21 19:17:11.000000000 +0700
-+++ /Users/shulga/projects/mariadb/server-10.6/mysql-test/main/opt_trace.reject 2021-07-21 19:17:48.000000000 +0700
-@@ -2829,14 +2829,6 @@
- }
- },
- {
-- "transformation": {
-- "select_id": 2,
-- "from": "IN (SELECT)",
-- "to": "semijoin",
-- "chosen": true
-- }
-- },
-- {
- "expanded_query": "/* select#2 */ select t10.pk from t10"
- }
- ]
-@@ -4402,14 +4394,6 @@
- }
- },
- {
-- "transformation": {
-- "select_id": 2,
-- "from": "IN (SELECT)",
-- "to": "semijoin",
-- "chosen": true
-- }
-- },
-- {
- "expanded_query": "/* select#2 */ select t_inner_1.a from t1 t_inner_1 join t1 t_inner_2"
- }
- ]
-@@ -4852,14 +4836,6 @@
- }
- },
- {
-- "transformation": {
-- "select_id": 2,
-- "from": "IN (SELECT)",
-- "to": "semijoin",
-- "chosen": true
-- }
-- },
-- {
- "expanded_query": "/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1"
- }
- ]
-@@ -4879,14 +4855,6 @@
- }
- },
- {
-- "transformation": {
-- "select_id": 3,
-- "from": "IN (SELECT)",
-- "to": "semijoin",
-- "chosen": true
-- }
-- },
-- {
- "expanded_query": "/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4"
- }
- ]
-@@ -6432,14 +6400,6 @@
- }
- },
- {
-- "transformation": {
-- "select_id": 2,
-- "from": "IN (SELECT)",
-- "to": "semijoin",
-- "chosen": true
-- }
-- },
-- {
- "expanded_query": "/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1"
- }
- ]
-@@ -6459,14 +6419,6 @@
- }
- },
- {
-- "transformation": {
-- "select_id": 3,
-- "from": "IN (SELECT)",
-- "to": "semijoin",
-- "chosen": true
-- }
-- },
-- {
- "expanded_query": "/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4"
- }
- ]
diff --git a/mysql-test/main/opt_trace.result b/mysql-test/main/opt_trace.result
index b0c2a9ca4d9..18ef904633b 100644
--- a/mysql-test/main/opt_trace.result
+++ b/mysql-test/main/opt_trace.result
@@ -38,36 +38,18 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select * from v1 {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t1.a AS a,t1.b AS b from v1",
"steps": [
{
"view": {
"table": "v1",
"select_id": 2,
- "algorithm": "merged"
+ "algorithm": "merged",
+ "expanded_query": "/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1"
}
},
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1"
- }
- ]
- }
- },
- {
- "expanded_query": "/* select#1 */ select t1.a AS a,t1.b AS b from v1"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"condition_processing": {
"condition": "WHERE",
@@ -189,36 +171,18 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select * from (select * from t1 where t1.a=1)q {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t1.a AS a,t1.b AS b from (/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1) q",
"steps": [
{
"derived": {
"table": "q",
"select_id": 2,
- "algorithm": "merged"
- }
- },
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1"
- }
- ]
+ "algorithm": "merged",
+ "expanded_query": "/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1"
}
},
- {
- "expanded_query": "/* select#1 */ select t1.a AS a,t1.b AS b from (/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1) q"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"condition_processing": {
"condition": "WHERE",
@@ -340,40 +304,15 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select * from v2 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "view": {
- "table": "v2",
- "select_id": 2,
- "algorithm": "materialized"
- }
- },
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1 group by t1.b"
- }
- ]
- }
- },
- {
- "expanded_query": "/* select#1 */ select v2.a AS a,v2.b AS b from v2"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select v2.a AS a,v2.b AS b from v2",
"steps": [
{
"join_optimization": {
"select_id": 2,
+ "expanded_query": "/* select#2 */ select t1.a AS a,t1.b AS b from t1 where t1.a = 1 group by t1.b",
"steps": [
{
"condition_processing": {
@@ -590,36 +529,18 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from v2 {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t2.a AS a from v2",
"steps": [
{
"view": {
"table": "v2",
"select_id": 2,
- "algorithm": "merged"
+ "algorithm": "merged",
+ "expanded_query": "/* select#2 */ select t2.a AS a from t2"
}
},
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select t2.a AS a from t2"
- }
- ]
- }
- },
- {
- "expanded_query": "/* select#1 */ select t2.a AS a from v2"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"table_dependencies": [
{
@@ -702,40 +623,15 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from v1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "view": {
- "table": "v1",
- "select_id": 2,
- "algorithm": "materialized"
- }
- },
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select t1.a AS a from t1 group by t1.b"
- }
- ]
- }
- },
- {
- "expanded_query": "/* select#1 */ select v1.a AS a from v1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select v1.a AS a from v1",
"steps": [
{
"join_optimization": {
"select_id": 2,
+ "expanded_query": "/* select#2 */ select t1.a AS a from t1 group by t1.b",
"steps": [
{
"table_dependencies": [
@@ -911,19 +807,10 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1,t2 where t1.a=t2.b+2 and t2.a= t1.b {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.a AS a,t1.b AS b,t1.c AS c,t2.a AS a,t2.b AS b,t2.c AS c from t1 join t2 where t1.a = t2.b + 2 and t2.a = t1.b"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.a AS a,t1.b AS b,t1.c AS c,t2.a AS a,t2.b AS b,t2.c AS c from t1 join t2 where t1.a = t2.b + 2 and t2.a = t1.b",
"steps": [
{
"condition_processing": {
@@ -1162,19 +1049,10 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
EXPLAIN SELECT DISTINCT a FROM t1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select distinct t1.a AS a from t1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select distinct t1.a AS a from t1",
"steps": [
{
"table_dependencies": [
@@ -1327,19 +1205,10 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
EXPLAIN SELECT MIN(d) FROM t1 where b=2 and c=3 group by a {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select min(t1.d) AS `MIN(d)` from t1 where t1.b = 2 and t1.c = 3 group by t1.a"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select min(t1.d) AS `MIN(d)` from t1 where t1.b = 2 and t1.c = 3 group by t1.a",
"steps": [
{
"condition_processing": {
@@ -1526,19 +1395,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
EXPLAIN SELECT id,MIN(a),MAX(a) FROM t1 WHERE a>=20010104e0 GROUP BY id {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.`id` AS `id`,min(t1.a) AS `MIN(a)`,max(t1.a) AS `MAX(a)` from t1 where t1.a >= 20010104e0 group by t1.`id`"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.`id` AS `id`,min(t1.a) AS `MIN(a)`,max(t1.a) AS `MAX(a)` from t1 where t1.a >= 20010104e0 group by t1.`id`",
"steps": [
{
"condition_processing": {
@@ -1714,19 +1574,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
EXPLAIN SELECT * FROM t1 WHERE a = 20010104e0 GROUP BY id {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.`id` AS `id`,t1.a AS a from t1 where t1.a = 20010104e0 group by t1.`id`"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.`id` AS `id`,t1.a AS a from t1 where t1.a = 20010104e0 group by t1.`id`",
"steps": [
{
"condition_processing": {
@@ -1929,19 +1780,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1 where a=1 and b=2 order by c limit 1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.pk AS pk,t1.a AS a,t1.b AS b,t1.c AS c,t1.filler AS filler from t1 where t1.a = 1 and t1.b = 2 order by t1.c limit 1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.pk AS pk,t1.a AS a,t1.b AS b,t1.c AS c,t1.filler AS filler from t1 where t1.a = 1 and t1.b = 2 order by t1.c limit 1",
"steps": [
{
"condition_processing": {
@@ -2305,19 +2147,10 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain
select t1.a from t1 left join t2 on t1.a=t2.a {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.a AS a from (t1 left join t2 on(t1.a = t2.a))"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.a AS a from (t1 left join t2 on(t1.a = t2.a))",
"steps": [
{
"build_equal_items": {
@@ -2442,19 +2275,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1 left join t2 on t2.a=t1.a {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.a AS a,t2.a AS a,t2.b AS b from (t1 left join t2 on(t2.a = t1.a))"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.a AS a,t2.a AS a,t2.b AS b from (t1 left join t2 on(t2.a = t1.a))",
"steps": [
{
"build_equal_items": {
@@ -2621,19 +2445,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select t1.a from t1 left join (t2 join t3 on t2.b=t3.b) on t2.a=t1.a and t3.a=t1.a {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.a AS a from (t1 left join (t2 join t3 on(t2.b = t3.b)) on(t2.a = t1.a and t3.a = t1.a))"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.a AS a from (t1 left join (t2 join t3 on(t2.b = t3.b)) on(t2.a = t1.a and t3.a = t1.a))",
"steps": [
{
"build_equal_items": {
@@ -2812,46 +2627,19 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain extended select * from t1 where a in (select pk from t10) {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t1.a AS a,t1.b AS b from t1 where t1.a in (/* select#2 */ select t10.pk from t10)",
"steps": [
{
- "join_preparation": {
+ "transformation": {
"select_id": 2,
- "steps": [
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
- {
- "expanded_query": "/* select#2 */ select t10.pk from t10"
- }
- ]
+ "from": "IN (SELECT)",
+ "to": "materialization",
+ "sjm_scan_allowed": true,
+ "possible": true
}
},
- {
- "expanded_query": "/* select#1 */ select t1.a AS a,t1.b AS b from t1 where t1.a in (/* select#2 */ select t10.pk from t10)"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"transformation": {
"select_id": 2,
@@ -3135,19 +2923,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1 where pk = 2 and a=5 and b=1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.pk AS pk,t1.a AS a,t1.b AS b from t1 where t1.pk = 2 and t1.a = 5 and t1.b = 1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.pk AS pk,t1.a AS a,t1.b AS b from t1 where t1.pk = 2 and t1.a = 5 and t1.b = 1",
"steps": [
{
"condition_processing": {
@@ -3496,19 +3275,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select f1(a) from t1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select f1(t1.a) AS `f1(a)` from t1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select f1(t1.a) AS `f1(a)` from t1",
"steps": [
{
"table_dependencies": [
@@ -3593,19 +3363,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select f2(a) from t1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select f2(t1.a) AS `f2(a)` from t1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select f2(t1.a) AS `f2(a)` from t1",
"steps": [
{
"table_dependencies": [
@@ -3697,7 +3458,7 @@ a
2
select length(trace) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
length(trace)
-2141
+2012
set optimizer_trace_max_mem_size=100;
select * from t1;
a
@@ -3708,10 +3469,9 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select * from t1 {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
- "steps": [
- 2041 0
+ "expanded_query": 1912 0
set optimizer_trace_max_mem_size=0;
select * from t1;
a
@@ -3719,7 +3479,7 @@ a
2
select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
-select * from t1 2141 0
+select * from t1 2012 0
drop table t1;
set optimizer_trace='enabled=off';
set @@optimizer_trace_max_mem_size= @save_optimizer_trace_max_mem_size;
@@ -3809,19 +3569,10 @@ select * from information_schema.optimizer_trace;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain delete t0,t1 from t0, t1 where t0.a=t1.a and t1.a<3 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select NULL AS `NULL` from t0 join t1 where t0.a = t1.a and t1.a < 3"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select NULL AS `NULL` from t0 join t1 where t0.a = t1.a and t1.a < 3",
"steps": [
{
"condition_processing": {
@@ -4175,36 +3926,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from (select rand() from t1)q {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "derived": {
- "table": "q",
- "select_id": 2,
- "algorithm": "merged"
- }
- },
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select rand() AS `rand()` from t1"
- }
- ]
- }
- },
- {
- "expanded_query": "/* select#1 */ select rand() AS `rand()` from (/* select#2 */ select rand() AS `rand()` from t1) q"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select rand() AS `rand()` from (/* select#2 */ select rand() AS `rand()` from t1) q",
"steps": [
{
"derived": {
@@ -4217,6 +3942,7 @@ explain select * from (select rand() from t1)q {
{
"join_optimization": {
"select_id": 2,
+ "expanded_query": "/* select#2 */ select rand() AS `rand()` from t1",
"steps": [
{
"table_dependencies": [
@@ -4385,46 +4111,19 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1 where a in (select t_inner_1.a from t1 t_inner_1, t1 t_inner_2) {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t1.a AS a from t1 where t1.a in (/* select#2 */ select t_inner_1.a from t1 t_inner_1 join t1 t_inner_2)",
"steps": [
{
- "join_preparation": {
+ "transformation": {
"select_id": 2,
- "steps": [
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
- {
- "expanded_query": "/* select#2 */ select t_inner_1.a from t1 t_inner_1 join t1 t_inner_2"
- }
- ]
+ "from": "IN (SELECT)",
+ "to": "materialization",
+ "sjm_scan_allowed": true,
+ "possible": true
}
},
- {
- "expanded_query": "/* select#1 */ select t1.a AS a from t1 where t1.a in (/* select#2 */ select t_inner_1.a from t1 t_inner_1 join t1 t_inner_2)"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"transformation": {
"select_id": 2,
@@ -4835,73 +4534,19 @@ explain select * from t1 t_outer_1,t2 t_outer_2 where t_outer_1.a in (select t_
t_outer_2.a in (select t_inner_3.a from t2 t_inner_3, t1 t_inner_4) {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t_outer_1.a AS a,t_outer_2.a AS a from t1 t_outer_1 join t2 t_outer_2 where t_outer_1.a in (/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1) and t_outer_2.a in (/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4)",
"steps": [
{
- "join_preparation": {
+ "transformation": {
"select_id": 2,
- "steps": [
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
- {
- "expanded_query": "/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1"
- }
- ]
- }
- },
- {
- "join_preparation": {
- "select_id": 3,
- "steps": [
- {
- "transformation": {
- "select_id": 3,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
- {
- "transformation": {
- "select_id": 3,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
- {
- "expanded_query": "/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4"
- }
- ]
+ "from": "IN (SELECT)",
+ "to": "materialization",
+ "sjm_scan_allowed": true,
+ "possible": true
}
},
- {
- "expanded_query": "/* select#1 */ select t_outer_1.a AS a,t_outer_2.a AS a from t1 t_outer_1 join t2 t_outer_2 where t_outer_1.a in (/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1) and t_outer_2.a in (/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4)"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"transformation": {
"select_id": 2,
@@ -4910,6 +4555,15 @@ t_outer_2.a in (select t_inner_3.a from t2 t_inner_3, t1 t_inner_4) {
"converted_to_semi_join": true
}
},
+ {
+ "transformation": {
+ "select_id": 3,
+ "from": "IN (SELECT)",
+ "to": "materialization",
+ "sjm_scan_allowed": true,
+ "possible": true
+ }
+ },
{
"transformation": {
"select_id": 3,
@@ -6415,73 +6069,19 @@ explain select * from t1 t_outer_1,t2 t_outer_2 where t_outer_1.a in (select t_
t_outer_2.a in (select t_inner_3.a from t2 t_inner_3, t1 t_inner_4) {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select t_outer_1.a AS a,t_outer_2.a AS a from t1 t_outer_1 join t2 t_outer_2 where t_outer_1.a in (/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1) and t_outer_2.a in (/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4)",
"steps": [
{
- "join_preparation": {
+ "transformation": {
"select_id": 2,
- "steps": [
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
- {
- "transformation": {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
- {
- "expanded_query": "/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1"
- }
- ]
- }
- },
- {
- "join_preparation": {
- "select_id": 3,
- "steps": [
- {
- "transformation": {
- "select_id": 3,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
- {
- "transformation": {
- "select_id": 3,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
- {
- "expanded_query": "/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4"
- }
- ]
+ "from": "IN (SELECT)",
+ "to": "materialization",
+ "sjm_scan_allowed": true,
+ "possible": true
}
},
- {
- "expanded_query": "/* select#1 */ select t_outer_1.a AS a,t_outer_2.a AS a from t1 t_outer_1 join t2 t_outer_2 where t_outer_1.a in (/* select#2 */ select t_inner_1.a from t2 t_inner_2 join t1 t_inner_1) and t_outer_2.a in (/* select#3 */ select t_inner_3.a from t2 t_inner_3 join t1 t_inner_4)"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"transformation": {
"select_id": 2,
@@ -6490,6 +6090,15 @@ t_outer_2.a in (select t_inner_3.a from t2 t_inner_3, t1 t_inner_4) {
"converted_to_semi_join": true
}
},
+ {
+ "transformation": {
+ "select_id": 3,
+ "from": "IN (SELECT)",
+ "to": "materialization",
+ "sjm_scan_allowed": true,
+ "possible": true
+ }
+ },
{
"transformation": {
"select_id": 3,
@@ -8677,19 +8286,10 @@ SELECT query, trace FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
query trace
SELECT 'a\0' LIMIT 0 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select 'a\0' AS `a\x00` limit 0"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select 'a\0' AS `a\x00` limit 0",
"steps": []
}
},
@@ -8716,19 +8316,10 @@ select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select count(*) from seq_1_to_10000000 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select count(0) AS `count(*)` from seq_1_to_10000000"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select count(0) AS `count(*)` from seq_1_to_10000000",
"steps": [
{
"table_dependencies": [
@@ -8855,50 +8446,6 @@ json_detailed(json_extract(trace, '$**.in_to_subquery_conversion'))
"item": "t0.a in (1,2,3,4,5,6)",
"conversion":
[
-
- {
- "join_preparation":
- {
- "select_id": 2,
- "steps":
- [
-
- {
- "derived":
- {
- "table": "tvc_0",
- "select_id": 3,
- "algorithm": "materialized"
- }
- },
-
- {
- "transformation":
- {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "materialization",
- "sjm_scan_allowed": true,
- "possible": true
- }
- },
-
- {
- "transformation":
- {
- "select_id": 2,
- "from": "IN (SELECT)",
- "to": "semijoin",
- "chosen": true
- }
- },
-
- {
- "expanded_query": "/* select#2 */ select tvc_0._col_1 from (values (1),(2),(3),(4),(5),(6)) tvc_0"
- }
- ]
- }
- }
]
}
]
@@ -8954,7 +8501,7 @@ set @path= (select json_search(@trace, 'one', 'no predicate for first keypart'))
set @sub_path= substr(@path, 2, locate('.best_access_path', @path)-2);
select @sub_path;
@sub_path
-$.steps[1].join_optimization.steps[4].considered_execution_plans[0].rest_of_plan[0]
+$.steps[0].join_optimization.steps[4].considered_execution_plans[0].rest_of_plan[0]
select
json_detailed(json_extract(
@trace,
diff --git a/mysql-test/main/opt_trace_index_merge.result b/mysql-test/main/opt_trace_index_merge.result
index f1e13586eda..b319639e9fa 100644
--- a/mysql-test/main/opt_trace_index_merge.result
+++ b/mysql-test/main/opt_trace_index_merge.result
@@ -19,19 +19,10 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1 where a=1 or b=1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.a AS a,t1.b AS b,t1.c AS c,t1.filler AS filler from t1 where t1.a = 1 or t1.b = 1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.a AS a,t1.b AS b,t1.c AS c,t1.filler AS filler from t1 where t1.a = 1 or t1.b = 1",
"steps": [
{
"condition_processing": {
diff --git a/mysql-test/main/opt_trace_index_merge_innodb.result b/mysql-test/main/opt_trace_index_merge_innodb.result
index 0ddaaeae89d..bc063015e6d 100644
--- a/mysql-test/main/opt_trace_index_merge_innodb.result
+++ b/mysql-test/main/opt_trace_index_merge_innodb.result
@@ -27,19 +27,10 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
explain select * from t1 where pk1 != 0 and key1 = 1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select t1.pk1 AS pk1,t1.pk2 AS pk2,t1.key1 AS key1,t1.key2 AS key2 from t1 where t1.pk1 <> 0 and t1.key1 = 1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select t1.pk1 AS pk1,t1.pk2 AS pk2,t1.key1 AS key1,t1.key2 AS key2 from t1 where t1.pk1 <> 0 and t1.key1 = 1",
"steps": [
{
"condition_processing": {
diff --git a/mysql-test/main/opt_trace_security.result b/mysql-test/main/opt_trace_security.result
index e1937e744a4..9223f6c0a28 100644
--- a/mysql-test/main/opt_trace_security.result
+++ b/mysql-test/main/opt_trace_security.result
@@ -16,7 +16,6 @@ select * from db1.t1;
ERROR 42000: SELECT command denied to user 'foo'@'localhost' for table 't1'
select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
- 0 1
set optimizer_trace="enabled=off";
grant select(a) on db1.t1 to 'foo'@'%';
set optimizer_trace="enabled=on";
@@ -50,19 +49,10 @@ select * from information_schema.OPTIMIZER_TRACE;
QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select * from db1.t1 {
"steps": [
- {
- "join_preparation": {
- "select_id": 1,
- "steps": [
- {
- "expanded_query": "select db1.t1.a AS a from t1"
- }
- ]
- }
- },
{
"join_optimization": {
"select_id": 1,
+ "expanded_query": "select db1.t1.a AS a from t1",
"steps": [
{
"table_dependencies": [
@@ -156,36 +146,18 @@ QUERY TRACE MISSING_BYTES_BEYOND_MAX_MEM_SIZE INSUFFICIENT_PRIVILEGES
select * from db1.v1 {
"steps": [
{
- "join_preparation": {
+ "join_optimization": {
"select_id": 1,
+ "expanded_query": "/* select#1 */ select db1.t1.a AS a from v1",
"steps": [
{
"view": {
"table": "v1",
"select_id": 2,
- "algorithm": "merged"
- }
- },
- {
- "join_preparation": {
- "select_id": 2,
- "steps": [
- {
- "expanded_query": "/* select#2 */ select db1.t1.a AS a from t1"
- }
- ]
+ "algorithm": "merged",
+ "expanded_query": "/* select#2 */ select db1.t1.a AS a from t1"
}
},
- {
- "expanded_query": "/* select#1 */ select db1.t1.a AS a from v1"
- }
- ]
- }
- },
- {
- "join_optimization": {
- "select_id": 1,
- "steps": [
{
"table_dependencies": [
{
diff --git a/sql/my_json_writer.h b/sql/my_json_writer.h
index d82313f996f..20f479dc6d0 100644
--- a/sql/my_json_writer.h
+++ b/sql/my_json_writer.h
@@ -224,6 +224,14 @@ class Json_writer
size_t get_truncated_bytes() { return output.get_truncated_bytes(); }
+ /*
+ Note: this may not return exact value due to pretty-printer doing
+ buffering
+ */
+ size_t get_written_size() {
+ return output.length() + output.get_truncated_bytes();
+ }
+
Json_writer() :
indent_level(0), document_start(true), element_started(false),
first_child(true)
diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc
index d91557c5be2..953f2af0be8 100644
--- a/sql/opt_subselect.cc
+++ b/sql/opt_subselect.cc
@@ -549,6 +549,12 @@ bool is_materialization_applicable(THD *thd, Item_in_subselect *in_subs,
and, depending on the rewrite, either do it, or record it to be done at a
later phase.
+ NOTE
+ * This function is called at the Prepare phase. It should NOT do any rewrites.
+ It only collects information that's used for doing the rewrites at the
+ optimization phase.
+ * Optimizer trace is NOT yet enabled when this function is called.
+
RETURN
0 - OK
Other - Some sort of query error
@@ -703,8 +709,6 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
{
DBUG_PRINT("info", ("Subquery is semi-join conversion candidate"));
- (void)subquery_types_allow_materialization(thd, in_subs);
-
in_subs->is_flattenable_semijoin= TRUE;
/* Register the subquery for further processing in flatten_subqueries() */
@@ -717,10 +721,6 @@ int check_and_do_in_subquery_rewrites(JOIN *join)
if (arena)
thd->restore_active_arena(arena, &backup);
in_subs->is_registered_semijoin= TRUE;
- OPT_TRACE_TRANSFORM(thd, trace_wrapper, trace_transform,
- select_lex->select_number,
- "IN (SELECT)", "semijoin");
- trace_transform.add("chosen", true);
}
}
else
@@ -1262,6 +1262,7 @@ bool convert_join_subqueries_to_semijoins(JOIN *join)
while ((in_subq= li++))
{
bool remove_item= TRUE;
+ (void)subquery_types_allow_materialization(thd, in_subq);
/* Stop processing if we've reached a subquery that's attached to the ON clause */
if (in_subq->do_not_convert_to_sj)
diff --git a/sql/opt_trace.cc b/sql/opt_trace.cc
index ba9220cac44..2227519d991 100644
--- a/sql/opt_trace.cc
+++ b/sql/opt_trace.cc
@@ -106,6 +106,27 @@ inline bool sql_command_can_be_traced(enum enum_sql_command sql_command)
sql_command == SQLCOM_UPDATE_MULTI;
}
+
+void opt_trace_print_expanded_union(THD *thd, SELECT_LEX_UNIT *unit,
+ Json_writer_object *writer)
+{
+ DBUG_ASSERT(thd->trace_started());
+
+ StringBuffer<1024> str(system_charset_info);
+ ulonglong save_option_bits= thd->variables.option_bits;
+ thd->variables.option_bits &= ~OPTION_QUOTE_SHOW_CREATE;
+ unit->print(&str, enum_query_type(QT_TO_SYSTEM_CHARSET |
+ QT_SHOW_SELECT_NUMBER |
+ QT_ITEM_IDENT_SKIP_DB_NAMES |
+ QT_VIEW_INTERNAL));
+ thd->variables.option_bits= save_option_bits;
+ /*
+ The output is not very pretty: lots of back-ticks. The output is the
+ same as in EXPLAIN EXTENDED; let's try to improve it here.
+ */
+ writer->add("expanded_query", str.c_ptr_safe(), str.length());
+}
+
void opt_trace_print_expanded_query(THD *thd, SELECT_LEX *select_lex,
Json_writer_object *writer)
@@ -499,10 +520,49 @@ Opt_trace_start::Opt_trace_start(THD *thd, TABLE_LIST *tbl,
}
}
+
+/*
+ @brief
+ See "Handling Query Errors" section of comment for Opt_trace_start
+*/
+
+void Opt_trace_start::trace_heading_done()
+{
+ Json_writer *w;
+ if (traceable && (w= ctx->get_current_json()))
+ trace_heading_size= w->get_written_size();
+ else
+ trace_heading_size= 0;
+}
+
+
+/*
+ @brief
+ See "Handling Query Errors" section of comment for Opt_trace_start
+
+ @detail
+ We can't delete the trace right now, because some final writes (e.g.
+ the top-level closing '}') will still be made to it. Just set clean_me=true
+ so that it is deleted instead of saving it.
+*/
+
+void Opt_trace_start::clean_empty_trace()
+{
+ Json_writer *w;
+ if (traceable && (w= ctx->get_current_json()))
+ {
+ if (w->get_written_size() == trace_heading_size)
+ clean_me= true;
+ }
+}
+
+
Opt_trace_start::~Opt_trace_start()
{
if (traceable)
{
+ if (clean_me)
+ ctx->abort_trace();
ctx->end();
traceable= FALSE;
}
diff --git a/sql/opt_trace.h b/sql/opt_trace.h
index 101fb5f707e..180ecd7bdd3 100644
--- a/sql/opt_trace.h
+++ b/sql/opt_trace.h
@@ -69,10 +69,33 @@ struct Opt_trace_info
@param query query
@param length query's length
@param charset charset which was used to encode this query
+
+ @detail
+ == Lifecycle ==
+ The trace is created before the Name Resolution phase. Reasons:
+ 1. This way, we can have one place where we start the trace for all kinds of
+ queries. If we tried to start tracing right before query optimization
+ starts, we would have to construct Opt_trace_start object in many
+ places: one for SELECT, for UPDATE, for DELETE, etc.
+
+ 2. Privilege checking code may notify the trace (which must exist already)
+ that the user doesn't have enough permissions to perform tracing. See
+ missing_privilege() and the opt_trace_disable_if_*** functions below.
+
+ == Handling Query Errors ==
+ The trace is kept when query error occurs, except for the case when
+ nothing [meaningful] was traced. The second part is necessary for mtr to
+ produce the same output with and without --ps-protocol. If there is an
+ error on prepare phase, then:
+ - In --ps-protocol: PREPARE command produces no trace. The EXECUTE
+ command is not run. The trace is not generated at all.
+ - Regular SQL query: should also NOT produce any trace to match the above.
+ This is handled by trace_heading_done() and clean_empty_trace().
*/
-class Opt_trace_start {
+class Opt_trace_start
+{
public:
Opt_trace_start(THD *thd_arg, TABLE_LIST *tbl,
enum enum_sql_command sql_command,
@@ -82,8 +105,17 @@ class Opt_trace_start {
const CHARSET_INFO *query_charset);
~Opt_trace_start();
+ void trace_heading_done();
+ void clean_empty_trace();
private:
Opt_trace_context *const ctx;
+
+ /* Number of bytes written to the trace after the heading was written. */
+ size_t trace_heading_size;
+
+ /* If true, trace should be removed (See Handling Query Errors above) */
+ bool clean_me= false;
+
/*
True: the query will be traced
False: otherwise
@@ -102,6 +134,9 @@ class Opt_trace_start {
void opt_trace_print_expanded_query(THD *thd, SELECT_LEX *select_lex,
Json_writer_object *trace_object);
+void opt_trace_print_expanded_union(THD *thd, SELECT_LEX_UNIT *unit,
+ Json_writer_object *writer);
+
void add_table_scan_values_to_trace(THD *thd, JOIN_TAB *tab);
void trace_plan_prefix(JOIN *join, uint idx, table_map join_tables);
void print_final_join_order(JOIN *join);
diff --git a/sql/opt_trace_context.h b/sql/opt_trace_context.h
index f578a0c67ec..ae77c94fdc2 100644
--- a/sql/opt_trace_context.h
+++ b/sql/opt_trace_context.h
@@ -114,6 +114,12 @@ class Opt_trace_context
bool is_enabled();
+ void abort_trace()
+ {
+ delete current_trace;
+ current_trace= NULL;
+ }
+
void missing_privilege();
static const char *flag_names[];
diff --git a/sql/sql_derived.cc b/sql/sql_derived.cc
index 579ea34b8e4..cf878ad29c0 100644
--- a/sql/sql_derived.cc
+++ b/sql/sql_derived.cc
@@ -420,6 +420,24 @@ bool mysql_derived_merge(THD *thd, LEX *lex, TABLE_LIST *derived)
goto exit_merge;
}
+ if (unlikely(thd->trace_started()))
+ {
+ /*
+ Add to optimizer trace whether a derived table/view
+ is merged into the parent select or not.
+ */
+ OPT_TRACE_VIEWS_TRANSFORM(thd, trace_wrapper, trace_derived,
+ derived->is_derived() ? "derived" : "view",
+ derived->alias.str ? derived->alias.str : "<NULL>",
+ derived->get_unit()->first_select()->select_number,
+ derived->is_merged_derived() ? "merged" : "materialized");
+ if (derived->is_merged_derived())
+ {
+ opt_trace_print_expanded_union(thd, derived->get_unit(),
+ &trace_derived);
+ }
+ }
+
/*
exclude select lex so it doesn't show up in explain.
do this only for derived table as for views this is already done.
@@ -822,18 +840,6 @@ bool mysql_derived_prepare(THD *thd, LEX *lex, TABLE_LIST *derived)
}
}
- if (unlikely(thd->trace_started()))
- {
- /*
- Add to optimizer trace whether a derived table/view
- is merged into the parent select or not.
- */
- OPT_TRACE_VIEWS_TRANSFORM(thd, trace_wrapper, trace_derived,
- derived->is_derived() ? "derived" : "view",
- derived->alias.str ? derived->alias.str : "<NULL>",
- derived->get_unit()->first_select()->select_number,
- derived->is_merged_derived() ? "merged" : "materialized");
- }
/*
Above cascade call of prepare is important for PS protocol, but after it
is called we can check if we really need prepare for this derived
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index b9d3eec5a60..47cf801d74c 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -3648,6 +3648,7 @@ mysql_execute_command(THD *thd, bool is_called_from_prepared_stmt)
Json_writer_object trace_command(thd);
Json_writer_array trace_command_steps(thd, "steps");
+ ots.trace_heading_done();
/* store old value of binlog format */
enum_binlog_format orig_binlog_format,orig_current_stmt_binlog_format;
@@ -6150,6 +6151,9 @@ mysql_execute_command(THD *thd, bool is_called_from_prepared_stmt)
wsrep_commit_empty(thd, true);
}
+ if (res || thd->is_error())
+ ots.clean_empty_trace();
+
/* assume PA safety for next transaction */
thd->wsrep_PA_safe= true;
#endif /* WITH_WSREP */
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index cc6f572ea64..b6bc9eff50e 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -2435,17 +2435,6 @@ static bool check_prepared_statement(Prepared_statement *stmt)
lex->first_select_lex()->context.resolve_in_table_list_only(select_lex->
get_table_list());
- /*
- For the optimizer trace, this is the symmetric, for statement preparation,
- of what is done at statement execution (in mysql_execute_command()).
- */
- Opt_trace_start ots(thd, tables, lex->sql_command, &lex->var_list,
- thd->query(), thd->query_length(),
- thd->variables.character_set_client);
-
- Json_writer_object trace_command(thd);
- Json_writer_array trace_command_steps(thd, "steps");
-
/* Reset warning count for each query that uses tables */
if (tables)
thd->get_stmt_da()->opt_clear_warning_info(thd->query_id);
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index a357d4f8c8a..45d8f54e264 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -1287,11 +1287,6 @@ JOIN::prepare(TABLE_LIST *tables_init, COND *conds_init, uint og_num,
join_list= &select_lex->top_join_list;
union_part= unit_arg->is_unit_op();
- Json_writer_object trace_wrapper(thd);
- Json_writer_object trace_prepare(thd, "join_preparation");
- trace_prepare.add_select_number(select_lex->select_number);
- Json_writer_array trace_steps(thd, "steps");
-
// simple check that we got usable conds
dbug_print_item(conds);
@@ -1675,12 +1670,6 @@ JOIN::prepare(TABLE_LIST *tables_init, COND *conds_init, uint og_num,
}
}
- if (thd->trace_started())
- {
- Json_writer_object trace_wrapper(thd);
- opt_trace_print_expanded_query(thd, select_lex, &trace_wrapper);
- }
-
if (!procedure && result && result->prepare(fields_list, unit_arg))
goto err; /* purecov: inspected */
@@ -1985,7 +1974,11 @@ JOIN::optimize_inner()
Json_writer_object trace_wrapper(thd);
Json_writer_object trace_prepare(thd, "join_optimization");
- trace_prepare.add_select_number(select_lex->select_number);
+ if (thd->trace_started())
+ {
+ trace_prepare.add_select_number(select_lex->select_number);
+ opt_trace_print_expanded_query(thd, select_lex, &trace_prepare);
+ }
Json_writer_array trace_steps(thd, "steps");
/*
1
0
revision-id: a1965b80e1ba7811a52fe54fc62a74879551598e (mariadb-10.5.14-28-ga1965b80e1b)
parent(s): a710016d578dc1165c511db19a5d4a094e736012
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-03-01 17:19:58 +0300
message:
Make testcase for MDEV-26585 stable.
---
mysql-test/main/group_min_max.result | 5 +++--
mysql-test/main/group_min_max.test | 4 +++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/mysql-test/main/group_min_max.result b/mysql-test/main/group_min_max.result
index 4bdacd5e4a1..e6a5bc0ab50 100644
--- a/mysql-test/main/group_min_max.result
+++ b/mysql-test/main/group_min_max.result
@@ -4057,9 +4057,10 @@ KEY `index_t1_on_owner_id_and_foo` (`owner_id`,`foo`)
INSERT INTO t1 (owner_id, foo, whatever)
VALUES (1, TRUE, "yello"), (1, FALSE, "yello"), (2, TRUE, "yello"),
(2, TRUE, "yello"), (2, FALSE, "yello");
-EXPLAIN SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
+EXPLAIN
+SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
id select_type table type possible_keys key key_len ref rows Extra
-1 SIMPLE t1 index NULL index_t1_on_owner_id_and_foo 7 NULL 5 Using where; Using index
+1 SIMPLE t1 index NULL index_t1_on_owner_id_and_foo 7 NULL # Using where; Using index
SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
owner_id
1
diff --git a/mysql-test/main/group_min_max.test b/mysql-test/main/group_min_max.test
index b1d912684c6..3f7ef4ec9b8 100644
--- a/mysql-test/main/group_min_max.test
+++ b/mysql-test/main/group_min_max.test
@@ -1717,6 +1717,8 @@ CREATE TABLE `t1` (
INSERT INTO t1 (owner_id, foo, whatever)
VALUES (1, TRUE, "yello"), (1, FALSE, "yello"), (2, TRUE, "yello"),
(2, TRUE, "yello"), (2, FALSE, "yello");
-EXPLAIN SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
+--replace_column 9 #
+EXPLAIN
+SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
SELECT DISTINCT owner_id FROM t1 WHERE foo = true GROUP BY owner_id HAVING (COUNT(*) = 1);
DROP TABLE t1;
1
0
[Commits] 8e0e3968c37: MDEV-15208: server crashed, when using ORDER BY with window function and UNION
by psergey 21 Feb '22
by psergey 21 Feb '22
21 Feb '22
revision-id: 8e0e3968c37345d799982eefbb956d5e492fdc73 (mariadb-10.2.43-14-g8e0e3968c37)
parent(s): d140d276244508d7f2e69a3c1a23e0b09201f71a
author: Varun Gupta
committer: Sergei Petrunia
timestamp: 2022-02-21 20:23:39 +0300
message:
MDEV-15208: server crashed, when using ORDER BY with window function and UNION
(Edits by SergeiP: fix encryption.tempfiles_encrypted, re-word comment)
Global ORDER BY clause of a UNION may not refer to 1) aggregate functions
or 2) window functions. setup_order() checked for #1 but not for #2.
---
mysql-test/r/win.result | 8 ++++++++
mysql-test/suite/encryption/r/tempfiles_encrypted.result | 8 ++++++++
mysql-test/t/win.test | 10 ++++++++++
sql/sql_select.cc | 4 +++-
4 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/win.result b/mysql-test/r/win.result
index 30650f29555..1ddafbd4f8f 100644
--- a/mysql-test/r/win.result
+++ b/mysql-test/r/win.result
@@ -4230,5 +4230,13 @@ i LAST_VALUE(COUNT(i)) OVER (PARTITION BY i ORDER BY j)
4 2
DROP TABLE t1;
#
+# MDEV-15208: server crashed, when using ORDER BY with window function and UNION
+#
+CREATE TABLE t1 (a INT);
+INSERT INTO t1 VALUES (1),(1),(1),(1),(1),(2),(2),(2),(2),(2),(2);
+SELECT 1 UNION SELECT a FROM t1 ORDER BY (row_number() over ());
+ERROR HY000: Expression #1 of ORDER BY contains aggregate function and applies to a UNION
+DROP TABLE t1;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/suite/encryption/r/tempfiles_encrypted.result b/mysql-test/suite/encryption/r/tempfiles_encrypted.result
index aaba6bb044a..f7b5081f625 100644
--- a/mysql-test/suite/encryption/r/tempfiles_encrypted.result
+++ b/mysql-test/suite/encryption/r/tempfiles_encrypted.result
@@ -4236,6 +4236,14 @@ i LAST_VALUE(COUNT(i)) OVER (PARTITION BY i ORDER BY j)
4 2
DROP TABLE t1;
#
+# MDEV-15208: server crashed, when using ORDER BY with window function and UNION
+#
+CREATE TABLE t1 (a INT);
+INSERT INTO t1 VALUES (1),(1),(1),(1),(1),(2),(2),(2),(2),(2),(2);
+SELECT 1 UNION SELECT a FROM t1 ORDER BY (row_number() over ());
+ERROR HY000: Expression #1 of ORDER BY contains aggregate function and applies to a UNION
+DROP TABLE t1;
+#
# End of 10.2 tests
#
#
diff --git a/mysql-test/t/win.test b/mysql-test/t/win.test
index 126ed735c88..37f09a6e850 100644
--- a/mysql-test/t/win.test
+++ b/mysql-test/t/win.test
@@ -2730,6 +2730,16 @@ INSERT INTO t1 VALUES (1,1), (1,5),(1,4), (2,2),(2,5), (3,3),(4,4);
SELECT i, LAST_VALUE(COUNT(i)) OVER (PARTITION BY i ORDER BY j) FROM t1 GROUP BY i;
DROP TABLE t1;
+--echo #
+--echo # MDEV-15208: server crashed, when using ORDER BY with window function and UNION
+--echo #
+
+CREATE TABLE t1 (a INT);
+INSERT INTO t1 VALUES (1),(1),(1),(1),(1),(2),(2),(2),(2),(2),(2);
+--error ER_AGGREGATE_ORDER_FOR_UNION
+SELECT 1 UNION SELECT a FROM t1 ORDER BY (row_number() over ());
+DROP TABLE t1;
+
--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index db8a63eeb48..a3e8b453897 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -22796,7 +22796,9 @@ int setup_order(THD *thd, Ref_ptr_array ref_pointer_array, TABLE_LIST *tables,
an ORDER BY clause
*/
- if (for_union && (*order->item)->with_sum_func)
+ if (for_union &&
+ ((*order->item)->with_sum_func ||
+ (*order->item)->with_window_func))
{
my_error(ER_AGGREGATE_ORDER_FOR_UNION, MYF(0), number);
return 1;
1
0
[Commits] 942a9791b22: MDEV-15208: server crashed, when using ORDER BY with window function and UNION
by psergey 21 Feb '22
by psergey 21 Feb '22
21 Feb '22
revision-id: 942a9791b2231273ba20649a658c856641268fae (mariadb-10.2.43-14-g942a9791b22)
parent(s): d140d276244508d7f2e69a3c1a23e0b09201f71a
author: Varun Gupta
committer: Sergei Petrunia
timestamp: 2022-02-21 19:19:04 +0300
message:
MDEV-15208: server crashed, when using ORDER BY with window function and UNION
SELECTs inside a UNION can have window function but not the global ORDER BY clause of the UNION.
---
mysql-test/r/win.result | 8 ++++++++
mysql-test/t/win.test | 10 ++++++++++
sql/sql_select.cc | 4 +++-
3 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/win.result b/mysql-test/r/win.result
index 30650f29555..1ddafbd4f8f 100644
--- a/mysql-test/r/win.result
+++ b/mysql-test/r/win.result
@@ -4230,5 +4230,13 @@ i LAST_VALUE(COUNT(i)) OVER (PARTITION BY i ORDER BY j)
4 2
DROP TABLE t1;
#
+# MDEV-15208: server crashed, when using ORDER BY with window function and UNION
+#
+CREATE TABLE t1 (a INT);
+INSERT INTO t1 VALUES (1),(1),(1),(1),(1),(2),(2),(2),(2),(2),(2);
+SELECT 1 UNION SELECT a FROM t1 ORDER BY (row_number() over ());
+ERROR HY000: Expression #1 of ORDER BY contains aggregate function and applies to a UNION
+DROP TABLE t1;
+#
# End of 10.2 tests
#
diff --git a/mysql-test/t/win.test b/mysql-test/t/win.test
index 126ed735c88..37f09a6e850 100644
--- a/mysql-test/t/win.test
+++ b/mysql-test/t/win.test
@@ -2730,6 +2730,16 @@ INSERT INTO t1 VALUES (1,1), (1,5),(1,4), (2,2),(2,5), (3,3),(4,4);
SELECT i, LAST_VALUE(COUNT(i)) OVER (PARTITION BY i ORDER BY j) FROM t1 GROUP BY i;
DROP TABLE t1;
+--echo #
+--echo # MDEV-15208: server crashed, when using ORDER BY with window function and UNION
+--echo #
+
+CREATE TABLE t1 (a INT);
+INSERT INTO t1 VALUES (1),(1),(1),(1),(1),(2),(2),(2),(2),(2),(2);
+--error ER_AGGREGATE_ORDER_FOR_UNION
+SELECT 1 UNION SELECT a FROM t1 ORDER BY (row_number() over ());
+DROP TABLE t1;
+
--echo #
--echo # End of 10.2 tests
--echo #
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index db8a63eeb48..a3e8b453897 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -22796,7 +22796,9 @@ int setup_order(THD *thd, Ref_ptr_array ref_pointer_array, TABLE_LIST *tables,
an ORDER BY clause
*/
- if (for_union && (*order->item)->with_sum_func)
+ if (for_union &&
+ ((*order->item)->with_sum_func ||
+ (*order->item)->with_window_func))
{
my_error(ER_AGGREGATE_ORDER_FOR_UNION, MYF(0), number);
return 1;
1
0
[Commits] d140d276244: MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
by psergey 21 Feb '22
by psergey 21 Feb '22
21 Feb '22
revision-id: d140d276244508d7f2e69a3c1a23e0b09201f71a (mariadb-10.2.43-13-gd140d276244)
parent(s): 24ec144c63b36402adaff2bc12aaabefa40bdd51
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-02-21 18:44:11 +0300
message:
MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
[Patch idea by Igor Babaev]
Symptom: for IN (SELECT ...) subqueries using IN-to-EXISTS transformation,
the optimizer was unable to make inferences using multiple equalities.
The cause is code Item_in_subselect::inject_in_to_exists_cond() which may
break invariants that Multiple-Equality code relies on. In particular, it
may produce a WHERE condition with an empty Item_cond::m_cond_equal.
Fixed this by making Item_cond::m_cond_equal.
---
mysql-test/r/subselect2.result | 2 +-
mysql-test/r/subselect3.result | 4 +--
mysql-test/r/subselect3_jcl6.result | 4 +--
mysql-test/r/subselect4.result | 40 ++++++++++++++++++++++++++-
mysql-test/r/subselect_mat_cost_bugs.result | 4 +--
mysql-test/t/subselect4.test | 43 +++++++++++++++++++++++++++++
sql/item_subselect.cc | 9 ++----
7 files changed, 92 insertions(+), 14 deletions(-)
diff --git a/mysql-test/r/subselect2.result b/mysql-test/r/subselect2.result
index 155d5ab2a50..91d3e445b48 100644
--- a/mysql-test/r/subselect2.result
+++ b/mysql-test/r/subselect2.result
@@ -262,7 +262,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2c ALL NULL NULL NULL NULL 2 100.00 Using where; Using join buffer (incremental, BNL join)
Warnings:
Note 1276 Field or reference 'test.t1.a' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`c1` AS `c1` from `test`.`t1` where !<expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`a`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t2a`.`c2` from `test`.`t2` `t2a` join `test`.`t2` `t2b` join `test`.`t2` `t2c` where (`test`.`t2b`.`m` <> `test`.`t1`.`a` or `test`.`t2b`.`m` = `test`.`t2a`.`m`) and trigcond(<cache>(`test`.`t1`.`c1`) = `test`.`t2a`.`c2` or `test`.`t2a`.`c2` is null) and `test`.`t2c`.`c2` = `test`.`t2b`.`c2` and `test`.`t2b`.`n` = `test`.`t2a`.`m` having trigcond(`test`.`t2a`.`c2` is null))))
+Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`c1` AS `c1` from `test`.`t1` where !<expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`a`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t2a`.`c2` from `test`.`t2` `t2a` join `test`.`t2` `t2b` join `test`.`t2` `t2c` where `test`.`t2c`.`c2` = `test`.`t2b`.`c2` and `test`.`t2b`.`n` = `test`.`t2a`.`m` and (`test`.`t2b`.`m` <> `test`.`t1`.`a` or `test`.`t2b`.`m` = `test`.`t2a`.`m`) and trigcond(<cache>(`test`.`t1`.`c1`) = `test`.`t2a`.`c2` or `test`.`t2a`.`c2` is null) having trigcond(`test`.`t2a`.`c2` is null))))
DROP TABLE t1,t2;
#
# MDEV-614, also MDEV-536, also LP:1050806:
diff --git a/mysql-test/r/subselect3.result b/mysql-test/r/subselect3.result
index 8d33e4e1606..d92626380ef 100644
--- a/mysql-test/r/subselect3.result
+++ b/mysql-test/r/subselect3.result
@@ -169,7 +169,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 5 test.t1.b 1 100.00 Using where
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) and `test`.`t2`.`a` = `test`.`t1`.`b` having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
drop table t1, t2, t3;
create table t1 (a int NOT NULL, b int NOT NULL, key(a));
insert into t1 values
@@ -197,7 +197,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 4 test.t1.b 1 100.00 Using where
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`) and `test`.`t2`.`a` = `test`.`t1`.`b`))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`)))) AS `Z` from `test`.`t3`
drop table t1,t2,t3;
create table t1 (oref int, grp int);
insert into t1 (oref, grp) values
diff --git a/mysql-test/r/subselect3_jcl6.result b/mysql-test/r/subselect3_jcl6.result
index 6d1e305bc49..2df3f2eea35 100644
--- a/mysql-test/r/subselect3_jcl6.result
+++ b/mysql-test/r/subselect3_jcl6.result
@@ -172,7 +172,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 5 test.t1.b 1 100.00 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) and `test`.`t2`.`a` = `test`.`t1`.`b` having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
drop table t1, t2, t3;
create table t1 (a int NOT NULL, b int NOT NULL, key(a));
insert into t1 values
@@ -200,7 +200,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 4 test.t1.b 1 100.00 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`) and `test`.`t2`.`a` = `test`.`t1`.`b`))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`)))) AS `Z` from `test`.`t3`
drop table t1,t2,t3;
create table t1 (oref int, grp int);
insert into t1 (oref, grp) values
diff --git a/mysql-test/r/subselect4.result b/mysql-test/r/subselect4.result
index 2657977dae7..43e698d2b56 100644
--- a/mysql-test/r/subselect4.result
+++ b/mysql-test/r/subselect4.result
@@ -2351,7 +2351,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 system NULL NULL NULL NULL 1 100.00
2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 2 100.00 Using where
Warnings:
-Note 1003 select 3 AS `f` from dual where !<expr_cache><3>(<in_optimizer>(3,<exists>(select `test`.`t1`.`b` from `test`.`t1` where (`test`.`t1`.`c` = 'USA' or `test`.`t1`.`c` <> 'USA') and trigcond(<cache>(3) = `test`.`t1`.`b` or `test`.`t1`.`b` is null) and `test`.`t1`.`b` = `test`.`t1`.`a` having trigcond(`test`.`t1`.`b` is null))))
+Note 1003 select 3 AS `f` from dual where !<expr_cache><3>(<in_optimizer>(3,<exists>(select `test`.`t1`.`b` from `test`.`t1` where `test`.`t1`.`b` = `test`.`t1`.`a` and (`test`.`t1`.`c` = 'USA' or `test`.`t1`.`c` <> 'USA') and trigcond(<cache>(3) = `test`.`t1`.`b` or `test`.`t1`.`b` is null) having trigcond(`test`.`t1`.`b` is null))))
SELECT * FROM t2
WHERE f NOT IN (SELECT b FROM t1
WHERE 0 OR (c IN ('USA') OR c NOT IN ('USA')) AND a = b);
@@ -2803,4 +2803,42 @@ FROM (t1 JOIN t1 AS ref_t1 ON
(t1.i1 > (SELECT ref_t1.i1 AS c0 FROM t1 b ORDER BY -c0)));
ERROR 21000: Subquery returns more than 1 row
DROP TABLE t1;
+#
+# MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
+#
+CREATE TABLE t1 (
+key1 varchar(30) NOT NULL,
+col1 int(11) NOT NULL,
+filler char(100)
+);
+insert into t1 select seq, seq, seq from seq_1_to_100;
+CREATE TABLE t10 (
+key1 varchar(30) NOT NULL,
+col1 int,
+filler char(100),
+PRIMARY KEY (key1)
+);
+insert into t10 select seq, seq, seq from seq_1_to_1000;
+CREATE TABLE t11 (
+key1 varchar(30) NOT NULL,
+filler char(100),
+PRIMARY KEY (key1)
+);
+insert into t11 select seq, seq from seq_1_to_1000;
+set @tmp_os=@@optimizer_switch;
+set optimizer_switch='semijoin=off,materialization=off';
+# Must use range access (not full scan) for table tms:
+explain select * from t1 hist
+WHERE
+key1 IN ('1','2','3','4','5','6','7','8','9','10') AND
+hist.col1 NOT IN (SELECT tn.col1
+FROM t10 tn JOIN t11 tms ON tms.key1 = tn.key1
+WHERE tn.key1 IN ('1','2','3','4','5','6','7','8','9','10')
+);
+id select_type table type possible_keys key key_len ref rows Extra
+1 PRIMARY hist ALL NULL NULL NULL NULL 100 Using where
+2 DEPENDENT SUBQUERY tms range PRIMARY PRIMARY 32 NULL 10 Using where; Using index
+2 DEPENDENT SUBQUERY tn eq_ref PRIMARY PRIMARY 32 test.tms.key1 1 Using where
+set optimizer_switch=@tmp_os;
+drop table t1, t10, t11;
# End of 10.2 tests
diff --git a/mysql-test/r/subselect_mat_cost_bugs.result b/mysql-test/r/subselect_mat_cost_bugs.result
index fd768b1efd4..87cbc1bb078 100644
--- a/mysql-test/r/subselect_mat_cost_bugs.result
+++ b/mysql-test/r/subselect_mat_cost_bugs.result
@@ -100,7 +100,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 index c3 c3 9 NULL 2 100.00 Using where; Using index; Using join buffer (flat, BNL join)
Warnings:
Note 1276 Field or reference 'test.t1.pk' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`pk` AS `pk` from `test`.`t1` where <expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`pk`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t1a`.`c1` from `test`.`t1b` join `test`.`t2` left join `test`.`t1a` on(`test`.`t1a`.`c2` = `test`.`t1b`.`pk` and 2) where `test`.`t1`.`pk` <> 0 and <cache>(`test`.`t1`.`c1`) = `test`.`t1a`.`c1` and `test`.`t2`.`c3` = `test`.`t1b`.`c4`)))
+Note 1003 select `test`.`t1`.`pk` AS `pk` from `test`.`t1` where <expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`pk`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t1a`.`c1` from `test`.`t1b` join `test`.`t2` left join `test`.`t1a` on(`test`.`t1a`.`c2` = `test`.`t1b`.`pk` and 2) where `test`.`t2`.`c3` = `test`.`t1b`.`c4` and `test`.`t1`.`pk` <> 0 and <cache>(`test`.`t1`.`c1`) = `test`.`t1a`.`c1`)))
SELECT pk
FROM t1
WHERE c1 IN
@@ -363,7 +363,7 @@ AND a = SOME (SELECT b FROM t5));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t3 ALL NULL NULL NULL NULL 2 Using where
2 DEPENDENT SUBQUERY t5 index c c 10 NULL 2 Using where; Using index; Start temporary
-2 DEPENDENT SUBQUERY t4 eq_ref PRIMARY PRIMARY 4 test.t5.b 1 Using index condition; Using where; End temporary
+2 DEPENDENT SUBQUERY t4 eq_ref PRIMARY PRIMARY 4 test.t5.b 1 Using where; End temporary
SELECT *
FROM t3
WHERE t3.b > ALL (
diff --git a/mysql-test/t/subselect4.test b/mysql-test/t/subselect4.test
index 93389571c5c..1313d2e49e7 100644
--- a/mysql-test/t/subselect4.test
+++ b/mysql-test/t/subselect4.test
@@ -2308,4 +2308,47 @@ FROM (t1 JOIN t1 AS ref_t1 ON
DROP TABLE t1;
+--echo #
+--echo # MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
+--echo #
+
+CREATE TABLE t1 (
+ key1 varchar(30) NOT NULL,
+ col1 int(11) NOT NULL,
+ filler char(100)
+);
+insert into t1 select seq, seq, seq from seq_1_to_100;
+
+CREATE TABLE t10 (
+ key1 varchar(30) NOT NULL,
+ col1 int,
+ filler char(100),
+ PRIMARY KEY (key1)
+);
+insert into t10 select seq, seq, seq from seq_1_to_1000;
+
+CREATE TABLE t11 (
+ key1 varchar(30) NOT NULL,
+ filler char(100),
+ PRIMARY KEY (key1)
+);
+insert into t11 select seq, seq from seq_1_to_1000;
+
+
+set @tmp_os=@@optimizer_switch;
+set optimizer_switch='semijoin=off,materialization=off';
+
+--echo # Must use range access (not full scan) for table tms:
+explain select * from t1 hist
+WHERE
+ key1 IN ('1','2','3','4','5','6','7','8','9','10') AND
+ hist.col1 NOT IN (SELECT tn.col1
+ FROM t10 tn JOIN t11 tms ON tms.key1 = tn.key1
+ WHERE tn.key1 IN ('1','2','3','4','5','6','7','8','9','10')
+ );
+
+set optimizer_switch=@tmp_os;
+
+drop table t1, t10, t11;
+
--echo # End of 10.2 tests
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 25621dfe104..92ba085af5b 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -2769,12 +2769,9 @@ bool Item_in_subselect::inject_in_to_exists_cond(JOIN *join_arg)
{
/* The argument list of the top-level AND may change after fix fields. */
and_args= ((Item_cond*) join_arg->conds)->argument_list();
- List_iterator<Item_equal> li(join_arg->cond_equal->current_level);
- Item_equal *elem;
- while ((elem= li++))
- {
- and_args->push_back(elem, thd->mem_root);
- }
+ ((Item_cond_and *) (join_arg->conds))->m_cond_equal=
+ *join_arg->cond_equal;
+ and_args->append((List<Item> *)&join_arg->cond_equal->current_level);
}
}
1
0
[Commits] 61c0b5d9551: MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
by psergey 18 Feb '22
by psergey 18 Feb '22
18 Feb '22
revision-id: 61c0b5d9551084edc391dfb55819913cc37198e7 (mariadb-10.2.43-7-g61c0b5d9551)
parent(s): cf574cf53b168992b911d5fc32c590a6ee03a56a
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-02-18 22:50:56 +0300
message:
MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
[Patch idea by Igor Babaev]
Symptom: for IN (SELECT ...) subqueries using IN-to-EXISTS transformation,
the optimizer was unable to make inferences using multiple equalities.
The cause is code Item_in_subselect::inject_in_to_exists_cond() which may
break invariants that Multiple-Equality code relies on. In particular, it
may produce a WHERE condition with an empty Item_cond::m_cond_equal.
Fixed this by making inject_in_to_exists_cond() copy join_arg->cond_equal into Item_cond::m_cond_equal of the top-level AND condition.
---
mysql-test/r/subselect2.result | 2 +-
mysql-test/r/subselect3.result | 4 +--
mysql-test/r/subselect3_jcl6.result | 4 +--
mysql-test/r/subselect4.result | 38 ++++++++++++++++++++++++-
mysql-test/r/subselect_mat_cost_bugs.result | 4 +--
mysql-test/t/subselect4.test | 43 +++++++++++++++++++++++++++++
sql/item_subselect.cc | 9 ++----
7 files changed, 90 insertions(+), 14 deletions(-)
diff --git a/mysql-test/r/subselect2.result b/mysql-test/r/subselect2.result
index 155d5ab2a50..91d3e445b48 100644
--- a/mysql-test/r/subselect2.result
+++ b/mysql-test/r/subselect2.result
@@ -262,7 +262,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2c ALL NULL NULL NULL NULL 2 100.00 Using where; Using join buffer (incremental, BNL join)
Warnings:
Note 1276 Field or reference 'test.t1.a' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`c1` AS `c1` from `test`.`t1` where !<expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`a`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t2a`.`c2` from `test`.`t2` `t2a` join `test`.`t2` `t2b` join `test`.`t2` `t2c` where (`test`.`t2b`.`m` <> `test`.`t1`.`a` or `test`.`t2b`.`m` = `test`.`t2a`.`m`) and trigcond(<cache>(`test`.`t1`.`c1`) = `test`.`t2a`.`c2` or `test`.`t2a`.`c2` is null) and `test`.`t2c`.`c2` = `test`.`t2b`.`c2` and `test`.`t2b`.`n` = `test`.`t2a`.`m` having trigcond(`test`.`t2a`.`c2` is null))))
+Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`c1` AS `c1` from `test`.`t1` where !<expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`a`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t2a`.`c2` from `test`.`t2` `t2a` join `test`.`t2` `t2b` join `test`.`t2` `t2c` where `test`.`t2c`.`c2` = `test`.`t2b`.`c2` and `test`.`t2b`.`n` = `test`.`t2a`.`m` and (`test`.`t2b`.`m` <> `test`.`t1`.`a` or `test`.`t2b`.`m` = `test`.`t2a`.`m`) and trigcond(<cache>(`test`.`t1`.`c1`) = `test`.`t2a`.`c2` or `test`.`t2a`.`c2` is null) having trigcond(`test`.`t2a`.`c2` is null))))
DROP TABLE t1,t2;
#
# MDEV-614, also MDEV-536, also LP:1050806:
diff --git a/mysql-test/r/subselect3.result b/mysql-test/r/subselect3.result
index 8d33e4e1606..d92626380ef 100644
--- a/mysql-test/r/subselect3.result
+++ b/mysql-test/r/subselect3.result
@@ -169,7 +169,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 5 test.t1.b 1 100.00 Using where
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) and `test`.`t2`.`a` = `test`.`t1`.`b` having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
drop table t1, t2, t3;
create table t1 (a int NOT NULL, b int NOT NULL, key(a));
insert into t1 values
@@ -197,7 +197,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 4 test.t1.b 1 100.00 Using where
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`) and `test`.`t2`.`a` = `test`.`t1`.`b`))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`)))) AS `Z` from `test`.`t3`
drop table t1,t2,t3;
create table t1 (oref int, grp int);
insert into t1 (oref, grp) values
diff --git a/mysql-test/r/subselect3_jcl6.result b/mysql-test/r/subselect3_jcl6.result
index 6d1e305bc49..2df3f2eea35 100644
--- a/mysql-test/r/subselect3_jcl6.result
+++ b/mysql-test/r/subselect3_jcl6.result
@@ -172,7 +172,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 5 test.t1.b 1 100.00 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) and `test`.`t2`.`a` = `test`.`t1`.`b` having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
drop table t1, t2, t3;
create table t1 (a int NOT NULL, b int NOT NULL, key(a));
insert into t1 values
@@ -200,7 +200,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 4 test.t1.b 1 100.00 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`) and `test`.`t2`.`a` = `test`.`t1`.`b`))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`)))) AS `Z` from `test`.`t3`
drop table t1,t2,t3;
create table t1 (oref int, grp int);
insert into t1 (oref, grp) values
diff --git a/mysql-test/r/subselect4.result b/mysql-test/r/subselect4.result
index 2657977dae7..e8a91d26efe 100644
--- a/mysql-test/r/subselect4.result
+++ b/mysql-test/r/subselect4.result
@@ -2351,7 +2351,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 system NULL NULL NULL NULL 1 100.00
2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 2 100.00 Using where
Warnings:
-Note 1003 select 3 AS `f` from dual where !<expr_cache><3>(<in_optimizer>(3,<exists>(select `test`.`t1`.`b` from `test`.`t1` where (`test`.`t1`.`c` = 'USA' or `test`.`t1`.`c` <> 'USA') and trigcond(<cache>(3) = `test`.`t1`.`b` or `test`.`t1`.`b` is null) and `test`.`t1`.`b` = `test`.`t1`.`a` having trigcond(`test`.`t1`.`b` is null))))
+Note 1003 select 3 AS `f` from dual where !<expr_cache><3>(<in_optimizer>(3,<exists>(select `test`.`t1`.`b` from `test`.`t1` where `test`.`t1`.`b` = `test`.`t1`.`a` and (`test`.`t1`.`c` = 'USA' or `test`.`t1`.`c` <> 'USA') and trigcond(<cache>(3) = `test`.`t1`.`b` or `test`.`t1`.`b` is null) having trigcond(`test`.`t1`.`b` is null))))
SELECT * FROM t2
WHERE f NOT IN (SELECT b FROM t1
WHERE 0 OR (c IN ('USA') OR c NOT IN ('USA')) AND a = b);
@@ -2803,4 +2803,40 @@ FROM (t1 JOIN t1 AS ref_t1 ON
(t1.i1 > (SELECT ref_t1.i1 AS c0 FROM t1 b ORDER BY -c0)));
ERROR 21000: Subquery returns more than 1 row
DROP TABLE t1;
+#
+# MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
+#
+CREATE TABLE t1 (
+key1 varchar(30) NOT NULL,
+col1 int(11) NOT NULL,
+filler char(100)
+);
+insert into t1 select seq, seq, seq from seq_1_to_100;
+CREATE TABLE t10 (
+key1 varchar(30) NOT NULL,
+col1 int,
+filler char(100),
+PRIMARY KEY (key1)
+);
+insert into t10 select seq, seq, seq from seq_1_to_1000;
+CREATE TABLE t11 (
+key1 varchar(30) NOT NULL,
+filler char(100),
+PRIMARY KEY (key1)
+);
+insert into t11 select seq, seq from seq_1_to_1000;
+# Must use range access (not full scan) for table tms:
+explain update t1 hist
+set filler='aaa'
+WHERE
+key1 IN ('1','2','3','4','5','6','7','8','9','10') AND
+hist.col1 NOT IN (SELECT tn.col1
+FROM t10 tn JOIN t11 tms ON tms.key1 = tn.key1
+WHERE tn.key1 IN ('1','2','3','4','5','6','7','8','9','10')
+);
+id select_type table type possible_keys key key_len ref rows Extra
+1 PRIMARY hist ALL NULL NULL NULL NULL 100 Using where
+2 DEPENDENT SUBQUERY tms range PRIMARY PRIMARY 32 NULL 10 Using where; Using index
+2 DEPENDENT SUBQUERY tn eq_ref PRIMARY PRIMARY 32 test.tms.key1 1 Using where
+drop table t1, t10, t11;
# End of 10.2 tests
diff --git a/mysql-test/r/subselect_mat_cost_bugs.result b/mysql-test/r/subselect_mat_cost_bugs.result
index fd768b1efd4..87cbc1bb078 100644
--- a/mysql-test/r/subselect_mat_cost_bugs.result
+++ b/mysql-test/r/subselect_mat_cost_bugs.result
@@ -100,7 +100,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 index c3 c3 9 NULL 2 100.00 Using where; Using index; Using join buffer (flat, BNL join)
Warnings:
Note 1276 Field or reference 'test.t1.pk' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`pk` AS `pk` from `test`.`t1` where <expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`pk`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t1a`.`c1` from `test`.`t1b` join `test`.`t2` left join `test`.`t1a` on(`test`.`t1a`.`c2` = `test`.`t1b`.`pk` and 2) where `test`.`t1`.`pk` <> 0 and <cache>(`test`.`t1`.`c1`) = `test`.`t1a`.`c1` and `test`.`t2`.`c3` = `test`.`t1b`.`c4`)))
+Note 1003 select `test`.`t1`.`pk` AS `pk` from `test`.`t1` where <expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`pk`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t1a`.`c1` from `test`.`t1b` join `test`.`t2` left join `test`.`t1a` on(`test`.`t1a`.`c2` = `test`.`t1b`.`pk` and 2) where `test`.`t2`.`c3` = `test`.`t1b`.`c4` and `test`.`t1`.`pk` <> 0 and <cache>(`test`.`t1`.`c1`) = `test`.`t1a`.`c1`)))
SELECT pk
FROM t1
WHERE c1 IN
@@ -363,7 +363,7 @@ AND a = SOME (SELECT b FROM t5));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t3 ALL NULL NULL NULL NULL 2 Using where
2 DEPENDENT SUBQUERY t5 index c c 10 NULL 2 Using where; Using index; Start temporary
-2 DEPENDENT SUBQUERY t4 eq_ref PRIMARY PRIMARY 4 test.t5.b 1 Using index condition; Using where; End temporary
+2 DEPENDENT SUBQUERY t4 eq_ref PRIMARY PRIMARY 4 test.t5.b 1 Using where; End temporary
SELECT *
FROM t3
WHERE t3.b > ALL (
diff --git a/mysql-test/t/subselect4.test b/mysql-test/t/subselect4.test
index 93389571c5c..1313d2e49e7 100644
--- a/mysql-test/t/subselect4.test
+++ b/mysql-test/t/subselect4.test
@@ -2308,4 +2308,47 @@ FROM (t1 JOIN t1 AS ref_t1 ON
DROP TABLE t1;
+--echo #
+--echo # MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
+--echo #
+
+CREATE TABLE t1 (
+ key1 varchar(30) NOT NULL,
+ col1 int(11) NOT NULL,
+ filler char(100)
+);
+insert into t1 select seq, seq, seq from seq_1_to_100;
+
+CREATE TABLE t10 (
+ key1 varchar(30) NOT NULL,
+ col1 int,
+ filler char(100),
+ PRIMARY KEY (key1)
+);
+insert into t10 select seq, seq, seq from seq_1_to_1000;
+
+CREATE TABLE t11 (
+ key1 varchar(30) NOT NULL,
+ filler char(100),
+ PRIMARY KEY (key1)
+);
+insert into t11 select seq, seq from seq_1_to_1000;
+
+
+set @tmp_os=@@optimizer_switch;
+set optimizer_switch='semijoin=off,materialization=off';
+
+--echo # Must use range access (not full scan) for table tms:
+explain select * from t1 hist
+WHERE
+ key1 IN ('1','2','3','4','5','6','7','8','9','10') AND
+ hist.col1 NOT IN (SELECT tn.col1
+ FROM t10 tn JOIN t11 tms ON tms.key1 = tn.key1
+ WHERE tn.key1 IN ('1','2','3','4','5','6','7','8','9','10')
+ );
+
+set optimizer_switch=@tmp_os;
+
+drop table t1, t10, t11;
+
--echo # End of 10.2 tests
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 25621dfe104..92ba085af5b 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -2769,12 +2769,9 @@ bool Item_in_subselect::inject_in_to_exists_cond(JOIN *join_arg)
{
/* The argument list of the top-level AND may change after fix fields. */
and_args= ((Item_cond*) join_arg->conds)->argument_list();
- List_iterator<Item_equal> li(join_arg->cond_equal->current_level);
- Item_equal *elem;
- while ((elem= li++))
- {
- and_args->push_back(elem, thd->mem_root);
- }
+ ((Item_cond_and *) (join_arg->conds))->m_cond_equal=
+ *join_arg->cond_equal;
+ and_args->append((List<Item> *)&join_arg->cond_equal->current_level);
}
}
1
0
[Commits] 55683fc8a98: MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
by psergey 16 Feb '22
by psergey 16 Feb '22
16 Feb '22
revision-id: 55683fc8a981ad80bf2c74e6d585af50bdda4136 (mariadb-10.2.43-7-g55683fc8a98)
parent(s): cf574cf53b168992b911d5fc32c590a6ee03a56a
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-02-16 21:04:56 +0300
message:
MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
[Patch idea by Igor Babaev]
Symptom: for IN (SELECT ...) subqueries using IN-to-EXISTS transformation,
the optimizer was unable to make inferences using multiple equalities.
The cause is code Item_in_subselect::inject_in_to_exists_cond() which may
break invariants that Multiple-Equality code relies on. In particular, it
may produce a WHERE condition with an empty Item_cond::m_cond_equal.
Fixed this by making inject_in_to_exists_cond() copy join_arg->cond_equal into Item_cond::m_cond_equal of the top-level AND condition.
---
mysql-test/r/subselect2.result | 2 +-
mysql-test/r/subselect3.result | 4 +--
mysql-test/r/subselect3_jcl6.result | 4 +--
mysql-test/r/subselect4.result | 38 ++++++++++++++++++++++++++++-
mysql-test/r/subselect_mat_cost_bugs.result | 4 +--
mysql-test/t/subselect4.test | 38 +++++++++++++++++++++++++++++
sql/item_subselect.cc | 9 +++----
7 files changed, 85 insertions(+), 14 deletions(-)
diff --git a/mysql-test/r/subselect2.result b/mysql-test/r/subselect2.result
index 155d5ab2a50..91d3e445b48 100644
--- a/mysql-test/r/subselect2.result
+++ b/mysql-test/r/subselect2.result
@@ -262,7 +262,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2c ALL NULL NULL NULL NULL 2 100.00 Using where; Using join buffer (incremental, BNL join)
Warnings:
Note 1276 Field or reference 'test.t1.a' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`c1` AS `c1` from `test`.`t1` where !<expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`a`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t2a`.`c2` from `test`.`t2` `t2a` join `test`.`t2` `t2b` join `test`.`t2` `t2c` where (`test`.`t2b`.`m` <> `test`.`t1`.`a` or `test`.`t2b`.`m` = `test`.`t2a`.`m`) and trigcond(<cache>(`test`.`t1`.`c1`) = `test`.`t2a`.`c2` or `test`.`t2a`.`c2` is null) and `test`.`t2c`.`c2` = `test`.`t2b`.`c2` and `test`.`t2b`.`n` = `test`.`t2a`.`m` having trigcond(`test`.`t2a`.`c2` is null))))
+Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`c1` AS `c1` from `test`.`t1` where !<expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`a`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t2a`.`c2` from `test`.`t2` `t2a` join `test`.`t2` `t2b` join `test`.`t2` `t2c` where `test`.`t2c`.`c2` = `test`.`t2b`.`c2` and `test`.`t2b`.`n` = `test`.`t2a`.`m` and (`test`.`t2b`.`m` <> `test`.`t1`.`a` or `test`.`t2b`.`m` = `test`.`t2a`.`m`) and trigcond(<cache>(`test`.`t1`.`c1`) = `test`.`t2a`.`c2` or `test`.`t2a`.`c2` is null) having trigcond(`test`.`t2a`.`c2` is null))))
DROP TABLE t1,t2;
#
# MDEV-614, also MDEV-536, also LP:1050806:
diff --git a/mysql-test/r/subselect3.result b/mysql-test/r/subselect3.result
index 8d33e4e1606..d92626380ef 100644
--- a/mysql-test/r/subselect3.result
+++ b/mysql-test/r/subselect3.result
@@ -169,7 +169,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 5 test.t1.b 1 100.00 Using where
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) and `test`.`t2`.`a` = `test`.`t1`.`b` having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
drop table t1, t2, t3;
create table t1 (a int NOT NULL, b int NOT NULL, key(a));
insert into t1 values
@@ -197,7 +197,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 4 test.t1.b 1 100.00 Using where
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`) and `test`.`t2`.`a` = `test`.`t1`.`b`))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`)))) AS `Z` from `test`.`t3`
drop table t1,t2,t3;
create table t1 (oref int, grp int);
insert into t1 (oref, grp) values
diff --git a/mysql-test/r/subselect3_jcl6.result b/mysql-test/r/subselect3_jcl6.result
index 6d1e305bc49..2df3f2eea35 100644
--- a/mysql-test/r/subselect3_jcl6.result
+++ b/mysql-test/r/subselect3_jcl6.result
@@ -172,7 +172,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 5 test.t1.b 1 100.00 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) and `test`.`t2`.`a` = `test`.`t1`.`b` having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a` or `test`.`t1`.`a` is null) having trigcond(`test`.`t1`.`a` is null)))) AS `Z` from `test`.`t3`
drop table t1, t2, t3;
create table t1 (a int NOT NULL, b int NOT NULL, key(a));
insert into t1 values
@@ -200,7 +200,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 ref a a 4 test.t1.b 1 100.00 Using where; Using join buffer (flat, BKA join); Key-ordered Rowid-ordered scan
Warnings:
Note 1276 Field or reference 'test.t3.oref' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`) and `test`.`t2`.`a` = `test`.`t1`.`b`))) AS `Z` from `test`.`t3`
+Note 1003 select `test`.`t3`.`a` AS `a`,`test`.`t3`.`oref` AS `oref`,<expr_cache><`test`.`t3`.`a`,`test`.`t3`.`oref`>(<in_optimizer>(`test`.`t3`.`a`,<exists>(select `test`.`t1`.`a` from `test`.`t1` join `test`.`t2` where `test`.`t2`.`a` = `test`.`t1`.`b` and `test`.`t2`.`b` = `test`.`t3`.`oref` and trigcond(<cache>(`test`.`t3`.`a`) = `test`.`t1`.`a`)))) AS `Z` from `test`.`t3`
drop table t1,t2,t3;
create table t1 (oref int, grp int);
insert into t1 (oref, grp) values
diff --git a/mysql-test/r/subselect4.result b/mysql-test/r/subselect4.result
index 2657977dae7..e8a91d26efe 100644
--- a/mysql-test/r/subselect4.result
+++ b/mysql-test/r/subselect4.result
@@ -2351,7 +2351,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 system NULL NULL NULL NULL 1 100.00
2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 2 100.00 Using where
Warnings:
-Note 1003 select 3 AS `f` from dual where !<expr_cache><3>(<in_optimizer>(3,<exists>(select `test`.`t1`.`b` from `test`.`t1` where (`test`.`t1`.`c` = 'USA' or `test`.`t1`.`c` <> 'USA') and trigcond(<cache>(3) = `test`.`t1`.`b` or `test`.`t1`.`b` is null) and `test`.`t1`.`b` = `test`.`t1`.`a` having trigcond(`test`.`t1`.`b` is null))))
+Note 1003 select 3 AS `f` from dual where !<expr_cache><3>(<in_optimizer>(3,<exists>(select `test`.`t1`.`b` from `test`.`t1` where `test`.`t1`.`b` = `test`.`t1`.`a` and (`test`.`t1`.`c` = 'USA' or `test`.`t1`.`c` <> 'USA') and trigcond(<cache>(3) = `test`.`t1`.`b` or `test`.`t1`.`b` is null) having trigcond(`test`.`t1`.`b` is null))))
SELECT * FROM t2
WHERE f NOT IN (SELECT b FROM t1
WHERE 0 OR (c IN ('USA') OR c NOT IN ('USA')) AND a = b);
@@ -2803,4 +2803,40 @@ FROM (t1 JOIN t1 AS ref_t1 ON
(t1.i1 > (SELECT ref_t1.i1 AS c0 FROM t1 b ORDER BY -c0)));
ERROR 21000: Subquery returns more than 1 row
DROP TABLE t1;
+#
+# MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
+#
+CREATE TABLE t1 (
+key1 varchar(30) NOT NULL,
+col1 int(11) NOT NULL,
+filler char(100)
+);
+insert into t1 select seq, seq, seq from seq_1_to_100;
+CREATE TABLE t10 (
+key1 varchar(30) NOT NULL,
+col1 int,
+filler char(100),
+PRIMARY KEY (key1)
+);
+insert into t10 select seq, seq, seq from seq_1_to_1000;
+CREATE TABLE t11 (
+key1 varchar(30) NOT NULL,
+filler char(100),
+PRIMARY KEY (key1)
+);
+insert into t11 select seq, seq from seq_1_to_1000;
+# Must use range access (not full scan) for table tms:
+explain update t1 hist
+set filler='aaa'
+WHERE
+key1 IN ('1','2','3','4','5','6','7','8','9','10') AND
+hist.col1 NOT IN (SELECT tn.col1
+FROM t10 tn JOIN t11 tms ON tms.key1 = tn.key1
+WHERE tn.key1 IN ('1','2','3','4','5','6','7','8','9','10')
+);
+id select_type table type possible_keys key key_len ref rows Extra
+1 PRIMARY hist ALL NULL NULL NULL NULL 100 Using where
+2 DEPENDENT SUBQUERY tms range PRIMARY PRIMARY 32 NULL 10 Using where; Using index
+2 DEPENDENT SUBQUERY tn eq_ref PRIMARY PRIMARY 32 test.tms.key1 1 Using where
+drop table t1, t10, t11;
# End of 10.2 tests
diff --git a/mysql-test/r/subselect_mat_cost_bugs.result b/mysql-test/r/subselect_mat_cost_bugs.result
index fd768b1efd4..87cbc1bb078 100644
--- a/mysql-test/r/subselect_mat_cost_bugs.result
+++ b/mysql-test/r/subselect_mat_cost_bugs.result
@@ -100,7 +100,7 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
2 DEPENDENT SUBQUERY t2 index c3 c3 9 NULL 2 100.00 Using where; Using index; Using join buffer (flat, BNL join)
Warnings:
Note 1276 Field or reference 'test.t1.pk' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`pk` AS `pk` from `test`.`t1` where <expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`pk`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t1a`.`c1` from `test`.`t1b` join `test`.`t2` left join `test`.`t1a` on(`test`.`t1a`.`c2` = `test`.`t1b`.`pk` and 2) where `test`.`t1`.`pk` <> 0 and <cache>(`test`.`t1`.`c1`) = `test`.`t1a`.`c1` and `test`.`t2`.`c3` = `test`.`t1b`.`c4`)))
+Note 1003 select `test`.`t1`.`pk` AS `pk` from `test`.`t1` where <expr_cache><`test`.`t1`.`c1`,`test`.`t1`.`pk`>(<in_optimizer>(`test`.`t1`.`c1`,<exists>(select `test`.`t1a`.`c1` from `test`.`t1b` join `test`.`t2` left join `test`.`t1a` on(`test`.`t1a`.`c2` = `test`.`t1b`.`pk` and 2) where `test`.`t2`.`c3` = `test`.`t1b`.`c4` and `test`.`t1`.`pk` <> 0 and <cache>(`test`.`t1`.`c1`) = `test`.`t1a`.`c1`)))
SELECT pk
FROM t1
WHERE c1 IN
@@ -363,7 +363,7 @@ AND a = SOME (SELECT b FROM t5));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t3 ALL NULL NULL NULL NULL 2 Using where
2 DEPENDENT SUBQUERY t5 index c c 10 NULL 2 Using where; Using index; Start temporary
-2 DEPENDENT SUBQUERY t4 eq_ref PRIMARY PRIMARY 4 test.t5.b 1 Using index condition; Using where; End temporary
+2 DEPENDENT SUBQUERY t4 eq_ref PRIMARY PRIMARY 4 test.t5.b 1 Using where; End temporary
SELECT *
FROM t3
WHERE t3.b > ALL (
diff --git a/mysql-test/t/subselect4.test b/mysql-test/t/subselect4.test
index 93389571c5c..a0cb97bbc7e 100644
--- a/mysql-test/t/subselect4.test
+++ b/mysql-test/t/subselect4.test
@@ -2308,4 +2308,42 @@ FROM (t1 JOIN t1 AS ref_t1 ON
DROP TABLE t1;
+--echo #
+--echo # MDEV-22377: Subquery in an UPDATE query uses full scan instead of range
+--echo #
+
+CREATE TABLE t1 (
+ key1 varchar(30) NOT NULL,
+ col1 int(11) NOT NULL,
+ filler char(100)
+);
+insert into t1 select seq, seq, seq from seq_1_to_100;
+
+CREATE TABLE t10 (
+ key1 varchar(30) NOT NULL,
+ col1 int,
+ filler char(100),
+ PRIMARY KEY (key1)
+);
+insert into t10 select seq, seq, seq from seq_1_to_1000;
+
+CREATE TABLE t11 (
+ key1 varchar(30) NOT NULL,
+ filler char(100),
+ PRIMARY KEY (key1)
+);
+insert into t11 select seq, seq from seq_1_to_1000;
+
+--echo # Must use range access (not full scan) for table tms:
+explain update t1 hist
+set filler='aaa'
+WHERE
+ key1 IN ('1','2','3','4','5','6','7','8','9','10') AND
+ hist.col1 NOT IN (SELECT tn.col1
+ FROM t10 tn JOIN t11 tms ON tms.key1 = tn.key1
+ WHERE tn.key1 IN ('1','2','3','4','5','6','7','8','9','10')
+ );
+
+drop table t1, t10, t11;
+
--echo # End of 10.2 tests
diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc
index 25621dfe104..92ba085af5b 100644
--- a/sql/item_subselect.cc
+++ b/sql/item_subselect.cc
@@ -2769,12 +2769,9 @@ bool Item_in_subselect::inject_in_to_exists_cond(JOIN *join_arg)
{
/* The argument list of the top-level AND may change after fix fields. */
and_args= ((Item_cond*) join_arg->conds)->argument_list();
- List_iterator<Item_equal> li(join_arg->cond_equal->current_level);
- Item_equal *elem;
- while ((elem= li++))
- {
- and_args->push_back(elem, thd->mem_root);
- }
+ ((Item_cond_and *) (join_arg->conds))->m_cond_equal=
+ *join_arg->cond_equal;
+ and_args->append((List<Item> *)&join_arg->cond_equal->current_level);
}
}
1
0
[Commits] 4a9df3b6264: This adds a my.cnf parameter, rocksdb_use_range_locking.
by psergey 14 Feb '22
by psergey 14 Feb '22
14 Feb '22
revision-id: 4a9df3b6264488a731af910576ef9fa8e375150c ()
parent(s): 5098a1526a5021064ed7307e7e0ba30453723f66
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-02-14 23:47:22 +0300
message:
This adds a my.cnf parameter, rocksdb_use_range_locking.
When it is ON, MyRocks will:
- initialize RocksDB to use range-locking lock manager
- for all DML operations (including SELECT .. FOR UPDATE) will lock
the scanned range before reading/modifying rows.
- In range locking mode, there is no snapshot checking (cannot do that
for ranges). Instead, MyRocks will read and modify latest committed
data, just like InnoDB does (in the code, grep for (start|end)_ignore_
snapshot)
- Queries that do not have a finite range to scan, like
UPDATE t1 .... ORDER BY t1.key LIMIT n
will use a "Locking iterator" which will read rows, lock the range,
and re-read the rows. See class LockingIterator.
---
mysql-test/suite/rocksdb/combinations | 3 +
.../suite/rocksdb/include/have_range_locking.inc | 3 +
.../suite/rocksdb/include/not_range_locking.inc | 5 +
.../rocksdb/include/select_from_is_rowlocks.inc | 99 +++
.../suite/rocksdb/r/hermitage-range_locking.result | 652 +++++++++++++++++++
...issue243_transactionStatus-range_locking.result | 182 ++++++
.../r/level_repeatable_read-range_locking.result | 106 +++
mysql-test/suite/rocksdb/r/range_locking.result | 596 +++++++++++++++++
.../suite/rocksdb/r/range_locking_conc_test.result | 4 +
.../r/range_locking_deadlock_tracking.result | 453 +++++++++++++
.../rocksdb/r/range_locking_escalation.result | 27 +
.../rocksdb/r/range_locking_partial_index.result | 203 ++++++
.../rocksdb/r/range_locking_refresh_iter.result | 50 ++
.../suite/rocksdb/r/range_locking_rev_cf.result | 556 ++++++++++++++++
.../rocksdb/r/range_locking_seek_for_update.result | 291 +++++++++
.../r/range_locking_seek_for_update2.result | 142 ++++
.../r/range_locking_seek_for_update2_rev_cf.result | 142 ++++
.../rocksdb/r/range_locking_shared_locks.result | 251 ++++++++
mysql-test/suite/rocksdb/r/rocksdb.result | 3 +
.../suite/rocksdb/r/rocksdb_read_free_rpl.result | 2 +-
.../rocksdb/r/rocksdb_timeout_rollback.result | 3 +
.../suite/rocksdb/r/select_count_for_update.result | 4 +-
mysql-test/suite/rocksdb/r/trx_info.result | 7 +-
mysql-test/suite/rocksdb/r/unique_sec.result | 4 +
.../suite/rocksdb/r/unique_sec_rev_cf.result | 4 +
mysql-test/suite/rocksdb/t/deadlock_tracking.test | 7 +-
.../t/drop_cf_before_show_deadlock_info.test | 4 +
.../suite/rocksdb/t/hermitage-range_locking.test | 15 +
mysql-test/suite/rocksdb/t/hermitage.inc | 14 +-
mysql-test/suite/rocksdb/t/hermitage.test | 3 +
mysql-test/suite/rocksdb/t/i_s_deadlock.test | 4 +
mysql-test/suite/rocksdb/t/issue111.test | 4 +
.../issue243_transactionStatus-range_locking.test | 10 +
.../rocksdb/t/issue243_transactionStatus.test | 4 +
.../t/level_repeatable_read-range_locking.test | 9 +
.../suite/rocksdb/t/level_repeatable_read.test | 3 +
mysql-test/suite/rocksdb/t/lock_info.test | 3 +
mysql-test/suite/rocksdb/t/locking_issues.test | 3 +
mysql-test/suite/rocksdb/t/max_row_locks.test | 1 +
mysql-test/suite/rocksdb/t/range_locking.inc | 612 ++++++++++++++++++
mysql-test/suite/rocksdb/t/range_locking.test | 6 +
.../suite/rocksdb/t/range_locking_conc_test.py | 450 +++++++++++++
.../suite/rocksdb/t/range_locking_conc_test.test | 18 +
.../suite/rocksdb/t/range_locking_conc_test.txt | 91 +++
.../rocksdb/t/range_locking_deadlock_tracking.test | 196 ++++++
.../rocksdb/t/range_locking_escalation-master.opt | 1 +
.../suite/rocksdb/t/range_locking_escalation.test | 39 ++
.../rocksdb/t/range_locking_partial_index.test | 120 ++++
.../rocksdb/t/range_locking_refresh_iter.test | 70 ++
.../suite/rocksdb/t/range_locking_rev_cf.test | 12 +
.../rocksdb/t/range_locking_seek_for_update.test | 308 +++++++++
.../rocksdb/t/range_locking_seek_for_update2.inc | 55 ++
.../rocksdb/t/range_locking_seek_for_update2.test | 4 +
.../t/range_locking_seek_for_update2_rev_cf.test | 4 +
.../t/range_locking_seek_for_update_iter_end.inc | 41 ++
.../rocksdb/t/range_locking_shared_locks.test | 202 ++++++
mysql-test/suite/rocksdb/t/rocksdb.test | 3 +
.../suite/rocksdb/t/rocksdb_concurrent_delete.test | 4 +
mysql-test/suite/rocksdb/t/rocksdb_locks.test | 3 +
.../suite/rocksdb/t/rocksdb_read_free_rpl.test | 2 +-
.../suite/rocksdb/t/rocksdb_timeout_rollback.test | 2 +
mysql-test/suite/rocksdb/t/rpl_row_not_found.inc | 2 +
.../suite/rocksdb/t/select_count_for_update.test | 14 +
.../suite/rocksdb/t/select_lock_in_share_mode.test | 3 +
mysql-test/suite/rocksdb/t/skip_locked_nowait.test | 3 +
mysql-test/suite/rocksdb/t/trx_info.test | 6 +-
mysql-test/suite/rocksdb/t/unique_check.test | 5 +
mysql-test/suite/rocksdb/t/unique_sec.inc | 10 +-
mysql-test/suite/rocksdb/t/varbinary_format.test | 4 +
mysql-test/suite/rocksdb/t/varchar_format.test | 2 +
.../r/rocksdb_max_lock_memory_basic.result | 7 +
.../r/rocksdb_use_range_locking_basic.result | 7 +
.../t/rocksdb_max_lock_memory_basic.test | 5 +
.../t/rocksdb_use_range_locking_basic.test | 5 +
rocksdb | 2 +-
storage/rocksdb/CMakeLists.txt | 1 +
storage/rocksdb/get_rocksdb_files.sh | 2 +-
storage/rocksdb/ha_rocksdb.cc | 717 +++++++++++++++++++--
storage/rocksdb/ha_rocksdb.h | 31 +-
storage/rocksdb/locking-iterator-partial-index.txt | 120 ++++
storage/rocksdb/nosql_access.cc | 4 +-
storage/rocksdb/rdb_i_s.cc | 93 ++-
storage/rocksdb/rdb_iterator.cc | 62 +-
storage/rocksdb/rdb_iterator.h | 29 +-
storage/rocksdb/rdb_locking_iter.cc | 285 ++++++++
storage/rocksdb/rdb_locking_iter.h | 120 ++++
storage/rocksdb/rdb_utils.cc | 27 +
storage/rocksdb/rdb_utils.h | 3 +
88 files changed, 7590 insertions(+), 98 deletions(-)
diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations
index acf2f49a0c3..5e3b56932c6 100644
--- a/mysql-test/suite/rocksdb/combinations
+++ b/mysql-test/suite/rocksdb/combinations
@@ -7,3 +7,6 @@ rocksdb_write_policy=write_prepared
[write_unprepared]
rocksdb_write_policy=write_unprepared
rocksdb_write_batch_flush_threshold=1
+
+[range_locking]
+rocksdb_use_range_locking=1
diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc
new file mode 100644
index 00000000000..a8600daea77
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc
@@ -0,0 +1,3 @@
+if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) {
+ --skip Test requires range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc
new file mode 100644
index 00000000000..62c26b134bc
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc
@@ -0,0 +1,5 @@
+--let $_use_range_locking= `select @@rocksdb_use_range_locking`
+if ($_use_range_locking == 1)
+{
+ --skip Test doesn't support range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
new file mode 100644
index 00000000000..cc6a328e566
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
@@ -0,0 +1,99 @@
+--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+#
+# An include to print contents of I_S.ROCKSB_LOCKS
+#
+# Implicit "parameters"
+# - Currently it prints locks on t1.PRIMARY
+#
+# Explicit "parameter" variables:
+# - $TRX1_ID - print this transaction as "TRX1"
+# - $TRX2_ID - print this transaction as "TRX2"
+#
+# - $select_from_is_rowlocks_current_trx_only
+# - $order_by_rowkey
+#
+# - $SECOND_INDEX_NAME
+
+--disable_query_log
+set @cf_id=(select column_family from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx
+ where thread_id=connection_id());
+set @indexnr= (select lower(lpad(hex(index_number),8,'0')) from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+set @indexnr_next= (select lower(lpad(hex(index_number+1),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+let $extra_where = where 1;
+
+if ($select_from_is_rowlocks_current_trx_only)
+{
+ let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id);
+}
+
+## transaction column
+
+# If TRX1_ID is not specified, get the current transaction:
+let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id");
+if ($TRX1_ID)
+{
+ let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID");
+}
+
+if ($TRX2_ID)
+{
+ let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID");
+}
+
+## CF_ID column
+let $cf_id_col= column_family_id;
+
+if ($SECOND_INDEX_NAME)
+{
+ eval set @cf2_id=(select column_family from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='$SECOND_INDEX_NAME');
+
+ let $cf_id_col= replace($cf_id_col, @cf2_id, "\$cf2_id");
+}
+let $cf_id_col= replace($cf_id_col, @cf_id, "\$cf_id");
+
+## KEY column
+let $key_col= (`key`);
+if ($SECOND_INDEX_NAME)
+{
+ eval set @indexnr2= (select lower(lpad(hex(index_number),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='$SECOND_INDEX_NAME');
+
+ eval set @indexnr2_next= (select lower(lpad(hex(index_number+1),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='$SECOND_INDEX_NAME');
+
+ let $key_col = replace($key_col, @indexnr2, '\${indexnr2}');
+ let $key_col = replace($key_col, @indexnr2_next, '\${indexnr2+1}');
+}
+
+let $key_col = replace($key_col, @indexnr, '\${indexnr}');
+let $key_col = replace($key_col, @indexnr_next, '\${indexnr+1}');
+
+## ORDER BY
+if ($order_by_rowkey)
+{
+ let $extra_order_by = ORDER BY 3,2;
+}
+
+if (!$order_by_rowkey)
+{
+ --sorted_result
+}
+
+eval select
+ $cf_id_col as COLUMN_FAMILY_ID,
+ $transaction_col as TRANSACTION_ID,
+ $key_col as `KEY`,
+ mode
+from information_schema.rocksdb_locks $extra_where $extra_order_by;
+
+--enable_query_log
diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
new file mode 100644
index 00000000000..3938fa38b6c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
@@ -0,0 +1,652 @@
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 11
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 12
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+1 12
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
new file mode 100644
index 00000000000..b48535c5ee6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
@@ -0,0 +1,182 @@
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (
+id INT,
+val1 INT,
+val2 INT,
+PRIMARY KEY (id)
+) ENGINE=rocksdb;
+INSERT INTO t1 VALUES(1,1,1),(2,1,2);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 1 2
+UPDATE t1 SET val1=2 WHERE id=2;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t1 VALUES(20,1,1),(30,30,30);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 1 1
+30 30 30
+UPDATE t1 SET val1=20, val2=20 WHERE id=20;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 20 20
+30 30 30
+DELETE FROM t1 WHERE id=30;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+---SNAPSHOT, ACTIVE NUM sec
+MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION
+SHOW ENGINE rocksdb TRANSACTION STATUS
+lock count 4, write count 4
+insert count 2, update count 1, delete count 1
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+START TRANSACTION;
+INSERT INTO t1 VALUES(40,40,40);
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+COMMIT;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=1;
+DROP TABLE t1;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+UNIQUE KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
new file mode 100644
index 00000000000..0592b099238
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
@@ -0,0 +1,106 @@
+DROP TABLE IF EXISTS t1;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb;
+START TRANSACTION;
+SELECT a FROM t1;
+a
+connection con2;
+BEGIN;
+INSERT INTO t1 (a) VALUES(1);
+connection con1;
+SELECT a FROM t1;
+a
+connection con2;
+INSERT INTO t1 (a) VALUES (2);
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+100 FROM t1;
+SELECT a FROM t1;
+a
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+200 FROM t1;
+SELECT a FROM t1;
+a
+201
+202
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection default;
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb;
+INSERT INTO t2 (a) VALUES (1);
+COMMIT;
+connection con1;
+BEGIN;
+SELECT a from t2;
+a
+1
+INSERT INTO t2 (a) VALUES (1), (3);
+ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY'
+connection con2;
+INSERT INTO t2 (a) VALUES (2);
+COMMIT;
+connection con1;
+SELECT a from t2;
+a
+1
+COMMIT;
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t1;
+DROP TABLE t2;
+CREATE TABLE t3 (
+pk int unsigned PRIMARY KEY,
+count int unsigned DEFAULT '0'
+) ENGINE=ROCKSDB;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+BEGIN;
+SELECT * FROM t3;
+pk count
+connection con2;
+BEGIN;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+connection con1;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+SELECT count FROM t3;
+count
+1
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t3;
diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result
new file mode 100644
index 00000000000..4a2f99f86fc
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking.result
@@ -0,0 +1,596 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'default',
+unique key(a) comment 'default'
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X
+rollback;
+begin;
+# The following will show a range lock on 2-9 and also a point lock on 10.
+# This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2
+3 2223
+4 2223
+5 2223
+6 6
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Another no-snapshot-checking test, this time for single-statement
+# transaction
+#
+create table t1 (
+pk int,
+a int,
+name varchar(16),
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+connect con1,localhost,root,,;
+connection con1;
+select get_lock('row1', 100);
+get_lock('row1', 100)
+1
+connection default;
+# The following will read the first row (1,1,'row1'), and stop.
+update t1 set a=a+100 where get_lock(name, 1000)=1;
+connection con1;
+update t1 set a=5 where pk=2;
+select release_lock('row1');
+release_lock('row1')
+1
+connection default;
+# Look at the row with pk=2:
+# 2, 105, row2 - means the UPDATE was reading current data (Correct)
+# 2, 102, row - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+pk a name
+1 101 row1
+2 105 row2
+# Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+release_lock(name)
+1
+1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, will wait until we see con1 have state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'default'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
+#
+# Range locking and READ-COMMITTED isolation level
+#
+connect con1,localhost,root,,;
+connection con1;
+set session transaction isolation level read committed;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+begin;
+select * from t1 where pk between 2 and 5 for update;
+pk a
+2 NULL
+3 NULL
+4 NULL
+5 NULL
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+begin;
+update t1 set a=a+1 where pk between 2 and 5;
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+drop table t1;
+disconnect con1;
+connection default;
+#
+# Range Locking and READ-COMMITTED, another test
+#
+create table t1 (
+pk int,
+a int,
+b int,
+primary key (pk),
+key(a)
+) engine=rocksdb;
+insert into t1 values
+(1, 100, 1000),
+(2, 200, 2000),
+(3, 300, 3000);
+set transaction isolation level repeatable read;
+begin;
+update t1 set b = b + 1 where a > 200;
+connect con1,localhost,root,,;
+connection con1;
+set transaction isolation level read committed;
+begin;
+insert into t1 values (4, 150, 1500);
+insert into t1 values (5, 250, 1500);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a
+rollback;
+disconnect con1;
+connection default;
+rollback;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_conc_test.result b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result
new file mode 100644
index 00000000000..a70152da808
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result
@@ -0,0 +1,4 @@
+set @save_rlwt=@@rocksdb_lock_wait_timeout;
+# Run range_locking_conc_test.py
+set global rocksdb_lock_wait_timeout= @save_rlwt;
+DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
new file mode 100644
index 00000000000..00fd1788dfd
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
@@ -0,0 +1,453 @@
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+# Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+create table t (i int primary key) engine=rocksdb;
+insert into t values (1), (2), (3);
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #1
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #2
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 10;
+Deadlock #3
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 1;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set rocksdb_deadlock_detect_depth = 2;
+# Range locking code will report deadlocks, because it doesn't honor
+# rocksdb_deadlock_detect_depth:
+Deadlock #4
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+begin;
+select * from t where i=3 for update;
+i
+3
+select * from t where i=2 for update;
+select * from t where i=3 for update;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+deadlocks
+true
+rollback;
+i
+3
+rollback;
+i
+2
+rollback;
+set global rocksdb_max_latest_deadlocks = 5;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #6
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+begin;
+update t1 set value=value+200 where id=3;
+update t1 set value=value+100 where id=3;
+update t1 set value=value+200 where id=1;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select * from t1;
+id value
+1 101
+2 102
+3 103
+4 4
+5 5
+drop table t1;
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 0;
+# Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
new file mode 100644
index 00000000000..dd19d728ef2
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
@@ -0,0 +1,27 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+show variables like 'rocksdb_max_lock_memory';
+Variable_name Value
+rocksdb_max_lock_memory 1024
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 0
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select
+A.a + B.a*10 + C.a*100 + D.a*1000,
+12345
+from t0 A, t0 B, t0 C, t0 D;
+select count(*) from t1;
+count(*)
+10000
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 127
+drop table t0,t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_partial_index.result b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result
new file mode 100644
index 00000000000..2e22d8b3206
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result
@@ -0,0 +1,203 @@
+create table t0(a int primary key) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int not null,
+b int,
+primary key (pk1, pk2),
+key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5'
+) engine=rocksdb;
+insert into t1 select
+1,
+A.a,
+100 + A.a,
+123456
+from t0 A;
+select * from t1 force index (key1) where pk1=1;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+insert into t1 select
+2,
+A.a,
+100 + A.a,
+123456
+from t0 A limit 3;
+insert into t1 select
+10000 + A.a +10 *B.a +100*C.a,
+A.a,
+100 + A.a,
+123456
+from t0 A, t0 B, t0 C;
+create table t3(pk int primary key);
+connect con2,localhost,root,,;
+connection con2;
+begin;
+insert into t3 values(3333333);
+connection default;
+#
+# First, test a query with range lock
+#
+explain
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range key1 key1 4 NULL # 100.00 Using index condition
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` FORCE INDEX (`key1`) where ((`test`.`t1`.`pk1` >= 1) and (`test`.`t1`.`pk1` <= 10))
+connect con1,localhost,root,,;
+connection con1;
+begin;
+# Allocate a snapshot
+select * from t0 where a=3;
+a
+3
+connection default;
+# Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+connection con1;
+# This doesn't see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+# This DOES see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+1 11 99999 99999
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+2 11 99999 99999
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}8000000a:1 X
+$cf_id $trx_id ${indexnr}8000000180000000 X
+$cf_id $trx_id ${indexnr}8000000180000001 X
+$cf_id $trx_id ${indexnr}8000000180000002 X
+$cf_id $trx_id ${indexnr}8000000180000003 X
+$cf_id $trx_id ${indexnr}8000000180000004 X
+$cf_id $trx_id ${indexnr}8000000180000005 X
+$cf_id $trx_id ${indexnr}8000000180000006 X
+$cf_id $trx_id ${indexnr}8000000180000007 X
+$cf_id $trx_id ${indexnr}8000000180000008 X
+$cf_id $trx_id ${indexnr}8000000180000009 X
+$cf_id $trx_id ${indexnr}800000018000000b X
+$cf_id $trx_id ${indexnr}8000000280000000 X
+$cf_id $trx_id ${indexnr}8000000280000001 X
+$cf_id $trx_id ${indexnr}8000000280000002 X
+$cf_id $trx_id ${indexnr}800000028000000b X
+rollback;
+#
+# Now, test a query with LockingIterator
+#
+delete from t1 where b=99999;
+begin;
+# Allocate a snapshot
+select * from t0 where a=3;
+a
+3
+connection default;
+# Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+connection con1;
+# This doesn't see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+10000 0 100 123456
+10001 1 101 123456
+# This DOES see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+1 11 99999 99999
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+2 11 99999 99999
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}800000018000006480000000 X
+$cf2_id $trx_id ${indexnr2}800000018000006480000000-${indexnr2}800000018000006580000001 X
+$cf2_id $trx_id ${indexnr2}800000018000006580000001-${indexnr2}800000018000006680000002 X
+$cf2_id $trx_id ${indexnr2}800000018000006680000002-${indexnr2}800000018000006780000003 X
+$cf2_id $trx_id ${indexnr2}800000018000006780000003-${indexnr2}800000018000006880000004 X
+$cf2_id $trx_id ${indexnr2}800000018000006880000004-${indexnr2}800000018000006980000005 X
+$cf2_id $trx_id ${indexnr2}800000018000006980000005-${indexnr2}800000018000006a80000006 X
+$cf2_id $trx_id ${indexnr2}800000018000006a80000006-${indexnr2}800000018000006b80000007 X
+$cf2_id $trx_id ${indexnr2}800000018000006b80000007-${indexnr2}800000018000006c80000008 X
+$cf2_id $trx_id ${indexnr2}800000018000006c80000008-${indexnr2}800000018000006d80000009 X
+$cf2_id $trx_id ${indexnr2}800000018000006d80000009-${indexnr2}800000018001869f8000000b X
+$cf2_id $trx_id ${indexnr2}800000018001869f8000000b-${indexnr2+1} X
+$cf2_id $trx_id ${indexnr2}80000002-${indexnr2}80000002:1 X
+$cf2_id $trx_id ${indexnr2}80000002-${indexnr2}80000003 X
+$cf_id $trx_id ${indexnr}8000000180000000 X
+$cf_id $trx_id ${indexnr}8000000180000001 X
+$cf_id $trx_id ${indexnr}8000000180000002 X
+$cf_id $trx_id ${indexnr}8000000180000003 X
+$cf_id $trx_id ${indexnr}8000000180000004 X
+$cf_id $trx_id ${indexnr}8000000180000005 X
+$cf_id $trx_id ${indexnr}8000000180000006 X
+$cf_id $trx_id ${indexnr}8000000180000007 X
+$cf_id $trx_id ${indexnr}8000000180000008 X
+$cf_id $trx_id ${indexnr}8000000180000009 X
+$cf_id $trx_id ${indexnr}800000018000000b X
+$cf_id $trx_id ${indexnr}80000002-${indexnr}80000003 X
+rollback;
+disconnect con1;
+connection default;
+disconnect con2;
+drop table t0, t1,t3;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
new file mode 100644
index 00000000000..1067087e816
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
@@ -0,0 +1,50 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+set debug_sync='RESET';
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+set debug_sync='now WAIT_FOR con1_stopped';
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+set debug_sync='now SIGNAL con1_cont';
+select * from t1 where pk<100;
+pk a
+0 100
+1 101
+2 102
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+40 5100
+41 5100
+43 5100
+44 5100
+45 5100
+46 146
+47 147
+48 148
+49 149
+commit;
+set debug_sync='RESET';
+drop table t1, ten, one_k;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
new file mode 100644
index 00000000000..e39c6bb3339
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
@@ -0,0 +1,556 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1',
+unique key(a) comment ''
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X
+rollback;
+begin;
+# The following will show a range lock on 2-9 and also a point lock on 10.
+# This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+6 6
+5 2223
+4 2223
+3 2223
+2 2
+1 1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, will wait until we see con1 have state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
+#
+# Range locking and READ-COMMITTED isolation level
+#
+connect con1,localhost,root,,;
+connection con1;
+set session transaction isolation level read committed;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+begin;
+select * from t1 where pk between 2 and 5 for update;
+pk a
+2 NULL
+3 NULL
+4 NULL
+5 NULL
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+begin;
+update t1 set a=a+1 where pk between 2 and 5;
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+drop table t1;
+disconnect con1;
+connection default;
+#
+# Range Locking and READ-COMMITTED, another test
+#
+create table t1 (
+pk int,
+a int,
+b int,
+primary key (pk),
+key(a)
+) engine=rocksdb;
+insert into t1 values
+(1, 100, 1000),
+(2, 200, 2000),
+(3, 300, 3000);
+set transaction isolation level repeatable read;
+begin;
+update t1 set b = b + 1 where a > 200;
+connect con1,localhost,root,,;
+connection con1;
+set transaction isolation level read committed;
+begin;
+insert into t1 values (4, 150, 1500);
+insert into t1 values (5, 250, 1500);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a
+rollback;
+disconnect con1;
+connection default;
+rollback;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
new file mode 100644
index 00000000000..bf1cb916a5b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
@@ -0,0 +1,291 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+# Make another connection to get the lock tree out of the STO-mode
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+# Now, we will just see locks on 10=0xA and 11=0xB:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+#
+# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+#
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3
+select * from t1 where pk>=500 order by pk limit 3 for update;
+pk a
+500 500
+501 501
+502 502
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X
+rollback;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+explain
+select * from t1 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3
+select * from t1 order by pk limit 3 for update;
+pk a
+0 0
+1 1
+2 2
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}8000000b X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# Concurrent tests: let one thread do SeekForUpdate and the other
+# interfere by committing modifications
+#
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+select * from t1 where pk<10;
+pk a
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+pk a
+# Test what happens when another transaction commits a row
+# right before the range we are about to lock (nothing)
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 3 for update;
+connect con1,localhost,root,,;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+11 11
+12 12
+rollback;
+delete from t1 where pk=3;
+#
+# Now, repeat the test but let the other transaction insert the row into
+# the range we are locking
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X
+rollback;
+delete from t1 where pk=8;
+#
+# Repeat the third time, this time deleting the row that SeekForUpdate saw
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+rollback;
+#
+# Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+# error. MyRocks code should now be prepared that data reads cause this
+# error
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+#
+# Test the thd_killed check in the iterator
+#
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+kill query CONN_ID;
+connection default;
+ERROR HY000: Got error 10 'Operation aborted: ' from ROCKSDB
+rollback;
+#
+# Backward scan test
+#
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+insert into t1 values
+(1001, 1001),
+(1005, 1005),
+(1007, 1007),
+(1010, 1010);
+begin;
+select * from t1 order by pk desc limit 2 for update;
+pk a
+1010 1010
+1007 1007
+# The below will lock from pk=1007 (0x3ef) till the end of the table:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X
+rollback;
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+pk a
+1005 1005
+1001 1001
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X
+connection con1;
+rollback;
+connection default;
+rollback;
+#
+# Backward scan test 2: error condition
+#
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+pk a
+1010 1010
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+pk a
+1007 1007
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# A test: full table scan doesn't lock gaps
+#
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 values (10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+connection con1;
+begin;
+select * from t1 for update;
+pk a
+10 10
+20 20
+30 30
+connection con2;
+insert into t1 values (5,5);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result
new file mode 100644
index 00000000000..2afb2eea589
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result
@@ -0,0 +1,142 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rlsfu_test'
+) engine=rocksdb;
+insert into t1 (pk)
+select
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+delete from t1 where pk<100;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 5 for update;
+connection default;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 (pk) values
+(10),(20),(30),(40),(50);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection con1;
+pk a
+10 NULL
+20 NULL
+30 NULL
+40 NULL
+50 NULL
+# This must return 1, no 5:
+select lock_count from information_schema.rocksdb_trx
+where thread_id=CONNECTION_ID();
+lock_count
+1
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=off;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=on;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result
new file mode 100644
index 00000000000..fef9e0ef6a0
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result
@@ -0,0 +1,142 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:rlsfu_test'
+) engine=rocksdb;
+insert into t1 (pk)
+select
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+delete from t1 where pk<100;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 5 for update;
+connection default;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 (pk) values
+(10),(20),(30),(40),(50);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection con1;
+pk a
+10 NULL
+20 NULL
+30 NULL
+40 NULL
+50 NULL
+# This must return 1, no 5:
+select lock_count from information_schema.rocksdb_trx
+where thread_id=CONNECTION_ID();
+lock_count
+1
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rev:rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=off;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rev:rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=on;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
new file mode 100644
index 00000000000..580108de6f6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
@@ -0,0 +1,251 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from t0;
+# A basic test for shared locks
+begin;
+select * from t1 where pk=3 for update;
+pk a
+3 3
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX2_ID ${indexnr}80000005 S
+rollback;
+# Now, TRX2_ID should be gone:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+connection default;
+# Get a read lock on pk=3 (where we have a write lock).
+# The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+pk a
+3 3
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+# Get a write lock on pk=5 (where we have a read lock).
+# The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+pk a
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 X
+connection default;
+rollback;
+#
+# Test if a read lock inhibits write locks
+#
+begin;
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+select * from t1 where pk=8 for update;
+pk a
+8 8
+connection con1;
+begin;
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+select * from t1 where pk between 0 and 4 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+delete from t1 where pk=2;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+# Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+# But this should still prevent us from acquiring a write lock on that value:
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection default;
+rollback;
+drop table t1;
+create table t1 (
+pk int not null primary key,
+a int not null,
+key(a)
+) engine=rocksdb;
+insert into t1
+select
+A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+pk a
+900 900
+connection default;
+begin;
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` between 2 and 5)
+select * from t1 where a between 2 and 5 lock in share mode;
+pk a
+2 2
+3 3
+4 4
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X
+$cf_id $TRX1_ID ${indexnr}80000002 S
+$cf_id $TRX1_ID ${indexnr}80000003 S
+$cf_id $TRX1_ID ${indexnr}80000004 S
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX1_ID ${indexnr}80000006 S
+$cf_id $TRX2_ID ${indexnr}80000384 X
+rollback;
+disconnect con1;
+drop table t0,t1;
+#
+# Test shared point locks and lock escalation
+#
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 0
+connect con1,localhost,root,,;
+connection con1;
+begin;
+# CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+select * from t1 where pk=2500 lock in share mode;
+pk a
+connection default;
+begin;
+# DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+# DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+pk a
+# DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+connection con1;
+# CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000 X
+$cf_id $TRX2_ID ${indexnr}80000001 X
+$cf_id $TRX2_ID ${indexnr}80000002 X
+$cf_id $TRX2_ID ${indexnr}80000003 X
+$cf_id $TRX2_ID ${indexnr}80000004 X
+$cf_id $TRX2_ID ${indexnr}80000005 X
+$cf_id $TRX2_ID ${indexnr}80000006 X
+$cf_id $TRX2_ID ${indexnr}80000007 X
+$cf_id $TRX2_ID ${indexnr}80000008 X
+$cf_id $TRX2_ID ${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0 X
+$cf_id $TRX1_ID ${indexnr}800007d1 X
+$cf_id $TRX1_ID ${indexnr}800007d2 X
+$cf_id $TRX1_ID ${indexnr}800007d3 X
+$cf_id $TRX1_ID ${indexnr}800007d4 X
+$cf_id $TRX1_ID ${indexnr}800007d5 X
+$cf_id $TRX1_ID ${indexnr}800007d6 X
+$cf_id $TRX1_ID ${indexnr}800007d7 X
+$cf_id $TRX1_ID ${indexnr}800007d8 X
+$cf_id $TRX1_ID ${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 8792
+set @save_mlm= @@rocksdb_max_lock_memory;
+# Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+variable_value
+from
+performance_schema.global_status
+where
+variable_name='rocksdb_locktree_current_lock_memory');
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+$cf_id $TRX1_ID ${indexnr}80000bb8 X
+$cf_id $TRX1_ID ${indexnr}80000bb9 X
+$cf_id $TRX1_ID ${indexnr}80000bba X
+$cf_id $TRX1_ID ${indexnr}80000bbb X
+$cf_id $TRX1_ID ${indexnr}80000bbc X
+$cf_id $TRX1_ID ${indexnr}80000bbd X
+$cf_id $TRX1_ID ${indexnr}80000bbe X
+$cf_id $TRX1_ID ${indexnr}80000bbf X
+$cf_id $TRX1_ID ${indexnr}80000bc0 X
+$cf_id $TRX1_ID ${indexnr}80000bc1 X
+connection con1;
+rollback;
+connection default;
+rollback;
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+drop table t0, t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index c6397708446..6c1e9a0d938 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -987,6 +987,7 @@ rocksdb_max_background_jobs 2
rocksdb_max_bottom_pri_background_compactions 0
rocksdb_max_compaction_history 64
rocksdb_max_latest_deadlocks 5
+rocksdb_max_lock_memory 1073741824
rocksdb_max_log_file_size 0
rocksdb_max_manifest_file_size 1073741824
rocksdb_max_manual_compactions 10
@@ -1056,6 +1057,8 @@ rocksdb_use_default_sk_cf OFF
rocksdb_use_direct_io_for_flush_and_compaction OFF
rocksdb_use_direct_reads OFF
rocksdb_use_fsync OFF
+rocksdb_use_range_lock_manager_as_point OFF
+rocksdb_use_range_locking OFF
rocksdb_validate_tables 1
rocksdb_verify_row_debug_checksums OFF
rocksdb_wal_bytes_per_sync 0
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
index 58329d03ebc..71e6ff4d30b 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
@@ -72,7 +72,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
include/sync_slave_sql_with_master.inc
[connection slave]
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
read_free
false
select * from t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
index 1e253a9974b..08a0a2f5942 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
@@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF
begin work;
insert into t1 values (9);
insert into t1 values (10);
+# Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
+a
update t1 set a = a + 1 where a = 2;
begin work;
insert into t1 values (11);
diff --git a/mysql-test/suite/rocksdb/r/select_count_for_update.result b/mysql-test/suite/rocksdb/r/select_count_for_update.result
index 1107aa2f6cb..6672d43eb43 100644
--- a/mysql-test/suite/rocksdb/r/select_count_for_update.result
+++ b/mysql-test/suite/rocksdb/r/select_count_for_update.result
@@ -35,9 +35,9 @@ SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
COUNT(*)
3
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE;
-ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE;
-ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX
connection con1;
COMMIT;
SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
diff --git a/mysql-test/suite/rocksdb/r/trx_info.result b/mysql-test/suite/rocksdb/r/trx_info.result
index ada2e127021..562c1d7022e 100644
--- a/mysql-test/suite/rocksdb/r/trx_info.result
+++ b/mysql-test/suite/rocksdb/r/trx_info.result
@@ -9,5 +9,10 @@ a
2
select * from information_schema.rocksdb_trx;
TRANSACTION_ID STATE NAME WRITE_COUNT LOCK_COUNT TIMEOUT_SEC WAITING_KEY WAITING_COLUMN_FAMILY_ID IS_REPLICATION SKIP_TRX_API READ_ONLY HAS_DEADLOCK_DETECTION NUM_ONGOING_BULKLOAD THREAD_ID QUERY
-_TRX_ID_ STARTED _NAME_ 0 2 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx
+_TRX_ID_ STARTED _NAME_ 0 2_or_3 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx
+select
+if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT
+from information_schema.rocksdb_trx;
+LOCK_COUNT_IS_CORRECT
+1
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result
index 1da78db24b1..d4ef2e0ff2e 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
index d6d06f6ece5..0e71e6481aa 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
index 42e46bb0f28..55e6502c079 100644
--- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test
+++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
@@ -1,3 +1,9 @@
+# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE;
+# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks.
+# A part of this test that works with range locking is in
+# range_locking_deadlock_tracking.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
@@ -137,7 +143,6 @@ rollback;
connection default;
--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
show engine rocksdb transaction status;
-
echo Deadlock #6;
connection con1;
create table t1 (id int primary key, value int) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
index f7eb8151f40..05ae30f2ddd 100644
--- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
+++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
@@ -3,6 +3,10 @@
--source include/have_rocksdb.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because range locking
+# does not provide info in rocksdb_deadlock.
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_query_log
call mtr.add_suppression("Column family '[a-z_]+' not found");
--enable_query_log
diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
new file mode 100644
index 00000000000..55203af9cf8
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
@@ -0,0 +1,15 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+
+# Hermitage is an attempt to test transaction isolation levels.
+# https://github.com/ept/hermitage
+
+let $trx_isolation = READ COMMITTED;
+--source hermitage.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source hermitage.inc
diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc
index 90f7d482533..83815a70459 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.inc
+++ b/mysql-test/suite/rocksdb/t/hermitage.inc
@@ -108,6 +108,8 @@ select * from test where value % 3 = 0;
commit;
--source hermitage_init.inc
+let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`;
+let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`;
connection con1;
update test set value = value + 10;
connection con2;
@@ -117,13 +119,13 @@ send delete from test where value = 20;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 2 => 30
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -147,13 +149,13 @@ send update test set value = 12 where id = 1;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 1 => 12
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -200,12 +202,12 @@ update test set value = 12 where id = 1;
update test set value = 18 where id = 2;
commit;
connection con1;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
delete from test where value = 20; # doesn't delete anything
select * from test where id = 2; # shows 2 => 18
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
delete from test where value = 20;
diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test
index e4138e8d89f..51f3f286a0e 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.test
+++ b/mysql-test/suite/rocksdb/t/hermitage.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See hermitage-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
# Hermitage is an attempt to test transaction isolation levels.
# https://github.com/ept/hermitage
diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
index e0479d6a337..82fa9fc6bbd 100644
--- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test
+++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that
+# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test
index 671ea4708d6..3657e977a70 100644
--- a/mysql-test/suite/rocksdb/t/issue111.test
+++ b/mysql-test/suite/rocksdb/t/issue111.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# The testcase here assumes key tracking is present
+# (and range locking uses InnoDB-like approach, "DMLs use Read Commited")
+--source suite/rocksdb/include/not_range_locking.inc
+
connect (con2,localhost,root,,);
connection default;
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
new file mode 100644
index 00000000000..465fb9099da
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
@@ -0,0 +1,10 @@
+#
+# A range-locking variant of issue243_transactionStatus.test
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $forced_range_locking=1;
+--source issue243_transactionStatus.test
+
+
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
index 1e2f0b41226..5c1948ebe81 100644
--- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+if (!$forced_range_locking) {
+--source suite/rocksdb/include/not_range_locking.inc
+}
+
--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
new file mode 100644
index 00000000000..6c42c7be12c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
@@ -0,0 +1,9 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source transaction_isolation.inc
+
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
index cf29073f69e..b81dcf31ab1 100644
--- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See level_repeatable_read-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
let $trx_isolation = REPEATABLE READ;
--source transaction_isolation.inc
diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test
index 1b624cf38c0..a277c1b8d8d 100644
--- a/mysql-test/suite/rocksdb/t/lock_info.test
+++ b/mysql-test/suite/rocksdb/t/lock_info.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test)
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_warnings
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test
index 035046ae368..95a6676f78a 100644
--- a/mysql-test/suite/rocksdb/t/locking_issues.test
+++ b/mysql-test/suite/rocksdb/t/locking_issues.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# A lot of tests below assume point locking, not range.
+--source suite/rocksdb/include/not_range_locking.inc
+
let $isolation_level = REPEATABLE READ;
--source suite/rocksdb/include/locking_issues_case1_1.inc
diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test
index 4b07f3d8492..d4b2604f1e3 100644
--- a/mysql-test/suite/rocksdb/t/max_row_locks.test
+++ b/mysql-test/suite/rocksdb/t/max_row_locks.test
@@ -1,4 +1,5 @@
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2));
--disable_query_log
diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc
new file mode 100644
index 00000000000..b3ec0ae7367
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.inc
@@ -0,0 +1,612 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+
+eval create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+--echo ### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (15,15);
+
+connection con1;
+rollback;
+
+--echo ## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+
+connection con2;
+begin;
+# This must not block
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test that locks are not released when a statement inside
+--echo ## a transaction is rolled back
+eval
+create table t2 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf',
+ unique key(a) comment '$sk_cf'
+) engine=rocksdb;
+
+insert into t2 values (1,1),(2,2);
+
+begin;
+insert into t2 values (3,3);
+--error ER_DUP_ENTRY
+insert into t2 values (10,2);
+
+connection con2;
+begin;
+# This must time out:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t2 where pk=3 for update;
+
+rollback;
+connection con1;
+rollback;
+drop table t2;
+
+# cleanup
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+
+--echo #
+--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode
+--echo #
+
+connect (con1,localhost,root,,);
+connection con1;
+eval create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+begin;
+select * from t1 where pk=10 for update;
+
+#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+delete from t1 where pk between 25 and 40;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+begin;
+--echo # The following will show a range lock on 2-9 and also a point lock on 10.
+--echo # This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+
+--echo #
+--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ primary key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+select * from t1 where kp1=2 for update;
+
+connection default;
+--echo # The lock on kp1=2 should inhibit the following INSERT:
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values ( 2,5,9999);
+rollback;
+
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+
+--echo #
+--echo # Test that locks on ranges on non-unique secondary keys inhibit
+--echo # modifications of the contents of these ranges
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where kp1=2 for update;
+select * from t1 where kp1=2 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (2, 9, 9999);
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where kp1=2 and kp2=5;
+
+# Update that "moves a row away" from the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=333 where kp1=2 and kp2=3;
+
+# Update that "moves a row into" the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=2 where kp1=1 and kp2=8;
+
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # Transaction isolation test
+--echo #
+
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+
+--echo # Examine the result:
+--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+--echo # (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same test as above, but check the range scan
+--echo #
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+
+--echo # Examine the result:
+--echo # pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same as above, but test SELECT FOR UPDATE.
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+
+--echo # TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+select * from t1 where pk=2 for update;
+select * from t1 where pk=2 lock in share mode;
+select * from t1 where pk=2;
+
+commit;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+if (!$PK_USES_REVERSE_CF) {
+--echo #
+--echo # Another no-snapshot-checking test, this time for single-statement
+--echo # transaction
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ name varchar(16),
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+
+connect (con1,localhost,root,,);
+connection con1;
+select get_lock('row1', 100);
+
+connection default;
+
+--echo # The following will read the first row (1,1,'row1'), and stop.
+
+send update t1 set a=a+100 where get_lock(name, 1000)=1;
+
+# Wait till the default connection has stopped:
+connection con1;
+
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock"
+ AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1";
+--source include/wait_condition.inc
+
+# Update the second row
+update t1 set a=5 where pk=2;
+
+select release_lock('row1');
+
+connection default;
+reap;
+
+--echo # Look at the row with pk=2:
+--echo # 2, 105, row2 - means the UPDATE was reading current data (Correct)
+--echo # 2, 102, row - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+
+--echo # Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+}
+
+--echo #
+--echo # Check that I_S.processlist.state is set correctly now.
+--echo #
+eval
+create table t1(
+ pk int,
+ a int,
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+begin;
+select * from t1 where pk=2 for update;
+
+--connect (con1,localhost,root,,)
+begin;
+set rocksdb_lock_wait_timeout=300;
+send select * from t1 where pk=2 for update;
+
+connection default;
+--echo # Now, will wait until we see con1 have state="Waiting for row lock"
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock"
+ AND INFO = "select * from t1 where pk=2 for update";
+--source include/wait_condition.inc
+
+rollback;
+connection con1;
+--reap
+rollback;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST
+--echo #
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+eval
+create table t1 (
+ pk1 int,
+ pk2 int,
+ a int,
+ primary key(pk1, pk2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a, B.a, A.a*10+B.a
+from
+ t0 A, t0 B;
+
+
+# Get a lock in another connection so that the primary transaction is not using
+# STO optimization, and its locks can be seen in I_S.rocksdb_locks
+--connect (con1,localhost,root,,)
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+
+connection default;
+begin;
+--echo # Should use ref access w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+--echo #
+
+begin;
+--echo # Should use range access with 2 keyparts and w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+
+connection default;
+drop table t0, t1;
+
+--echo #
+--echo # A bug: range locking was not used when scan started at table start or end
+--echo #
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+
+create table t1 (
+ pk int not null,
+ a int,
+ primary key(pk)
+) engine=rocksdb;
+
+insert into t1 select a*2,a*2 from t10;
+
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+
+connection default;
+drop table t0,t10,t1;
+
+--echo #
+--echo # Range locking and READ-COMMITTED isolation level
+--echo #
+connect (con1,localhost,root,,);
+connection con1;
+set session transaction isolation level read committed;
+create table t1 (
+ pk int not null,
+ a int,
+ primary key(pk)
+) engine=rocksdb;
+
+insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+begin;
+select * from t1 where pk between 2 and 5 for update;
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Below should show individual row locks, not locked range:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+begin;
+update t1 set a=a+1 where pk between 2 and 5;
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Below should show individual row locks, not locked range:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+drop table t1;
+disconnect con1;
+connection default;
+
+--echo #
+--echo # Range Locking and READ-COMMITTED, another test
+--echo #
+create table t1 (
+ pk int,
+ a int,
+ b int,
+ primary key (pk),
+ key(a)
+) engine=rocksdb;
+
+insert into t1 values
+(1, 100, 1000),
+(2, 200, 2000),
+(3, 300, 3000);
+
+set transaction isolation level repeatable read;
+begin;
+update t1 set b = b + 1 where a > 200;
+
+connect (con1,localhost,root,,);
+connection con1;
+set transaction isolation level read committed;
+begin;
+insert into t1 values (4, 150, 1500);
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (5, 250, 1500);
+
+rollback;
+
+disconnect con1;
+connection default;
+rollback;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test
new file mode 100644
index 00000000000..5c599238a0a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.test
@@ -0,0 +1,6 @@
+
+--let pk_cf=default
+--let sk_cf=default
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.py b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py
new file mode 100644
index 00000000000..d28006697d0
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py
@@ -0,0 +1,450 @@
+"""
+This script tests Range Locking. See
+ mysql-test/suite/rocksdb/t/range_locking_conc_test.txt for details.
+
+Usage:
+
+ python3 suite/rocksdb/t/range_locking_conc_test.py \
+ [--progress] [--verbose] \
+ root 127.0.0.1 $MASTER_MYPORT test t1 \
+ num_inserts num_insert_threads \
+ num_group_ops num_group_threads
+
+
+ For Example:
+
+ time python3 suite/rocksdb/t/range_locking_conc_test.py --progress root 127.0.0.1 3314 test t1 2000 20 10000 40
+
+"""
+
+import hashlib
+import MySQLdb
+from MySQLdb.constants import ER
+import os
+import random
+import signal
+import sys
+import threading
+import time
+import string
+import traceback
+
+#MAX_PK_VAL = 1000*1000*1000
+MAX_PK_VAL = 1000*1000
+
+show_progress= False
+verbose_output= False
+
+counter_n_inserted = 0
+counter_n_deleted = 0
+counter_n_insert_failed = 0
+counter_n_groups_created = 0
+counter_n_group_create_fails = 0
+counter_n_groups_verified = 0
+counter_n_groups_deleted = 0
+
+def is_deadlock_error(exc):
+ error_code = exc.args[0]
+ return (error_code == MySQLdb.constants.ER.LOCK_DEADLOCK)
+
+#
+# Watcher prints the test progress and some status variables once per second
+#
+class Watcher(threading.Thread):
+ Instance = None
+ def __init__(self, con):
+ threading.Thread.__init__(self)
+ self.should_stop = False
+ self.finished = False
+ self.con = con
+ self.start()
+
+ def run(self):
+ global counter_n_inserted
+ global counter_n_deleted
+ global counter_n_insert_failed
+ global counter_n_groups_created
+ global counter_n_group_create_fails
+ global counter_n_groups_verified
+ global counter_n_groups_deleted
+ event = threading.Event()
+
+ save_counter_n_inserted=0
+ save_counter_n_deleted=0
+ save_counter_n_insert_failed=0
+ save_counter_n_groups_created=0
+ save_counter_n_group_create_fails=0
+ save_counter_n_groups_verified=0
+ save_counter_n_groups_deleted=0
+ save_wait_count=0
+ n=0
+ cur= self.con.cursor();
+ try:
+ while not self.should_stop:
+ event.wait(1)
+
+ cur.execute("show status like '%rocksdb_locktree_wait_count%'");
+ row = cur.fetchone()
+ wait_count= int(row[1])
+
+ print("== %d ========================" % n)
+ print("counter_n_inserted=%d" % (counter_n_inserted - save_counter_n_inserted))
+ print("counter_n_deleted=%d" % (counter_n_deleted - save_counter_n_deleted))
+ print("counter_n_insert_failed=%d" % (counter_n_insert_failed - save_counter_n_insert_failed))
+ print("counter_n_groups_created=%d" % (counter_n_groups_created - save_counter_n_groups_created))
+ print("counter_n_group_create_fails=%d" % (counter_n_group_create_fails - save_counter_n_group_create_fails))
+ print("counter_n_groups_verified=%d" % (counter_n_groups_verified - save_counter_n_groups_verified))
+ print("counter_n_groups_deleted=%d" % (counter_n_groups_deleted - save_counter_n_groups_deleted))
+ print("wait_count=%d" % (wait_count - save_wait_count))
+
+ save_counter_n_inserted=counter_n_inserted
+ save_counter_n_deleted=counter_n_deleted
+ save_counter_n_insert_failed=counter_n_insert_failed
+ save_counter_n_groups_created=counter_n_groups_created
+ save_counter_n_group_create_fails=counter_n_group_create_fails
+ save_counter_n_groups_verified=counter_n_groups_verified
+ save_counter_n_groups_deleted=counter_n_groups_deleted
+ save_wait_count= wait_count
+ n+=1
+
+ except Exception as e:
+ self.exception = traceback.format_exc()
+ print("Watcher caught (%s)" % (e))
+
+ finally:
+ self.finish()
+
+ def finish(self):
+ n=0
+ # Do nothing
+
+
+#
+# A worker is one client thread producing the benchmark workload
+#
+class Worker(threading.Thread):
+ Instance = None
+ def __init__(self, con, table_name, worker_type_arg, num_inserts):
+ threading.Thread.__init__(self)
+ self.finished = False
+ self.num_inserts = num_inserts
+ con.autocommit(False)
+ self.con = con
+ self.rand = random.Random()
+ self.exception = None
+ self.table_name = table_name
+ self.worker_type = worker_type_arg
+ Worker.Instance = self
+ self.start()
+
+ def run(self):
+ self.rand.seed(threading.get_ident())
+ my_id= threading.get_ident()
+ try:
+ self.cur= self.con.cursor();
+ if (self.worker_type == "insert"):
+ self.run_inserts()
+ if (self.worker_type == "join_group"):
+ self.run_create_groups()
+ except Exception as e:
+ print(e)
+ self.exception = traceback.format_exc()
+ print("THR %d caught (%s)" % (my_id, e))
+
+ finally:
+ self.finish()
+
+ #
+ # Insert one row, making sure this doesn't break any groups
+ #
+ def run_one_insert(self):
+ global counter_n_inserted
+ global counter_n_deleted
+ global counter_n_insert_failed
+ cur = self.cur
+ #pk = self.rand.randint(1, MAX_PK_VAL)
+ # Note: here the distribution is intentionally 2x wider than the grouping
+ # thread has.
+ pk = int(self.rand.normalvariate(MAX_PK_VAL/2, MAX_PK_VAL/50.))
+
+ cur.execute("begin")
+ do_commit = False
+ cur.execute("select pk,parent_id,group_list from t1 where pk>=%s limit 1 for update", (pk,));
+ row = cur.fetchone()
+ group_to_delete= None
+ if row is None:
+ #print("No row found, inserting %d" % (pk+1000*1000))
+ cur.execute("insert into t1 (pk) values(%s)", (pk+1000*1000,));
+ do_commit = True
+ else:
+ if ((row[0] - pk)>2 and row[1] is None):
+ #print("Row found, inserting into gap, %d" % pk)
+ cur.execute("insert into t1 (pk) values(%s)", (pk,));
+ do_commit = True
+ else:
+ #print("Row found, grouped or too tight")
+ if row[2]:
+ # if parent_id is present, use it
+ group_to_delete = row[0]
+ #print("About to delete %d" % group_to_delete)
+ do_commit = False
+
+ if (do_commit):
+ cur.execute("commit")
+ counter_n_inserted += 1
+ return 1
+ else:
+ counter_n_insert_failed += 1
+ if group_to_delete:
+ counter_n_deleted += 5
+ self.delete_group(group_to_delete, True)
+ cur.execute("commit")
+ else:
+ cur.execute("rollback")
+ return 0
+
+ def run_one_create_group(self):
+ global counter_n_groups_created
+ global counter_n_group_create_fails
+ global counter_n_groups_deleted
+ cur = self.cur
+ #pk = self.rand.randint(1, MAX_PK_VAL)
+ pk = int(self.rand.normalvariate(MAX_PK_VAL/2, MAX_PK_VAL/100))
+
+ n_rows = 0
+ n_groups_deleted= 0
+ first_pk= None
+ cur.execute("select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", (pk,));
+ row = cur.fetchone()
+ while row is not None:
+ if (first_pk is None):
+ first_pk = row[0]
+ group_list= str(first_pk)
+ else:
+ group_list = group_list + "," + str(row[0])
+
+ last_pk = row[0]
+ if row[1] is not None:
+ # Found a row in a group
+ # Continue until group end.
+ found_next_group = False
+ row = cur.fetchone()
+ while row is not None:
+ if row[1] is None:
+ found_next_group = True
+ first_pk = row[0]
+ group_list= str(first_pk)
+ break
+ row= cur.fetchone()
+
+ if not found_next_group:
+ break;
+
+ if row[2] is not None:
+ # Found a group leader row.
+ ungrouped_ids = self.delete_group(row[0], False)
+ n_groups_deleted += 1;
+ i = 1
+ n_rows += 1
+ while (n_rows < 5):
+ group_list = group_list + "," + str(ungrouped_ids[i])
+ last_pk= ungrouped_ids[i];
+ i+= 1
+ n_rows += 1
+ break;
+ n_rows += 1
+ row = cur.fetchone()
+
+ if (n_rows == 5 or n_groups_deleted>0):
+ # Ok we got 5 rows in a row and they are all standalone
+ # Create a group.
+ # print("Creating group %d" % first_pk)
+ cur.execute("update t1 set group_list=%s where pk=%s", (group_list,first_pk,))
+ cur.execute("update t1 set parent_id=%s where pk > %s and pk <=%s",
+ (first_pk,first_pk, last_pk))
+ cur.execute("commit")
+ counter_n_groups_created += 1
+ counter_n_groups_deleted += n_groups_deleted;
+ return 1
+ else:
+ # print("Failed to join a group")
+ counter_n_group_create_fails += 1
+ cur.execute("rollback")
+ return 0
+
+ #
+ # Verify and delete the group
+ # @return An array listing the deleted PKs
+ #
+ def delete_group(self, group_id, delete_rows):
+ global counter_n_groups_verified
+ cur = self.con.cursor();
+ cur.execute("select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", (group_id,));
+ first_pk = None
+ n_rows = 0
+
+ row = cur.fetchone()
+ while row is not None:
+ if (first_pk is None):
+ first_pk = row[0]
+ group_list= str(first_pk)
+ group_arr= [];
+ group_arr.append(first_pk)
+ group_list_base= row[2]
+ if (first_pk != group_id):
+ self.raise_error("First row is not the group leader!");
+ else:
+ group_list = group_list + "," + str(row[0])
+ group_arr.append(row[0])
+
+ last_pk = row[0]
+ if (row[0] != first_pk and row[1] != first_pk):
+ self.raise_error("Row in group has wrong parent_id (expect %s got %s)" % (first_pk, row[1]))
+ break;
+ if (row[0] != first_pk and row[2] is not None):
+ self.raise_error("Row in group is a group leader?")
+ break;
+ n_rows += 1
+ row = cur.fetchone()
+
+ if (n_rows != 5):
+ self.raise_error("Expected %d rows got %d" % (5, n_rows,))
+ if (group_list != group_list_base):
+ self.raise_error("Group contents mismatch: expected '%s' got '%s'" % (group_list_base, group_list))
+ # Ok, everything seems to be in order.
+ if delete_rows:
+ cur.execute("delete from t1 where pk>=%s and pk<=%s", (group_id,last_pk,));
+ else:
+ cur.execute("update t1 set parent_id=NULL, group_list=NULL where pk>=%s and pk<=%s", (group_id,last_pk,));
+
+ counter_n_groups_verified += 1
+ return group_arr
+
+ def raise_error(self, msg):
+ print("Data corruption detected: " + msg)
+ sys.exit("Failed!")
+
+ def run_inserts(self):
+ #print("Worker.run_inserts")
+ i = 0
+ while (i < self.num_inserts):
+ try:
+ i += self.run_one_insert()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ if not is_deadlock_error(e):
+ raise e
+
+ def run_create_groups(self):
+ #print("Worker.run_create_groups")
+ i = 0
+ while (i < self.num_inserts):
+ try:
+ i += self.run_one_create_group()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ if not is_deadlock_error(e):
+ raise e
+
+ def finish(self):
+ self.finished = True
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 10 and len(sys.argv) != 11:
+ print("Usage: range_locking_conc_test.py " \
+ "[--progress] " \
+ "user host port db_name table_name " \
+ "num_inserts num_insert_threads " \
+ "num_grp_ops num_group_threads" \
+ )
+ sys.exit(1)
+ i=1;
+ if sys.argv[i] == "--progress":
+ show_progress = True
+ i+=1
+
+ if sys.argv[i] == "--verbose":
+ verbose_output = True
+ i+=1
+
+ user = sys.argv[i]
+ i+=1
+
+ host = sys.argv[i]
+ i+=1
+
+ port = int(sys.argv[i])
+ i+=1
+
+ db = sys.argv[i]
+ i+=1
+
+ table_name = sys.argv[i]
+ i+=1
+
+ num_inserts = int(sys.argv[i])
+ i+=1
+
+ num_insert_workers = int(sys.argv[i])
+ i+=1
+
+ num_group_ops = int(sys.argv[i])
+ i+=1
+
+ num_group_workers = int(sys.argv[i])
+ i+=1
+
+ con= MySQLdb.connect(user=user, host=host, port=port, db=db)
+ con.cursor().execute("set global rocksdb_lock_wait_timeout=20")
+ con.cursor().execute("drop table if exists t1")
+ con.cursor().execute("create table t1 ( " \
+ " pk bigint primary key, " \
+ " group_list varchar(128), " \
+ " parent_id bigint " \
+ ") engine=rocksdb;")
+
+ worker_failed = False
+ workers = []
+ worker_type = "insert"
+ num_loops = num_inserts
+ for i in range(num_insert_workers + num_group_workers):
+ worker = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ worker_type, num_loops)
+ workers.append(worker)
+ if (i == num_insert_workers - 1):
+ worker_type = "join_group"
+ num_loops = num_group_ops
+
+ # A watcher thread to print the statistics periodically
+ if show_progress:
+ watcher= Watcher(MySQLdb.connect(user=user, host=host, port=port, db=db))
+
+ for w in workers:
+ w.join()
+ if w.exception:
+ print("Worker hit an exception:\n%s\n" % w.exception)
+ worker_failed = True
+
+ # Stop the watcher
+ if show_progress:
+ watcher.should_stop= True;
+ watcher.join()
+
+ if verbose_output:
+ print("\n")
+ print("rows_inserted: %d" % counter_n_inserted)
+ print("rows_deleted: %d" % counter_n_deleted)
+ print("rows_insert_failed: %d" % counter_n_insert_failed)
+ print("groups_created: %d" % counter_n_groups_created)
+ print("groups_verified: %d" % counter_n_groups_verified)
+ print("groups_deleted: %d" % counter_n_groups_deleted)
+ print("group_create_fails: %d" % counter_n_group_create_fails)
+
+ if worker_failed:
+ sys.exit(1)
+
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.test b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test
new file mode 100644
index 00000000000..0b0d4681bdb
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test
@@ -0,0 +1,18 @@
+
+#
+# Test Range Locking behavior. See
+# ./suite/rocksdb/t/range_locking_conc_test.txt for details
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+set @save_rlwt=@@rocksdb_lock_wait_timeout;
+let $SERVER_PORT=`select @@port`;
+let $exec = /usr/bin/python3 suite/rocksdb/t/range_locking_conc_test.py root 127.0.0.1 $SERVER_PORT test t1 2000 20 30000 10;
+
+--echo # Run range_locking_conc_test.py
+exec $exec;
+
+set global rocksdb_lock_wait_timeout= @save_rlwt;
+DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt
new file mode 100644
index 00000000000..f10bfe33925
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt
@@ -0,0 +1,91 @@
+
+
+A concurrent test for Range Locking code
+
+== Requirements ==
+
+The idea is to have a concurrent workload, where
+- Some clients make modifications to the database that use range locking (let's
+ denote these RL-UPDATES)
+
+- There is some concurrent data modification activity, which is likely to
+ make RL-UPDATES corrupt the data if transaction isolation does not perform
+ proper range locking
+
+- There is a way to detect this data corruption. First suggestion: have some
+ invariants that must remain true regardless of any action done by
+ RL-UPDATES. We can run the workload, then stop it and verify the invariants
+ still hold.
+
+== Basic idea ==
+
+Rows and groups.
+
+Consider a table:
+
+create table t1 (
+ pk bigint primary key,
+ group_list varchar(128),
+ parent_id bigint
+) engine=rocksdb;
+
+
+We allow the following to be stored:
+
+1. Individual rows. An individual row has group_list=NULL, parent_id=NULL.
+
+2. Groups.
+
+A group is 5 (potentially could be some other number) of rows with adjacent
+PK values.
+
+The row with the smallest PK value is the "group leader" and has
+group_list=(comma-separated-list-of-pks-of-group members).
+
+The rest of the rows are "followers" and have parent_id=$GROUP_LEADER.pk
+
+Example of a group:
+
+mysql> select * from t1 where pk>=720418192 order by pk limit 5;
++-----------+---------------------------------------------------+-----------+
+| pk | group_list | parent_id |
++-----------+---------------------------------------------------+-----------+
+| 720418192 | 720418192,721972360,730798130,741595383,742883456 | NULL |
+| 721972360 | NULL | 720418192 |
+| 730798130 | NULL | 720418192 |
+| 741595383 | NULL | 720418192 |
+| 742883456 | NULL | 720418192 |
++-----------+---------------------------------------------------+-----------+
+5 rows in set (0.01 sec)
+
+
+Operations:
+- Insert an individual row. It is obvious we may not insert a row into a group.
+
+- Convert 5 individual rows into a group. One needs range locking to prevent
+ other threads from deleting these rows or putting them into another group.
+
+- Disband a group (convert it back into 5 individual rows).
+ When we are disbanding a group, we verify it to be valid.
+
+- Delete a group. If we attempt to insert a row and hit a group leader, we
+ don't insert the row and delete the whole group we've hit, instead. (when
+ deleting the group, we also verify it to be valid)
+ This provides some balance between inserts and deletes.
+
+=== Making sure lock contention happens ===
+
+We use normal distribution (rand.normalvariate) to pick random PK values,
+which are then used to make an attempt to insert a row or create a group.
+
+This creates a much greater contention than the uniform distribution.
+
+With sufficiently small sigma parameter, the contention seems to be
+sufficiently high.
+
+=== Implementation ===
+
+range_locking_conc_test.py implements the above operations.
+
+
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
new file mode 100644
index 00000000000..2a5966b65c3
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
@@ -0,0 +1,196 @@
+--source suite/rocksdb/include/have_range_locking.inc
+
+#
+# This is deadlock_tracking.test, variant for running with Range Locking:
+# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests
+# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print
+# deadlock information.
+#
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+--echo # Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+let $engine = rocksdb;
+
+--source include/count_sessions.inc
+connect (con1,localhost,root,,);
+let $con1= `SELECT CONNECTION_ID()`;
+
+connect (con2,localhost,root,,);
+let $con2= `SELECT CONNECTION_ID()`;
+
+connect (con3,localhost,root,,);
+let $con3= `SELECT CONNECTION_ID()`;
+
+connection default;
+eval create table t (i int primary key) engine=$engine;
+insert into t values (1), (2), (3);
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #1;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #2;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 10;
+
+echo Deadlock #3;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 1;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+connection con3;
+set rocksdb_deadlock_detect_depth = 2;
+
+--echo # Range locking code will report deadlocks, because it doesn't honor
+--echo # rocksdb_deadlock_detect_depth:
+echo Deadlock #4;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 for update;
+
+connection con1;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+send select * from t where i=3 for update;
+
+connection con3;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con2 and waiting_key != "";
+--source include/wait_condition.inc
+
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 for update;
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+rollback;
+
+connection con2;
+reap;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection default;
+set global rocksdb_max_latest_deadlocks = 5;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+--disable_testcase BUG#0000
+echo Deadlock #5;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 lock in share mode;
+
+connection con1;
+select * from t where i=100 for update;
+select * from t where i=101 for update;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+select * from t where i=3 lock in share mode;
+select * from t where i=200 for update;
+select * from t where i=201 for update;
+
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 lock in share mode;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection con3;
+rollback;
+
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--enable_testcase
+echo Deadlock #6;
+connection con1;
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+
+connection con2;
+begin;
+update t1 set value=value+200 where id=3;
+
+connection con1;
+send update t1 set value=value+100 where id=3;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+--error ER_LOCK_DEADLOCK
+update t1 set value=value+200 where id=1;
+
+# con2 tx is automatically rolled back
+connection con1;
+reap;
+select * from t1;
+drop table t1;
+
+connection default;
+
+disconnect con1;
+disconnect con2;
+disconnect con3;
+
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 0;
+--echo # Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--source include/wait_until_count_sessions.inc
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
new file mode 100644
index 00000000000..d0087e2a77b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
@@ -0,0 +1 @@
+--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
new file mode 100644
index 00000000000..5a6e9fa6616
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
@@ -0,0 +1,39 @@
+#
+# Range Locking - Lock Escalation Tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+show variables like 'rocksdb_max_lock_memory';
+show status like 'rocksdb_locktree_escalation_count';
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+#begin;
+#insert into t1 values (1000111,100011);
+#connect (con1,localhost,root,,);
+#connection con1;
+
+insert into t1
+select
+ A.a + B.a*10 + C.a*100 + D.a*1000,
+ 12345
+from t0 A, t0 B, t0 C, t0 D;
+
+select count(*) from t1;
+
+#connection default;
+#disconnect con1;
+show status like 'rocksdb_locktree_escalation_count';
+
+drop table t0,t1;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_partial_index.test b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test
new file mode 100644
index 00000000000..ce49737b2f6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test
@@ -0,0 +1,120 @@
+#
+# Range Locking and Partial Indexes
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--source include/have_debug_sync.inc
+--enable_connect_log
+
+create table t0(a int primary key) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk1 int,
+ pk2 int,
+ a int not null,
+ b int,
+ primary key (pk1, pk2),
+ key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5'
+) engine=rocksdb;
+
+# pk1=1 IS materialized prefix (N=10)
+insert into t1 select
+ 1,
+ A.a,
+ 100 + A.a,
+ 123456
+from t0 A;
+# Run a SELECT so that it is actually materialized:
+select * from t1 force index (key1) where pk1=1;
+
+# pk1=2 IS NOT materialized (N=3)
+insert into t1 select
+ 2,
+ A.a,
+ 100 + A.a,
+ 123456
+from t0 A limit 3;
+
+# and some more rows
+insert into t1 select
+ 10000 + A.a +10 *B.a +100*C.a,
+ A.a,
+ 100 + A.a,
+ 123456
+from t0 A, t0 B, t0 C;
+
+create table t3(pk int primary key);
+
+connect (con2,localhost,root,,);
+connection con2;
+begin;
+insert into t3 values(3333333);
+connection default;
+
+--echo #
+--echo # First, test a query with range lock
+--echo #
+
+--replace_column 10 #
+explain
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+--echo # Allocate a snapshot
+select * from t0 where a=3;
+
+connection default;
+--echo # Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+
+connection con1;
+--echo # This doesn't see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+--echo # This DOES see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update;
+
+let $order_by_rowkey=1;
+let $SECOND_INDEX_NAME=key1;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+--echo #
+--echo # Now, test a query with LockingIterator
+--echo #
+delete from t1 where b=99999;
+
+begin;
+--echo # Allocate a snapshot
+select * from t0 where a=3;
+
+connection default;
+--echo # Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+
+connection con1;
+--echo # This doesn't see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15;
+--echo # This DOES see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update;
+
+let $order_by_rowkey=1;
+let $SECOND_INDEX_NAME=key1;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+disconnect con1;
+connection default;
+
+disconnect con2;
+
+drop table t0, t1,t3;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
new file mode 100644
index 00000000000..9bbb1b9b392
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
@@ -0,0 +1,70 @@
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--source include/have_debug_sync.inc
+
+select @@rocksdb_use_range_locking;
+
+--disable_warnings
+set debug_sync='RESET';
+--enable_warnings
+#
+# Testcase for iterator snapshot refresh
+#
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+# Ok, now the table has these PK ranges:
+# 0..9 40..49 100...1000
+# and all rows have pk=a
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+send
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+
+# The query is now stuck at the start of the second range.
+
+
+## con2>
+connection con2;
+set debug_sync='now WAIT_FOR con1_stopped';
+
+# Make some changes to check if the iterator is reading current data or
+# snapshot
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+set debug_sync='now SIGNAL con1_cont';
+
+connection con1;
+#--error ER_GET_ERRMSG
+reap;
+select * from t1 where pk<100;
+
+commit;
+disconnect con1;
+disconnect con2;
+connection default;
+set debug_sync='RESET';
+
+drop table t1, ten, one_k;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
new file mode 100644
index 00000000000..8b993764235
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
@@ -0,0 +1,12 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--let pk_cf=rev:cf1
+--let PK_USES_REVERSE_CF=1
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
new file mode 100644
index 00000000000..542d76661e2
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
@@ -0,0 +1,308 @@
+#
+# Range Locking : tests for SeekForUpdate feature
+#
+
+--source include/have_rocksdb.inc
+--source include/have_debug_sync.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+show variables like 'rocksdb_use_range_locking';
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+--echo # Make another connection to get the lock tree out of the STO-mode
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Now, we will just see locks on 10=0xA and 11=0xB:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo #
+--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+--echo #
+--replace_column 10 #
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+select * from t1 where pk>=500 order by pk limit 3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+
+begin;
+select * from t1 where pk=11 for update;
+explain
+select * from t1 order by pk limit 3 for update;
+select * from t1 order by pk limit 3 for update;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+
+
+--echo #
+--echo # Concurrent tests: let one thread do SeekForUpdate and the other
+--echo # interfere by committing modifications
+--echo #
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+select * from t1 where pk<10;
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+
+
+--echo # Test what happens when another transaction commits a row
+--echo # right before the range we are about to lock (nothing)
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send select * from t1 where pk >=5 order by pk limit 3 for update;
+
+connect (con1,localhost,root,,);
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+rollback;
+
+delete from t1 where pk=3;
+
+--echo #
+--echo # Now, repeat the test but let the other transaction insert the row into
+--echo # the range we are locking
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+delete from t1 where pk=8;
+
+--echo #
+--echo # Repeat the third time, this time deleting the row that SeekForUpdate saw
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+rollback;
+
+--echo #
+--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+--echo # error. MyRocks code should now be prepared that data reads cause this
+--echo # error
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+--error ER_LOCK_WAIT_TIMEOUT
+reap;
+
+rollback;
+
+connection con1;
+rollback;
+connection default;
+
+--echo #
+--echo # Test the thd_killed check in the iterator
+--echo #
+let $conn_id=`select connection_id()`;
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+--disable_query_log
+eval kill query $conn_id;
+--enable_query_log
+--echo kill query CONN_ID;
+connection default;
+--error ER_GET_ERRMSG
+reap;
+rollback;
+
+--echo #
+--echo # Backward scan test
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+insert into t1 values
+ (1001, 1001),
+ (1005, 1005),
+ (1007, 1007),
+ (1010, 1010);
+
+begin;
+select * from t1 order by pk desc limit 2 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+
+--echo # The below will lock from pk=1007 (0x3ef) till the end of the table:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+
+connection default;
+rollback;
+
+--echo #
+--echo # Backward scan test 2: error condition
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # A test: full table scan doesn't lock gaps
+--echo #
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 values (10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+
+select * from t1 for update;
+
+connection con2;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (5,5);
+
+connection con1;
+rollback;
+
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc
new file mode 100644
index 00000000000..6ab2f31cae5
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc
@@ -0,0 +1,55 @@
+
+
+--source include/have_rocksdb.inc
+--source include/have_debug_sync.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+show variables like 'rocksdb_use_range_locking';
+
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+eval create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$cf'
+) engine=rocksdb;
+
+insert into t1 (pk)
+select
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+delete from t1 where pk<100;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 5 for update;
+
+connection default;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 (pk) values
+(10),(20),(30),(40),(50);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection con1;
+reap;
+--echo # This must return 1, no 5:
+select lock_count from information_schema.rocksdb_trx
+where thread_id=CONNECTION_ID();
+
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+
+--source range_locking_seek_for_update_iter_end.inc
+set global rocksdb_enable_iterate_bounds=off;
+--source range_locking_seek_for_update_iter_end.inc
+set global rocksdb_enable_iterate_bounds=on;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test
new file mode 100644
index 00000000000..703331cab9a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test
@@ -0,0 +1,4 @@
+
+--let cf=rlsfu_test
+--source range_locking_seek_for_update2.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test
new file mode 100644
index 00000000000..0620593b4e7
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test
@@ -0,0 +1,4 @@
+
+--let cf=rev:rlsfu_test
+
+--source range_locking_seek_for_update2.inc
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc
new file mode 100644
index 00000000000..3b0fb6c53b3
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc
@@ -0,0 +1,41 @@
+--echo #
+--echo # A testcase for locking at the end of the scan
+--echo #
+eval create table t1 (
+ pk int,
+ primary key (pk) comment '$cf'
+) engine=rocksdb;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+insert into t1 values (1), (10), (100);
+
+begin;
+select * from t1 for update;
+
+connection default;
+select * from t1;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (150);
+
+connection con1;
+rollback;
+
+begin;
+--replace_column 10 #
+explain
+select * from t1 order by pk desc for update;
+select * from t1 order by pk desc for update;
+
+connection default;
+select * from t1;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (0);
+
+disconnect con1;
+connection default;
+drop table t1;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
new file mode 100644
index 00000000000..c6e4e457897
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
@@ -0,0 +1,202 @@
+#
+# Test for shared lock support for range locking
+#
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+select @@rocksdb_use_range_locking;
+
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+
+insert into t1 select a,a from t0;
+
+--echo # A basic test for shared locks
+
+begin;
+select * from t1 where pk=3 for update;
+select * from t1 where pk=5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+--echo # Now, TRX2_ID should be gone:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+
+--echo # Get a read lock on pk=3 (where we have a write lock).
+--echo # The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo # Get a write lock on pk=5 (where we have a read lock).
+--echo # The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+rollback;
+
+--echo #
+--echo # Test if a read lock inhibits write locks
+--echo #
+
+begin;
+select * from t1 where pk=2 lock in share mode;
+select * from t1 where pk=8 for update;
+
+connection con1;
+begin;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 0 and 4 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where pk=2;
+
+--echo # Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+
+--echo # But this should still prevent us from acquiring a write lock on that value:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+rollback;
+connection default;
+rollback;
+
+drop table t1;
+create table t1 (
+ pk int not null primary key,
+ a int not null,
+ key(a)
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+ t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+select * from t1 where a between 2 and 5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+disconnect con1;
+
+drop table t0,t1;
+
+--echo #
+--echo # Test shared point locks and lock escalation
+--echo #
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+
+show status like 'rocksdb_locktree_current_lock_memory';
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+--echo # CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+select * from t1 where pk=2500 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--echo # DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+--echo # DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+
+--echo # DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+
+connection con1;
+--echo # CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+
+let $order_by_rowkey=1;
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+set @save_mlm= @@rocksdb_max_lock_memory;
+
+--echo # Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+ variable_value
+ from
+ performance_schema.global_status
+ where
+ variable_name='rocksdb_locktree_current_lock_memory');
+
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+connection default;
+rollback;
+
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+
+drop table t0, t1;
+
+
diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test
index c063d8c7ccb..0544214b8c9 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb.test
@@ -2,6 +2,9 @@
--source suite/rocksdb/include/have_write_committed.inc
--source include/count_sessions.inc
+# Does SHOW WARNINGS and SHOW STATUS which change in Range Locking mode
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# RocksDB Storage Engine tests
#
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
index 47818bfdbe1..3aa51b7be80 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
@@ -27,6 +27,10 @@
# In all cases, RR gets snapshot conflict errors if non-first rows get
# deleted by another transaction after scanning.
+# The tests do not work with range locking as it locks the ranges it is about to
+# read, first.
+--source suite/rocksdb/include/not_range_locking.inc
+
--source include/have_rocksdb.inc
--source include/have_debug_sync.inc
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
index ff092773737..8b3975723df 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
@@ -5,6 +5,9 @@
#
--source include/have_debug.inc
+# Range locking requests locks before doing snapshot checking.
+--source suite/rocksdb/include/not_range_locking.inc
+
--enable_connect_log
create table t1 (pk int not null primary key) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
index 6848459c445..2affb66f7af 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
@@ -62,7 +62,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
--source include/sync_slave_sql_with_master.inc
--source include/rpl_connection_slave.inc
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
select * from t1;
--echo
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
index 694594efd70..1273a2b6f70 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
@@ -46,6 +46,8 @@ begin work;
insert into t1 values (9);
insert into t1 values (10);
+--echo # Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
update t1 set a = a + 1 where a = 2;
connection con1;
diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
index 5a78979f048..63b72ce5c5a 100644
--- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
+++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
@@ -3,6 +3,8 @@
--source include/have_debug.inc
--source include/have_debug_sync.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
connection master;
--disable_warnings
drop table if exists t1;
diff --git a/mysql-test/suite/rocksdb/t/select_count_for_update.test b/mysql-test/suite/rocksdb/t/select_count_for_update.test
index 2c6f5d474a1..aa7059dfc7e 100644
--- a/mysql-test/suite/rocksdb/t/select_count_for_update.test
+++ b/mysql-test/suite/rocksdb/t/select_count_for_update.test
@@ -52,9 +52,23 @@ SET lock_wait_timeout = 1;
SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
# ... but not with LOCK IN SHARE MODE / FOR UPDATE
+let $uses_range_locking=`select @@rocksdb_use_range_locking`;
+
+if ($uses_range_locking == "0") {
+--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/
+}
+if ($uses_range_locking == "1") {
+--replace_regex /test.t1.sk/$FAILING_INDEX/
+}
--error ER_LOCK_WAIT_TIMEOUT
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE;
+if ($uses_range_locking == "0") {
+--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/
+}
+if ($uses_range_locking == "1") {
+--replace_regex /test.t1.sk/$FAILING_INDEX/
+}
--error ER_LOCK_WAIT_TIMEOUT
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE;
diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
index 23ce6d45234..cf9d53ff88a 100644
--- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
+++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range locking only supports exclusive locks currently.
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# SELECT .. LOCK IN SHARE MODE
#
diff --git a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
index bfa36714816..3b8bcb033c0 100644
--- a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
+++ b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
@@ -2,5 +2,8 @@
# wl#8919 Implement NOWAIT and SKIP LOCKED
#
+# Range locking cannot support SKIP LOCKED? (TODO: but can support NOWAIT)
+--source suite/rocksdb/include/not_range_locking.inc
+
--let $engine=ROCKSDB
--source include/skip_locked_nowait.inc
diff --git a/mysql-test/suite/rocksdb/t/trx_info.test b/mysql-test/suite/rocksdb/t/trx_info.test
index 975bed6132c..009c0ce67d4 100644
--- a/mysql-test/suite/rocksdb/t/trx_info.test
+++ b/mysql-test/suite/rocksdb/t/trx_info.test
@@ -11,7 +11,11 @@ insert into t1 values (2);
set autocommit=0;
select * from t1 for update;
---replace_column 1 _TRX_ID_ 3 _NAME_ 7 _KEY_ 14 _THREAD_ID_
+--replace_column 1 _TRX_ID_ 3 _NAME_ 5 2_or_3 7 _KEY_ 14 _THREAD_ID_
select * from information_schema.rocksdb_trx;
+select
+ if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT
+from information_schema.rocksdb_trx;
+
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test
index 47ca74d0e5e..9814d89448d 100644
--- a/mysql-test/suite/rocksdb/t/unique_check.test
+++ b/mysql-test/suite/rocksdb/t/unique_check.test
@@ -2,6 +2,11 @@
--source include/have_debug_sync.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because lock tree waits do not set
+# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for
+# details.
+--source suite/rocksdb/include/not_range_locking.inc
+
# For GitHub issue#167 -- Unique key check doesn't work
connect (con1, localhost, root,,);
diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc
index ce0bb1e39a9..508816e6ace 100644
--- a/mysql-test/suite/rocksdb/t/unique_sec.inc
+++ b/mysql-test/suite/rocksdb/t/unique_sec.inc
@@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38;
UPDATE t1 SET id5=34 WHERE id1=38;
--echo # NULL values are unique
+--echo # (Note: the following UPDATE reads through the whole table without
+--echo # finding anything to update. With point locking, this is fine,
+--echo # but with range locking it will time out while waiting on a row lock
+--echo # that the other transaction is holding)
+if (`select @@rocksdb_use_range_locking=0`) {
UPDATE t1 SET id5=NULL WHERE value1 > 37;
-
+}
+if (`select @@rocksdb_use_range_locking=1`) {
+-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37;
+}
connection con1;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test
index fbebfeac85a..0d8a35a1321 100644
--- a/mysql-test/suite/rocksdb/t/varbinary_format.test
+++ b/mysql-test/suite/rocksdb/t/varbinary_format.test
@@ -1,6 +1,10 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+# The test uses SELECT .. FOR UPDATE and examines which locks it acquires
+# Range Locking will use different locks from point locking
+--source suite/rocksdb/include/not_range_locking.inc
+
# Create a table with a varbinary key with the current format and validate
# that it sorts correctly
CREATE TABLE t1(
diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test
index 3ea1a1a60b3..985b2c0c8e7 100644
--- a/mysql-test/suite/rocksdb/t/varchar_format.test
+++ b/mysql-test/suite/rocksdb/t/varchar_format.test
@@ -1,6 +1,8 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
####################
# Create a table with a varchar key with the current format and validate
# that it sorts correctly
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/rocksdb b/rocksdb
index fdb3125547b..c9042db6196 160000
--- a/rocksdb
+++ b/rocksdb
@@ -1 +1 @@
-Subproject commit fdb3125547b12883fedc31ef111ffb33d8934d26
+Subproject commit c9042db61962babbe0f2697a157feeeb3b8fb746
diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt
index 714f29cfa92..3e111ae2c52 100644
--- a/storage/rocksdb/CMakeLists.txt
+++ b/storage/rocksdb/CMakeLists.txt
@@ -135,6 +135,7 @@ SET(ROCKSDB_SOURCES
logger.h
rdb_datadic.cc rdb_datadic.h
rdb_iterator.cc rdb_iterator.h
+ rdb_locking_iter.cc rdb_locking_iter.h
rdb_cf_options.cc rdb_cf_options.h
rdb_cf_manager.cc rdb_cf_manager.h
rdb_converter.cc rdb_converter.h
diff --git a/storage/rocksdb/get_rocksdb_files.sh b/storage/rocksdb/get_rocksdb_files.sh
index e0fa66b54fa..07fee252a1e 100755
--- a/storage/rocksdb/get_rocksdb_files.sh
+++ b/storage/rocksdb/get_rocksdb_files.sh
@@ -4,7 +4,7 @@ MKFILE=`mktemp`
# include rocksdb make file relative to the path of this script
echo "include rocksdb/src.mk
all:
- @echo \$(LIB_SOURCES)" > $MKFILE
+ @echo \$(LIB_SOURCES) \$(RANGE_TREE_SOURCES)" > $MKFILE
for f in `make --makefile $MKFILE`
do
echo ../../rocksdb/$f
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 37b5b414e6e..2b48cc04dd0 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -104,6 +104,9 @@
#include "./ObjectFactory.h"
#endif
+#include "./rdb_locking_iter.h"
+
+
// Internal MySQL APIs not exposed in any header.
extern "C" {
/**
@@ -667,6 +670,10 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var,
static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var,
void *var_ptr, const void *save);
+static void rocksdb_set_max_lock_memory(THD *thd,
+ struct SYS_VAR *var,
+ void *var_ptr, const void *save);
+
static void rdb_set_collation_exception_list(const char *exception_list);
static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var,
void *var_ptr,
@@ -822,6 +829,16 @@ static bool rocksdb_alter_column_default_inplace = false;
static bool rocksdb_instant_ddl = false;
bool rocksdb_enable_tmp_table = false;
+// Range Locking: how much memory can be used for the lock data structure
+// (which holds the locks acquired by all clients).
+static ulonglong rocksdb_max_lock_memory;
+
+bool rocksdb_use_range_locking = 0;
+static bool rocksdb_use_range_lock_manager_as_point = 0;
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr_used_as_point;
+
std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
@@ -1522,6 +1539,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
nullptr, rocksdb_set_max_latest_deadlocks,
rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
+static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, rocksdb_max_lock_memory,
+ PLUGIN_VAR_RQCMDARG,
+ "Range-locking mode: Maximum amount of memory "
+ "that locks from all transactions can use at a time",
+ nullptr, rocksdb_set_max_lock_memory,
+ /*initial*/1073741824, 0, UINT64_MAX, 0);
+
static MYSQL_SYSVAR_ENUM(
info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
"Filter level for info logs to be written mysqld error log. "
@@ -2371,6 +2395,19 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan,
rocksdb_update_table_stats_use_table_scan,
rocksdb_table_stats_use_table_scan);
+static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Locking",
+ nullptr, nullptr,
+ rocksdb_use_range_locking);
+
+static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point,
+ rocksdb_use_range_lock_manager_as_point,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Lock Manager as point",
+ nullptr, nullptr,
+ rocksdb_use_range_lock_manager_as_point);
+
static MYSQL_SYSVAR_BOOL(
large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
"Support large index prefix length of 3072 bytes. If off, the maximum "
@@ -2694,7 +2731,9 @@ static struct SYS_VAR *rocksdb_system_variables[] = {
MYSQL_SYSVAR(manual_compaction_threads),
MYSQL_SYSVAR(manual_compaction_bottommost_level),
MYSQL_SYSVAR(rollback_on_timeout),
-
+ MYSQL_SYSVAR(use_range_locking),
+ MYSQL_SYSVAR(use_range_lock_manager_as_point),
+ MYSQL_SYSVAR(max_lock_memory),
MYSQL_SYSVAR(enable_insert_with_update_caching),
MYSQL_SYSVAR(trace_block_cache_access),
MYSQL_SYSVAR(trace_queries),
@@ -2920,6 +2959,7 @@ class Rdb_transaction {
ulonglong m_update_count = 0;
ulonglong m_delete_count = 0;
// per row data
+ // (with range locking, each locked range is counted here, too)
ulonglong m_row_lock_count = 0;
std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
@@ -2993,8 +3033,41 @@ class Rdb_transaction {
virtual rocksdb::Status do_pop_savepoint() = 0;
virtual void do_rollback_to_savepoint() = 0;
+ private:
+ /*
+ If true, the current statement should not use a snapshot for reading.
+ Note that in a multi-statement transaction, the snapshot may have been
+ allocated by another statement.
+ */
+ bool m_stmt_ignores_snapshot = false;
+
+ /* Snapshot-ignore mode will put away m_reads_opts.snapshot here: */
+ const rocksdb::Snapshot *m_saved_snapshot= nullptr;
+
public:
+
+ void start_ignore_snapshot() {
+ // note: this may be called several times for the same statement
+ if (!m_stmt_ignores_snapshot) {
+ m_saved_snapshot = m_read_opts.snapshot;
+ m_read_opts.snapshot = nullptr;
+ m_stmt_ignores_snapshot= true;
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+ }
+
+ void end_ignore_snapshot_if_needed() {
+ if (m_stmt_ignores_snapshot) {
+ m_stmt_ignores_snapshot = false;
+ m_read_opts.snapshot = m_saved_snapshot;
+ m_saved_snapshot = nullptr;
+ }
+ }
+ bool in_snapshot_ignore_mode() const { return m_stmt_ignores_snapshot; }
+
rocksdb::ReadOptions m_read_opts;
+
const char *m_mysql_log_file_name;
my_off_t m_mysql_log_offset;
const char *m_mysql_gtid;
@@ -3175,6 +3248,19 @@ class Rdb_transaction {
virtual void release_lock(const Rdb_key_def &key_descr,
const std::string &rowkey, bool force = false) = 0;
+ virtual
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start,
+ const rocksdb::Endpoint &end) = 0;
+
+ rocksdb::Status lock_singlepoint_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Slice &point) {
+ // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs.
+ // But here we are locking just one point so this is not necessary.
+ rocksdb::Endpoint endp(point, false);
+ return lock_range(cf, endp, endp);
+ }
+
virtual bool prepare() = 0;
bool commit_or_rollback() {
@@ -3233,10 +3319,17 @@ class Rdb_transaction {
m_is_delayed_snapshot = false;
}
+ void locking_iter_created() {
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+
virtual void acquire_snapshot(bool acquire_now) = 0;
virtual void release_snapshot() = 0;
- bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
+ bool has_snapshot() const {
+ return m_read_opts.snapshot != nullptr || m_saved_snapshot;
+ }
private:
// The Rdb_sst_info structures we are currently loading. In a partitioned
@@ -3886,7 +3979,9 @@ class Rdb_transaction {
virtual rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *column_family) = 0;
+ rocksdb::ColumnFamilyHandle *column_family,
+ const std::shared_ptr<Rdb_key_def>&,
+ bool use_locking_iterator=false) = 0;
virtual void multi_get(rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -3895,10 +3990,13 @@ class Rdb_transaction {
const bool sorted_input) const = 0;
rocksdb::Iterator *get_iterator(
- rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
+ rocksdb::ColumnFamilyHandle *const column_family,
+ const std::shared_ptr<Rdb_key_def>& kd,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
- bool create_snapshot = true) {
+ bool create_snapshot = true,
+ bool use_locking_iterator=false) {
// Make sure we are not doing both read_current (which implies we don't
// want a snapshot) and create_snapshot which makes sure we create
// a snapshot
@@ -3928,12 +4026,13 @@ class Rdb_transaction {
if (read_current) {
options.snapshot = nullptr;
}
- return get_iterator(options, column_family);
+ return get_iterator(options, column_family, kd, use_locking_iterator);
}
virtual bool is_tx_started() const = 0;
virtual void start_tx() = 0;
- virtual void start_stmt() = 0;
+ virtual void start_stmt(bool is_dml_statement) = 0;
+ virtual void start_autocommit_stmt(bool /*is_dml_statement*/){}
virtual void set_name() = 0;
protected:
@@ -4131,6 +4230,13 @@ class Rdb_transaction_impl : public Rdb_transaction {
virtual bool is_writebatch_trx() const override { return false; }
+ // Lock the range between two specified endpoints
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start_endp,
+ const rocksdb::Endpoint &end_endp) override {
+ ++m_row_lock_count;
+ return m_rocksdb_tx->GetRangeLock(cf, start_endp, end_endp);
+ }
private:
void release_tx(void) {
// We are done with the current active transaction object. Preserve it
@@ -4237,7 +4343,7 @@ class Rdb_transaction_impl : public Rdb_transaction {
}
void acquire_snapshot(bool acquire_now) override {
- if (m_read_opts.snapshot == nullptr) {
+ if (m_read_opts.snapshot == nullptr && !in_snapshot_ignore_mode()) {
const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
m_thd->get_explicit_snapshot());
if (thd_ss) {
@@ -4393,9 +4499,17 @@ class Rdb_transaction_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const column_family) override {
+ rocksdb::ColumnFamilyHandle *const column_family,
+ const std::shared_ptr<Rdb_key_def>& kd,
+ bool use_locking_iterator) override {
global_stats.queries[QUERIES_RANGE].inc();
- return m_rocksdb_tx->GetIterator(options, column_family);
+ if (use_locking_iterator) {
+ locking_iter_created();
+ return GetLockingIterator(m_rocksdb_tx, options, column_family,
+ kd, &m_row_lock_count);
+ }
+ else
+ return m_rocksdb_tx->GetIterator(options, column_family);
}
const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
@@ -4469,17 +4583,35 @@ class Rdb_transaction_impl : public Rdb_transaction {
/*
Start a statement inside a multi-statement transaction.
- @todo: are we sure this is called once (and not several times) per
- statement start?
+ @note: If a statement uses N tables, this function will be called N times,
+ for each TABLE object that is used.
For hooking to start of statement that is its own transaction, see
ha_rocksdb::external_lock().
*/
- void start_stmt() override {
+ void start_stmt(bool is_dml_statement) override {
+
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ /*
+ In Range Locking mode, RocksDB does not do "key tracking".
+ Use InnoDB-like concurrency mode: make the DML statements always read
+ the latest data (instead of using transaction's snapshot).
+ This "downgrades" the transaction isolation to READ-COMMITTED on the
+ primary, but in return the actions can be replayed on the replica.
+ */
+ start_ignore_snapshot();
+ }
+
// Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
acquire_snapshot(false);
}
+ void start_autocommit_stmt(bool is_dml_statement) override {
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ start_ignore_snapshot();
+ }
+ }
+
/*
This must be called when last statement is rolled back, but the transaction
continues
@@ -4608,6 +4740,12 @@ class Rdb_writebatch_impl : public Rdb_transaction {
// Nothing to do here since we don't hold any row locks.
}
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const,
+ const rocksdb::Endpoint&,
+ const rocksdb::Endpoint&) override {
+ return rocksdb::Status::OK();
+ }
+
void rollback() override {
on_rollback();
m_write_count = 0;
@@ -4707,7 +4845,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const /* column_family */) override {
+ rocksdb::ColumnFamilyHandle *const /* column_family */,
+ const std::shared_ptr<Rdb_key_def>&,
+ bool /*use_locking_iterator*/) override {
const auto it = rdb->NewIterator(options);
return m_batch->NewIteratorWithBase(it);
}
@@ -4725,9 +4865,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
set_initial_savepoint();
}
+ void start_stmt(bool /*is_dml_statement*/) override {}
void set_name() override {}
- void start_stmt() override {}
void rollback_stmt() override {
if (m_batch) rollback_to_stmt_savepoint();
@@ -5051,6 +5191,7 @@ static int rocksdb_prepare(handlerton *const hton MY_ATTRIBUTE((__unused__)),
DEBUG_SYNC(thd, "rocksdb.prepared");
} else {
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
return HA_EXIT_SUCCESS;
@@ -5278,6 +5419,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- For a COMMIT statement that finishes a multi-statement transaction
- For a statement that has its own transaction
*/
+ tx->end_ignore_snapshot_if_needed();
if (tx->commit()) {
DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
}
@@ -5287,6 +5429,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
*/
tx->set_tx_failed(false);
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
@@ -5324,6 +5467,7 @@ static int rocksdb_rollback(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- a statement inside a transaction is rolled back
*/
+ tx->end_ignore_snapshot_if_needed();
tx->rollback_stmt();
tx->set_tx_failed(true);
}
@@ -5428,8 +5572,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
"=========================================\n";
}
+ template<class PathStruct>
static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
- const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
+ const PathStruct &txn, const GL_INDEX_ID &gl_index_id) {
Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
txn_data.trx_id = txn.m_txn_id;
@@ -5454,23 +5599,49 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
? cfh->GetName()
: "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id);
- txn_data.waiting_key =
- rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
+ txn_data.waiting_key = format_wait_key(txn);
txn_data.exclusive_lock = txn.m_exclusive;
return txn_data;
}
+ // Get the key to use to find the index number (and then, index name)
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::DeadlockInfo& info) {
+ return info.m_waiting_key;
+ }
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::RangeDeadlockInfo& info) {
+ // Range locks do not span across indexes, so take the left bound
+ return info.m_start.slice;
+ }
+
+ // Print the locked key (or range) in hex
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static std::string format_wait_key(const rocksdb::DeadlockInfo& info) {
+ return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length());
+ }
+ static std::string format_wait_key(const rocksdb::RangeDeadlockInfo& info) {
+ return rdb_hexdump_range(info.m_start, info.m_end);
+ }
+
+ // Get deadlock path info. A templated function so one can use it with both
+ // point and range locking.
+ template<class PathStruct>
static Rdb_deadlock_info get_dl_path_trx_info(
- const rocksdb::DeadlockPath &path_entry) {
+ const PathStruct &path_entry) {
Rdb_deadlock_info deadlock_info;
for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
const auto &txn = *it;
+ auto waiting_key = get_key_for_indexnr(txn);
const GL_INDEX_ID gl_index_id = {
txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
- txn.m_waiting_key.c_str()))};
+ waiting_key.c_str()))};
deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
}
DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
@@ -5495,7 +5666,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
/* Calculate the duration the snapshot has existed */
int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
- if (snapshot_timestamp != 0) {
+ if (snapshot_timestamp != 0 && tx->has_snapshot()) {
int64_t curr_time;
rdb->GetEnv()->GetCurrentTime(&curr_time);
@@ -5514,8 +5685,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
- void populate_deadlock_buffer() {
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ template<class PathStruct>
+ void populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) {
m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
for (const auto &path_entry : dlock_buffer) {
@@ -5555,12 +5726,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
+ void populate_deadlock_buffer() {
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ }
+ }
+
std::vector<Rdb_deadlock_info> get_deadlock_info() {
std::vector<Rdb_deadlock_info> deadlock_info;
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
- for (const auto &path_entry : dlock_buffer) {
- if (!path_entry.limit_exceeded) {
- deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
+ }
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
}
}
return deadlock_info;
@@ -5976,9 +6167,13 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */,
return ret_val;
}
+/*
+ @param is_dml_statement If true, this is a DML statement
+*/
static inline void rocksdb_register_tx(
handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd,
- Rdb_transaction *const tx) {
+ Rdb_transaction *const tx,
+ bool is_dml_stmt) {
DBUG_ASSERT(tx != nullptr);
trans_register_ha(thd, false, rocksdb_hton, NULL);
@@ -5993,8 +6188,10 @@ static inline void rocksdb_register_tx(
}
}
if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
- tx->start_stmt();
+ tx->start_stmt(is_dml_stmt);
trans_register_ha(thd, true, rocksdb_hton, NULL);
+ } else {
+ tx->start_autocommit_stmt(is_dml_stmt);
}
}
@@ -6079,7 +6276,7 @@ static int rocksdb_start_tx_and_assign_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
return HA_EXIT_SUCCESS;
@@ -6136,7 +6333,7 @@ static int rocksdb_start_tx_with_shared_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
// case: an explicit snapshot was not assigned to this transaction
@@ -6815,6 +7012,25 @@ static int rocksdb_init_internal(void *const p) {
tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
tx_db_options.write_policy =
static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
+
+ if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) {
+ //rdb_log_status_error(
+ // status, "Can't have both range_locking and range_lock_manager_as_point");
+ //DBUG_RETURN(HA_EXIT_FAILURE);
+ rocksdb_use_range_lock_manager_as_point= 0;
+ }
+
+
+ if (rocksdb_use_range_locking) {
+ range_lock_mgr.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr;
+ }
+ if (rocksdb_use_range_lock_manager_as_point) {
+ range_lock_mgr_used_as_point.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr_used_as_point;
+ }
status =
check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
@@ -6849,6 +7065,15 @@ static int rocksdb_init_internal(void *const p) {
DBUG_RETURN(HA_EXIT_FAILURE);
}
+ if (range_lock_mgr)
+ {
+ range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory);
+ sql_print_information("RocksDB: USING RANGE LOCKING");
+ sql_print_information("RocksDB: Max lock memory=%llu", rocksdb_max_lock_memory);
+ }
+ else
+ sql_print_information("RocksDB: USING POINT LOCKING");
+
// NO_LINT_DEBUG
sql_print_information("RocksDB:Init column families...");
if (st_rdb_exec_time.exec("cf_manager::init", [&]() {
@@ -7580,6 +7805,7 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
m_sk_packed_tuple_old(nullptr),
m_pack_buffer(nullptr),
m_lock_rows(RDB_LOCK_NONE),
+ m_use_range_locking(false),
m_keyread_only(false),
m_iteration_only(false),
m_insert_with_update(false),
@@ -7902,6 +8128,7 @@ int ha_rocksdb::open(const char *const name,
}
m_lock_rows = RDB_LOCK_NONE;
+ m_use_range_locking = false;
m_locked_row_action = THR_WAIT;
m_key_descr_arr = m_tbl_def->m_key_descr_arr;
@@ -9453,6 +9680,15 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
}
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
+
+ bool use_locking_iter= false;
+
+ if ((rc = set_range_lock(tx, kd, find_flag, slice, end_range,
+ &use_locking_iter)))
+ DBUG_RETURN(rc);
+ if (use_locking_iter)
+ m_iterator->set_use_locking();
+
const bool is_new_snapshot = !tx->has_snapshot();
// Loop as long as we get a deadlock error AND we end up creating the
@@ -9501,6 +9737,253 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
DBUG_RETURN(rc);
}
+
+/*
+ @brief
+ Compute the range lock endpoints and set the range lock, if necessary
+
+ @param use_locking_iter OUT If true, locks are not set and LockingIterator
+ should be used instead
+
+ @detail
+ If the scanned range doesn't have the endpoint we're scanning towards,
+ don't set the lock, it will be too coarse. Indicate that LockingIterator
+ should be used, instead.
+
+ == RangeFlagsShouldBeFlippedForRevCF ==
+ When using reverse column families, the value of Endpoint::inf_suffix has
+ the reverse meaning.
+
+ Let's consider a forward-ordered CF and some keys and endpoints in it:
+
+ key=a, inf_suffix=false
+ key=ab
+ key=az
+ key=a, inf_suffix=true
+
+ Now, let's put the same data and endpoints into a reverse-ordered CF. The
+ physical order of the data will be the reverse of the above:
+
+ key=a, inf_suffix=true
+ key=az
+ key=ab
+ key=a, inf_suffix=false
+
+ Note that inf_suffix=false comes *before* any values with the same prefix.
+ And inf_suffix=true comes *after* all values with the same prefix.
+
+ The Endpoint comparison function in RocksDB doesn't "know" if the CF is
+ reverse-ordered or not. It uses the Key Comparator for key values, and
+ then it assumes that Endpoint(key=$VAL, inf_suffix=false) comes before
+ the row with key=$VAL.
+
+ The only way to achieve the required ordering is to flip the endpoint
+ flag value before passing class Endpoint to RocksDB. This function does
+ it before the lock_range() call.
+
+ @return
+ 0 Ok
+ Other Error acquiring the lock (wait timeout, deadlock, etc)
+*/
+
+int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice_arg,
+ const key_range *const end_key,
+ bool *use_locking_iterator)
+{
+ rocksdb::Slice end_slice;
+ uchar end_slice_buf[MAX_KEY_LENGTH];
+ bool start_has_inf_suffix = false, end_has_inf_suffix = false;
+ rocksdb::Slice slice(slice_arg);
+ *use_locking_iterator= false;
+
+ if (!m_use_range_locking) {
+ return 0;
+ }
+ bool big_range= false;
+
+ /*
+ The 'slice' has the left endpoint of the range to lock.
+ Figure out the right endpoint.
+ */
+
+ if (find_flag == HA_READ_KEY_EXACT ||
+ find_flag == HA_READ_PREFIX_LAST) {
+ if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) {
+ // This is a full table/index scan
+ // (in case of HA_READ_PREFIX_LAST, a reverse-ordered one)
+ start_has_inf_suffix= false;
+ big_range = true;
+ } else {
+ /*
+ HA_READ_KEY_EXACT:
+ This is "key_part= const" interval. We need to lock this range:
+ (lookup_value, -inf) < key < (lookup_value, +inf)
+ HA_READ_PREFIX_LAST:
+ We get here for queries like:
+
+ select * from t1 where pk1=const order by pk1 desc for update
+
+ assuming this uses an index on (pk1, ...).
+ We get end_key=nullptr.
+ */
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ end_slice= slice;
+ }
+ } else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) {
+ /*
+ We get here for queries like:
+
+ select * from t1 where pk1=const1 and pk2 between const2 and const3
+ order by pk1 desc
+ for update
+
+ assuming this uses an index on (pk1, pk2).
+ The slice has the right endpoint: {const1, const3}
+ the end_key has the left endpoint: {const1, const2}.
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ // Pack the left endpoint and make "slice" point to it
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ } else if (find_flag == HA_READ_BEFORE_KEY) {
+ /*
+ We get here for queries like
+ select * from t1
+ where pk <1007 order by pk desc limit 2 for update
+ select * from t1
+ where pk >=800 and pk <1007 order by pk desc limit 2 for update
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ if (end_key) {
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+
+ // end_has_inf_suffix is false, because we're looking key<const
+ } else {
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+
+ big_range= true;
+ }
+ } else if (end_key) {
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else {
+ // Unknown type of range, shouldn't happen
+ DBUG_ASSERT(0);
+ big_range = true;
+ }
+
+ // Known end range bounds: HA_READ_AFTER_KEY, HA_READ_BEFORE_KEY
+ if (end_key->flag == HA_READ_AFTER_KEY) {
+ // this is "key_part <= const".
+ end_has_inf_suffix= true;
+ } else if (end_key->flag == HA_READ_BEFORE_KEY) {
+ // this is "key_part < const", non-inclusive.
+ end_has_inf_suffix= false;
+ } else {
+ // Unknown type of range, shouldn't happen
+ DBUG_ASSERT(0);
+ big_range = true;
+ }
+
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size= kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key,
+ end_key->keypart_map);
+
+ end_slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ } else {
+ big_range= true;
+#if 0
+ // The below is code to handle this without LockingIterator:
+ // No end key
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else
+ DBUG_ASSERT(0);
+
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+ end_has_inf_suffix= true;
+#endif
+ }
+
+ if (big_range) {
+ *use_locking_iterator= true;
+ return 0;
+ }
+
+ rocksdb::Endpoint start_endp;
+ rocksdb::Endpoint end_endp;
+
+ if (kd.m_is_reverse_cf) {
+ // Flip the endpoint flag values, as explained in the
+ // RangeFlagsShouldBeFlippedForRevCF comment above.
+ start_endp =rocksdb::Endpoint(end_slice, !end_has_inf_suffix);
+ end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix);
+ } else {
+ start_endp= rocksdb::Endpoint(slice, start_has_inf_suffix);
+ end_endp= rocksdb::Endpoint(end_slice, end_has_inf_suffix);
+ }
+
+ /*
+ RocksDB's iterator is reading the snapshot of the data that was taken at
+ the time the iterator was created.
+
+ After we've got a lock on the range, we'll need to refresh the iterator
+ to read the latest contents. (If we use the iterator created before the
+ lock_range() call, we may miss the changes that were made/committed after
+ the iterator was created but before the lock_range() call was made).
+
+ RocksDB has Iterator::Refresh() method, but alas, it is not implemented for
+ the iterator returned by Transaction object (Transaction object returns
+ BaseDeltaIterator which allows one to see the transactions's own changes).
+
+ Our solution to this is to release the iterator and create the new one.
+ We release it here, it will be created as soon as there's a need to read
+ records.
+ */
+ m_iterator->reset();
+
+ auto s= tx->lock_range(kd.get_cf(), start_endp, end_endp);
+ if (!s.ok()) {
+ return (tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ return 0;
+}
+
/*
See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
index navigation commands are converted into RocksDB lookup commands.
@@ -10015,7 +10498,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
}
}
- if (rc) {
+ if (rc != HA_EXIT_SUCCESS) {
break;
}
@@ -10026,7 +10509,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
table->m_status = 0;
rc = 0;
} else if (active_index == table->s->primary_key) {
- if (m_lock_rows != RDB_LOCK_NONE) {
+ if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) {
DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
/* We need to put a lock and re-read */
bool skip_row = false;
@@ -10222,7 +10705,9 @@ int ha_rocksdb::index_read_intern(uchar *const buf, bool first) {
void ha_rocksdb::unlock_row() {
DBUG_ENTER_FUNC();
- if (m_lock_rows != RDB_LOCK_NONE) {
+  // Don't release the lock when using range locking:
+  // releasing it here would break m_row_lock_count bookkeeping.
+ if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) {
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
tx->release_lock(*m_pk_descr,
std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
@@ -10694,6 +11179,8 @@ int ha_rocksdb::check_and_lock_sk(
lock for this key.
*/
if (!(key_info->flags & HA_NOSAME)) {
+ if (rocksdb_use_range_locking)
+ return check_and_lock_non_unique_sk(key_id, row_info);
return HA_EXIT_SUCCESS;
}
@@ -10810,6 +11297,57 @@ int ha_rocksdb::check_and_lock_sk(
return rc;
}
+
+/**
+ @brief
+ Lock the non-unique sk for range locking
+*/
+int ha_rocksdb::check_and_lock_non_unique_sk(
+ const uint key_id, const struct update_row_info &row_info) {
+
+ DBUG_ASSERT(rocksdb_use_range_locking);
+ const Rdb_key_def &kd = *m_key_descr_arr[key_id];
+ bool store_row_debug_checksums = should_store_row_debug_checksums();
+
+ if (row_info.old_data != nullptr) {
+ rocksdb::Slice old_key_slice;
+ int old_packed_size;
+
+ old_packed_size = kd.pack_record(
+ table, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
+ &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
+ nullptr, m_ttl_bytes);
+
+ old_key_slice = rocksdb::Slice(
+ reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
+
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+ }
+
+ int new_packed_size;
+ rocksdb::Slice new_key_slice;
+ rocksdb::Slice new_value_slice;
+ new_packed_size =
+ kd.pack_record(table, m_pack_buffer, row_info.new_data,
+ m_sk_packed_tuple, &m_sk_tails, 0,
+ row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);
+ new_key_slice = rocksdb::Slice(
+ reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
+
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+
+ return HA_EXIT_SUCCESS;
+}
+
+
/**
Enumerate all keys to check their uniquess and also lock it
@@ -11606,6 +12144,19 @@ int ha_rocksdb::delete_row(const uchar *const buf) {
nullptr, false, hidden_pk_id);
rocksdb::Slice secondary_key_slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
+
+ /*
+    For point locking, deleting on a secondary key doesn't need any locks.
+ Range locking must get a lock.
+ */
+ if (rocksdb_use_range_locking) {
+ auto s= tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice);
+ if (!s.ok()) {
+ DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ }
+
tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
secondary_key_slice);
bytes_written += secondary_key_slice.size();
@@ -12189,8 +12740,15 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
}
}
tx->m_n_mysql_tables_in_use++;
- rocksdb_register_tx(rocksdb_hton, thd, tx);
+ rocksdb_register_tx(rocksdb_hton, thd, tx, (m_lock_rows != RDB_LOCK_NONE));
tx->io_perf_start(&m_io_perf);
+
+ m_use_range_locking= false;
+ if (rocksdb_use_range_locking &&
+ m_lock_rows != RDB_LOCK_NONE &&
+ my_core::thd_tx_isolation(thd) >= ISO_REPEATABLE_READ) {
+ m_use_range_locking= true;
+ }
}
DBUG_RETURN(res);
@@ -12217,7 +12775,7 @@ int ha_rocksdb::start_stmt(THD *const thd,
Rdb_transaction *const tx = get_or_create_tx(thd);
read_thd_vars(thd);
- rocksdb_register_tx(ht, thd, tx);
+ rocksdb_register_tx(ht, thd, tx, (m_lock_rows != RDB_LOCK_NONE));
tx->io_perf_start(&m_io_perf);
DBUG_RETURN(HA_EXIT_SUCCESS);
@@ -14713,6 +15271,36 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)),
return 0;
}
+//
+// Lock Tree Status variables
+//
+static longlong rocksdb_locktree_escalation_count=1234;
+static longlong rocksdb_locktree_current_lock_memory=0;
+
+static SHOW_VAR rocksdb_locktree_status_variables[] = {
+ DEF_STATUS_VAR_FUNC("escalation_count",
+ &rocksdb_locktree_escalation_count, SHOW_LONGLONG),
+ DEF_STATUS_VAR_FUNC("current_lock_memory",
+ &rocksdb_locktree_current_lock_memory, SHOW_LONGLONG),
+ // end of the array marker
+ {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
+
+static SHOW_VAR rocksdb_empty_status_variables[] = {
+ {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
+
+static void show_rocksdb_locktree_vars(THD*, SHOW_VAR *var, char*) {
+ var->type = SHOW_ARRAY;
+ if (range_lock_mgr)
+ {
+ auto status = range_lock_mgr->GetStatus();
+ rocksdb_locktree_escalation_count = status.escalation_count;
+ rocksdb_locktree_current_lock_memory = status.current_lock_memory;
+ var->value = reinterpret_cast<char *>(&rocksdb_locktree_status_variables);
+ }
+ else
+ var->value = reinterpret_cast<char *>(&rocksdb_empty_status_variables);
+}
+
static SHOW_VAR rocksdb_status_vars[] = {
DEF_STATUS_VAR(block_cache_miss),
DEF_STATUS_VAR(block_cache_hit),
@@ -14842,6 +15430,8 @@ static SHOW_VAR rocksdb_status_vars[] = {
SHOW_SCOPE_GLOBAL},
{"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
SHOW_FUNC, SHOW_SCOPE_GLOBAL},
+ {"rocksdb_locktree", reinterpret_cast<char *>(show_rocksdb_locktree_vars),
+ SHOW_FUNC, SHOW_SCOPE_GLOBAL},
{NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
/*
@@ -15834,6 +16424,23 @@ void rocksdb_set_delayed_write_rate(THD *thd MY_ATTRIBUTE((unused)),
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
+void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR*,
+ void* /*var_ptr*/, const void *save) {
+ const uint64_t new_val = *static_cast<const uint64_t *>(save);
+ if (rocksdb_max_lock_memory != new_val) {
+ if (range_lock_mgr->SetMaxLockMemory(new_val)) {
+ /* NO_LINT_DEBUG */
+ sql_print_warning("MyRocks: failed to set max_lock_memory");
+ push_warning_printf(thd, Sql_condition::SL_WARNING,
+ ER_ERROR_WHEN_EXECUTING_COMMAND,
+ "Cannot set max_lock_memory to size below currently used");
+ } else {
+ // Succeeded
+ rocksdb_max_lock_memory = new_val;
+ }
+ }
+}
+
void rocksdb_set_max_latest_deadlocks(
THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)),
void *var_ptr MY_ATTRIBUTE((unused)), const void *save) {
@@ -15841,7 +16448,13 @@ void rocksdb_set_max_latest_deadlocks(
const uint32_t new_val = *static_cast<const uint32_t *>(save);
if (rocksdb_max_latest_deadlocks != new_val) {
rocksdb_max_latest_deadlocks = new_val;
- rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+ if (range_lock_mgr) {
+ auto n= rocksdb_max_latest_deadlocks;
+ range_lock_mgr->SetRangeDeadlockInfoBufferSize(n);
+ }
+ else
+ rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+
}
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
@@ -16556,11 +17169,24 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) {
}
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ bool skip_bloom_filter, const rocksdb::Slice &lower_bound_slice,
+ const rocksdb::Slice &upper_bound_slice, bool read_current,
+ bool create_snapshot) {
+ return tx->get_iterator(column_family, kd, skip_bloom_filter, lower_bound_slice,
+ upper_bound_slice, read_current, create_snapshot);
+}
+
+
+rocksdb::Iterator *rdb_tx_get_iterator(
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current,
- bool create_snapshot) {
+ bool create_snapshot, bool use_locking_iter) {
if (commit_in_the_middle(thd)) {
DBUG_ASSERT(snapshot && *snapshot == nullptr);
if (snapshot) {
@@ -16575,8 +17201,8 @@ rocksdb::Iterator *rdb_tx_get_iterator(
}
} else {
Rdb_transaction *tx = get_tx_from_thd(thd);
- return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound,
- eq_cond_upper_bound, read_current, create_snapshot);
+ return tx->get_iterator(cf, kd, skip_bloom_filter, eq_cond_lower_bound,
+ eq_cond_upper_bound, read_current, create_snapshot, use_locking_iter);
}
}
@@ -16610,6 +17236,15 @@ void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
tx->release_lock(kd, std::string(key.data(), key.size()), force);
}
+
+rocksdb::Status rdb_tx_lock_range(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const rocksdb::Endpoint &start_key,
+ const rocksdb::Endpoint &end_key)
+{
+ return tx->lock_range(kd.get_cf(), start_key, end_key);
+}
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 2af4013bf72..bdb4ba3bab9 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -90,6 +90,8 @@ extern PSI_rwlock_key key_rwlock_read_free_rpl_tables;
#endif
extern Regex_list_handler rdb_read_free_regex_handler;
+extern bool rocksdb_use_range_locking;
+
/**
@brief
Rdb_table_handler is a reference-counted structure storing information for
@@ -274,6 +276,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
/* Type of locking to apply to rows */
Rdb_lock_type m_lock_rows;
+ bool m_use_range_locking;
+
thr_locked_row_action m_locked_row_action;
/* true means we're doing an index-only read. false means otherwise. */
@@ -338,6 +342,13 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
const Rdb_key_def &kd,
const rocksdb::Slice &key) const;
+ int set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice,
+ const key_range *const end_key,
+ bool *use_locking_iterator);
+
int get_row_by_rowid(uchar *const buf, const char *const rowid,
const uint rowid_size, bool *skip_row = nullptr,
const bool skip_lookup = false,
@@ -792,6 +803,10 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
const struct update_row_info &row_info,
bool *const found, const bool skip_unique_check)
MY_ATTRIBUTE((__warn_unused_result__));
+
+ int check_and_lock_non_unique_sk(const uint key_id,
+ const struct update_row_info &row_info)
+ MY_ATTRIBUTE((__warn_unused_result__));
int check_uniqueness_and_lock(const struct update_row_info &row_info,
bool pk_changed, const bool skip_unique_check)
MY_ATTRIBUTE((__warn_unused_result__));
@@ -988,6 +1003,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
/* Need to build decoder on next read operation */
bool m_need_build_decoder;
+
+ int iter_status_to_retval(rocksdb::Iterator *it, const Rdb_key_def &kd, int not_found_code);
};
/*
@@ -1125,11 +1142,14 @@ Rdb_transaction *get_tx_from_thd(THD *const thd);
const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx);
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current = false,
- bool create_snapshot = true);
+ bool create_snapshot = true,
+ bool use_locking_iter= false);
rocksdb::Status rdb_tx_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
@@ -1142,6 +1162,11 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
rocksdb::PinnableSlice *const value,
bool exclusive, bool skip_wait);
+rocksdb::Status rdb_tx_lock_range(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const rocksdb::Endpoint &start_key,
+ const rocksdb::Endpoint &end_key);
+
void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
const rocksdb::Slice &key, bool force);
@@ -1203,4 +1228,6 @@ extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
extern bool rocksdb_enable_tmp_table;
+extern std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
} // namespace myrocks
diff --git a/storage/rocksdb/locking-iterator-partial-index.txt b/storage/rocksdb/locking-iterator-partial-index.txt
new file mode 100644
index 00000000000..d3ffb41d8f0
--- /dev/null
+++ b/storage/rocksdb/locking-iterator-partial-index.txt
@@ -0,0 +1,120 @@
+
+This document describes how Locking Reads are implemented with partial index
+iterator (Rdb_iterator_partial)
+
+== Closed ranges ==
+ha_rocksdb will make the lock_range() call before making any reads.
+
+We just need to make sure that after that, the iterator reads the latest
+committed data (not the data from the read snapshot).
+
+== Open ranges and LockingIterator ==
+
+With Open Ranges, regular indexes use LockingIterator.
+
+How does one make Rdb_iterator_partial use the LockingIterator?
+
+=== Background info: Rdb_iterator_partial ===
+
+Partial iterator is used for partial secondary keys.
+
+PK and SK share the same key prefix (see 'partial_group_keyparts=N' parameter).
+
+For a given value of key prefix:
+
+* if the number of rows exceeds 'partial_group_threshold' parameter, one reads
+ the secondary index, as usual (the SK is "materialized" for this prefix)
+
+* if the number of rows is less than 'partial_group_threshold', then the SK is
+ "not materialized" and has no rows for the prefix. The rows are obtained by
+ reading the records using PK, buffering+sorting them in memory, and then
+ returning them.
+
+* Switching from non-materialized to materialized is done "lazily" on-read.
+ The write locks the group prefix in SK (see
+ Rdb_iterator_partial::materialize_prefix())
+
+* Update/Delete operations also lock the group prefix in SK (see
+ ha_rocksdb::acquire_prefix_lock)
+
+* De-materialization is currently not performed.
+
+=== Regular reads with Rdb_iterator_partial ===
+
+Regular (that is, non locking) reads are done as follows:
+
+Step #1: first, we need to figure out the key prefix we're going to read.
+There are two possibilities:
+
+A. It can be inferred from the lookup key value.
+B. It cannot be inferred (e.g. we scan from the start of the table).
+ In this case, we read from the PK to find out the first key prefix.
+
+See Rdb_iterator_partial::get_prefix_from_start().
+
+Step #2 is to read rows within this prefix.
+
+We first try to read through the SK. If it has a row within the prefix, it
+means this prefix is materialized and we continue to read from the SK within
+the bounds of the key prefix.
+
+If the SK has no data we read all rows through the PK, buffer, sort them, and
+return. (See Rdb_iterator_partial::read_prefix_from_pk())
+
+Step #3:
+When we have exhausted rows in this key prefix, we check if we need to continue
+the scan.
+If we do, we take the current prefix value and try to get the next row after
+it using the approach in Step #1. See Rdb_iterator_partial::get_next_prefix().
+
+=== Locking Reads with Rdb_iterator_partial ===
+
+This section describes how one can perform locking reads for Steps 1-3 from the
+previous section.
+
+Step #1:
+for Case A, there's no need to lock anything.
+
+for case B, we need to lock the range from the lookup key to the start of
+the first key prefix. This can be achieved by making m_iterator_pk use
+a LockingIterator.
+
+(PK iterator uses a prefix of the lookup key, so we may end up with a
+coarser-grained lock, but this is still correct).
+
+Step #2:
+
+Quoting the previous section:
+
+> We first try to read through the SK.
+> If it has a row within the prefix, it means this prefix is materialized and we
+> continue to read from the SK within the bounds of the key prefix.
+
+If we use a LockingIterator for scanning the SK, we are doing a locking read
+and have achieved our goal.
+
+> If the SK has no data we read all rows through the PK
+
+Suppose we use a LockingIterator to try to read through the SK. The read is done
+with this call in Rdb_iterator_partial::seek_next_prefix:
+
+ rc = Rdb_iterator_base::seek(
+ direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key,
+ false, empty_end_key);
+
+If the SK has no data for the cur_prefix_key, the LockingIterator will lock the
+range before returning from this call.
+Note that it will lock just this key prefix: this is achieved by use of iterator
+bounds.
+
+If there is a Materialization operation in progress, the locking read will
+eventually come into conflict with it. To avoid hitting the conflict at the
+end, Rdb_iterator_partial::materialize_prefix() is made to acquire a lock
+on the whole prefix when Range Locking is used.
+
+Step #3: The same approach as in step #1. If we use LockingIterator for the PK
+it will correctly lock the gap between the prefixes (or just the gap after the
+last prefix if there are no further prefixes).
+
+This way, one can provide LockingIterator semantics with Rdb_iterator_partial.
+
diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc
index 37da5741cb4..2cdadf4caf2 100644
--- a/storage/rocksdb/nosql_access.cc
+++ b/storage/rocksdb/nosql_access.cc
@@ -727,10 +727,11 @@ class select_exec {
}
rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle *cf,
+ const std::shared_ptr<Rdb_key_def> &kd,
bool use_bloom,
const rocksdb::Slice &lower_bound,
const rocksdb::Slice &upper_bound) {
- return rdb_tx_get_iterator(m_thd, cf, !use_bloom, lower_bound,
+ return rdb_tx_get_iterator(m_thd, cf, kd, !use_bloom, lower_bound,
upper_bound, nullptr);
}
@@ -1569,6 +1570,7 @@ bool INLINE_ATTR select_exec::setup_iterator(THD *thd) {
} else {
m_iterator.reset(
new Rdb_iterator_base(thd, m_key_def, m_pk_def, m_tbl_def));
+// m_lower_bound_slice, m_upper_bound_slice);
}
return m_iterator == nullptr;
diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc
index 3c5f6ef7b38..13c205b8908 100644
--- a/storage/rocksdb/rdb_i_s.cc
+++ b/storage/rocksdb/rdb_i_s.cc
@@ -1772,6 +1772,61 @@ static ST_FIELD_INFO rdb_i_s_lock_info_fields_info[] = {
ROCKSDB_FIELD_INFO("MODE", 32, MYSQL_TYPE_STRING, 0),
ROCKSDB_FIELD_INFO_END};
+
+// Dump the locked key (or range) into a string. A template and its
+// specializations
+template <typename LockInfo> std::string dump_key(const LockInfo& info);
+
+// Specialization for point lock manager.
+template<>
+std::string dump_key<rocksdb::KeyLockInfo>(const rocksdb::KeyLockInfo& info) {
+ return rdb_hexdump(info.key.c_str(), info.key.length(), FN_REFLEN);
+}
+
+// Specialization for Range Lock manager.
+template<>
+std::string dump_key<rocksdb::RangeLockInfo>(
+ const rocksdb::RangeLockInfo &info) {
+ // todo: Do we need to enforce the total length limit of FN_REFLEN?
+ return rdb_hexdump_range(info.start, info.end);
+}
+
+
+//
+// A template that walks the Lock info data structure and dumps its contents.
+//
+template <typename LockInfo>
+int dump_locks(my_core::THD *thd, my_core::TABLE *table, LockInfo& lock_info) {
+ for (const auto &lock : lock_info) {
+ const uint32_t cf_id = lock.first;
+ const auto &one_lock_info = lock.second;
+
+ // Call the templated function to get string representation of the acquired
+ // lock
+ std::string key_hexstr = dump_key(one_lock_info);
+
+ for (const auto &id : one_lock_info.ids) {
+ table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, true);
+ table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
+
+ table->field[RDB_LOCKS_FIELD::KEY]->store(key_hexstr.c_str(),
+ key_hexstr.size(),
+ system_charset_info);
+ table->field[RDB_LOCKS_FIELD::MODE]->store(
+ one_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
+
+ /* Tell MySQL about this row in the virtual table */
+ int ret = static_cast<int>(
+ my_core::schema_table_store_record(thd, table));
+
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
/* Fill the information_schema.rocksdb_locks virtual table */
static int rdb_i_s_lock_info_fill_table(
my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -1791,36 +1846,16 @@ static int rdb_i_s_lock_info_fill_table(
DBUG_RETURN(ret);
}
- /* cf id -> rocksdb::KeyLockInfo */
- std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info =
- rdb->GetLockStatusData();
-
- for (const auto &lock : lock_info) {
- const uint32_t cf_id = lock.first;
- const auto &key_lock_info = lock.second;
- const auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(),
- key_lock_info.key.length(), FN_REFLEN);
-
- for (const auto &id : key_lock_info.ids) {
- tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
- true);
- tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
-
- tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
- key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
- tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
- key_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
-
- /* Tell MySQL about this row in the virtual table */
- ret = static_cast<int>(
- my_core::schema_table_store_record(thd, tables->table));
-
- if (ret != 0) {
- break;
- }
- }
+ if (range_lock_mgr) {
+ // Use Range Lock Manager's interface for obtaining more specific
+ // information about the acquired locks
+ auto lock_info = range_lock_mgr->GetRangeLockStatusData();
+ ret= dump_locks(thd, tables->table, lock_info);
+ } else {
+ // Obtain information from point lock manager
+ auto lock_info = rdb->GetLockStatusData();
+ ret= dump_locks(thd, tables->table, lock_info);
}
-
DBUG_RETURN(ret);
}
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 11e538370fd..bfe38c23199 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -36,6 +36,8 @@ Rdb_iterator_base::Rdb_iterator_base(THD *thd,
m_tbl_def(tbl_def),
m_thd(thd),
m_scan_it(nullptr),
+ m_iter_uses_locking(false),
+ m_iter_should_use_locking(false),
m_scan_it_skips_bloom(false),
m_scan_it_snapshot(nullptr),
m_scan_it_lower_bound(nullptr),
@@ -83,7 +85,7 @@ int Rdb_iterator_base::read_before_key(const bool full_key_match,
return HA_EXIT_SUCCESS;
}
- return HA_ERR_END_OF_FILE;
+ return iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
@@ -98,12 +100,15 @@ int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
*/
rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice);
- return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE;
+ return is_valid_iterator(m_scan_it) ?
+ HA_EXIT_SUCCESS :
+ iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
void Rdb_iterator_base::release_scan_iterator() {
delete m_scan_it;
m_scan_it = nullptr;
+ m_iter_uses_locking = false;
if (m_scan_it_snapshot) {
auto rdb = rdb_get_rocksdb_db();
@@ -154,7 +159,8 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
and
re-create Iterator.
*/
- if (m_scan_it_skips_bloom != skip_bloom) {
+ if (m_scan_it_skips_bloom != skip_bloom ||
+ m_iter_uses_locking != m_iter_should_use_locking) {
release_scan_iterator();
}
@@ -164,10 +170,12 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
*/
if (!m_scan_it) {
m_scan_it = rdb_tx_get_iterator(
- m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice,
+ m_thd, m_kd->get_cf(), m_kd, skip_bloom,
+ m_scan_it_lower_bound_slice,
m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current,
- !read_current);
+ !read_current, m_iter_should_use_locking);
m_scan_it_skips_bloom = skip_bloom;
+ m_iter_uses_locking = m_iter_should_use_locking;
}
}
@@ -208,6 +216,20 @@ int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag,
return Rdb_key_def::INDEX_NUMBER_SIZE;
}
+int Rdb_iterator_base::iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code) {
+ if (it->Valid())
+ return HA_EXIT_SUCCESS;
+
+ rocksdb::Status s= it->status();
+ if (s.ok() || s.IsNotFound())
+ return not_found_code;
+
+ Rdb_transaction *tx = get_tx_from_thd(m_thd);
+ return rdb_tx_set_status_error(tx, s, *kd, m_tbl_def);
+}
+
int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
int rc = 0;
const auto &kd = *m_kd;
@@ -237,7 +259,7 @@ int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
}
if (!is_valid_iterator(m_scan_it)) {
- rc = HA_ERR_END_OF_FILE;
+ rc = iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
break;
}
@@ -617,8 +639,16 @@ int Rdb_iterator_partial::materialize_prefix() {
const char *old_proc_info = m_thd->get_proc_info();
thd_proc_info(m_thd, "Materializing group in partial index");
- auto s =
- rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, true, false);
+ rocksdb::Status s;
+ if (rocksdb_use_range_locking) {
+ rocksdb::Endpoint start_endp(cur_prefix_key, false);
+ rocksdb::Endpoint end_endp(cur_prefix_key, true);
+ s = rdb_tx_lock_range(tx, *m_kd, start_endp, end_endp);
+ } else {
+ s = rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, true,
+ false);
+ }
+
if (!s.ok()) {
thd_proc_info(m_thd, old_proc_info);
return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
@@ -815,7 +845,14 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
return HA_ERR_INTERNAL_ERROR;
}
+ // Save the value because reset() clears it:
+ auto save_iter_should_use_locking= m_iter_should_use_locking;
reset();
+ m_iter_should_use_locking= save_iter_should_use_locking;
+ // Range Locking: when the iterator does a locking read,
+ // both secondary and primary key iterators should use locking reads.
+ if (m_iter_should_use_locking)
+ m_iterator_pk.set_use_locking();
bool direction = (find_flag == HA_READ_KEY_EXACT) ||
(find_flag == HA_READ_AFTER_KEY) ||
@@ -831,6 +868,15 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
m_cur_prefix_key_len);
m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+ //
+ // Range Locking note: When using a locking iterator (i.e.
+ // m_iter_should_use_locking=true), this will read (and lock)
+  // the value space up to the next prefix.
+ // If the next prefix is not materialized, it will lock the whole
+ // prefix in the secondary key. It will not lock more than that,
+  // because the iterator uses the iterator bounds to limit the scan
+ // to the prefix specified.
+ //
rc = Rdb_iterator_base::seek(find_flag, start_key, true, end_key,
read_current);
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 7fd1ff470c7..0a516c384f5 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -69,6 +69,8 @@ class Rdb_iterator {
virtual rocksdb::Slice key() = 0;
virtual rocksdb::Slice value() = 0;
virtual void reset() = 0;
+
+ virtual void set_use_locking()=0;
};
class Rdb_iterator_base : public Rdb_iterator {
@@ -107,8 +109,12 @@ class Rdb_iterator_base : public Rdb_iterator {
rocksdb::Slice value() override { return m_scan_it->value(); }
- void reset() override { release_scan_iterator(); }
+ void reset() override {
+ m_iter_should_use_locking = false;
+ release_scan_iterator();
+ }
+ void set_use_locking() override { m_iter_should_use_locking = true; }
protected:
friend class Rdb_iterator;
const std::shared_ptr<Rdb_key_def> m_kd;
@@ -123,6 +129,18 @@ class Rdb_iterator_base : public Rdb_iterator {
/* Iterator used for range scans and for full table/index scans */
rocksdb::Iterator *m_scan_it;
+ /* Whether m_scan_it is a locking iterator */
+ bool m_iter_uses_locking;
+
+ /*
+ Whether the iterator should use locking. The intended workflow is:
+
+ iter.set_use_locking() // sets m_iter_should_use_locking=true
+ iter.seek() // this will re-create m_scan_it to be the right kind
+ // of iterator
+ */
+ bool m_iter_should_use_locking;
+
/* Whether m_scan_it was created with skip_bloom=true */
bool m_scan_it_skips_bloom;
@@ -136,8 +154,17 @@ class Rdb_iterator_base : public Rdb_iterator {
uchar *m_prefix_buf;
rocksdb::Slice m_prefix_tuple;
+
+ int iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code);
};
+/*
+ Iterator for reading partial secondary indexes
+
+  It can do locking reads, see locking-iterator-partial-index.txt for details.
+*/
class Rdb_iterator_partial : public Rdb_iterator_base {
private:
TABLE *m_table;
diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc
new file mode 100644
index 00000000000..35b3763f7f3
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.cc
@@ -0,0 +1,285 @@
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+
+/* This C++ file's header file */
+#include "./rdb_locking_iter.h"
+
+namespace myrocks {
+
+rocksdb::Iterator* GetLockingIterator(
+ rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ ulonglong *counter) {
+ return new LockingIterator(trx, column_family, kd, read_options,
+ counter);
+}
+
+/*
+ @brief
+ Seek to the first key K that is equal or greater than target,
+ locking the range [target; K].
+*/
+
+void LockingIterator::Seek(const rocksdb::Slice& target) {
+ m_have_locked_until = false;
+ m_iter = m_txn->GetIterator(m_read_opts, m_cfh);
+ m_iter->Seek(target);
+ ScanForward(target, false);
+}
+
+void LockingIterator::SeekForPrev(const rocksdb::Slice& target) {
+ m_have_locked_until = false;
+ m_iter = m_txn->GetIterator(m_read_opts, m_cfh);
+ m_iter->SeekForPrev(target);
+ ScanBackward(target, false);
+}
+
+/*
+ @brief
+ Move the iterator to the next key, locking the range between the current
+ and the next key.
+
+ @detail
+ Implementation is similar to Seek(next_key). Since we don't know what the
+ next_key is, we reach it by calling { Seek(current_key); Next(); }
+*/
+void LockingIterator::Next() {
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next");
+ assert(Valid());
+ // Save the current key value. We need it as the left endpoint
+ // of the range lock we're going to acquire
+ std::string current_key = m_iter->key().ToString();
+
+ m_iter->Next();
+ ScanForward(rocksdb::Slice(current_key), true);
+}
+
+/*
+ @brief
+ Move the iterator to the previous key, locking the range between the current
+ and the previous key.
+*/
+
+void LockingIterator::Prev() {
+ assert(Valid());
+
+ std::string current_key = m_iter->key().ToString();
+ m_iter->Prev();
+ ScanBackward(rocksdb::Slice(current_key), true);
+}
+
+/*
+ @brief
+ Lock range from target to end_key.
+
+ @detail
+ In forward-ordered scan, target < end_key. In backward-ordered scan, it's
+ other way around.
+
+ We might have already locked a subset of this range, a subrange that
+ starts from target and extends to some point between target and end_key.
+*/
+void LockingIterator::lock_up_to(bool scan_forward,
+ const rocksdb::Slice& target,
+ const rocksdb::Slice& end_key) {
+ const int inv = scan_forward ? 1 : -1;
+ auto cmp= m_cfh->GetComparator();
+ bool endp_arg= m_kd->m_is_reverse_cf;
+
+ if (m_have_locked_until &&
+ cmp->Compare(end_key, rocksdb::Slice(m_locked_until))*inv <= 0) {
+ // We've already locked this range. The following has happened:
+ // - m_iter->key() returned $KEY
+ // - other transaction(s) have inserted row $ROW before the $KEY.
+ // - we got a range lock on [range_start, $KEY]
+ // - we've read $ROW and returned.
+ // Now, we're looking to lock [$ROW, $KEY] but we don't need to,
+ // we already have a lock on this range.
+ } else {
+ if (scan_forward) {
+ m_status = m_txn->GetRangeLock(m_cfh,
+ rocksdb::Endpoint(target, endp_arg),
+ rocksdb::Endpoint(end_key, endp_arg));
+ } else {
+ m_status = m_txn->GetRangeLock(m_cfh,
+ rocksdb::Endpoint(end_key, endp_arg),
+ rocksdb::Endpoint(target, endp_arg));
+ }
+
+ if (!m_status.ok())
+ return;
+
+ // Save the bound where we locked until:
+ m_have_locked_until= true;
+ m_locked_until.assign(end_key.data(), end_key.size());
+ if (m_lock_count) (*m_lock_count)++;
+ }
+}
+
+/*
+  Lock the range from target till the iterator end point that we are scanning
+  towards. If there's no iterator bound, use index start (or end, depending
+  on the scan direction)
+*/
+void LockingIterator::lock_till_iterator_end(bool scan_forward,
+ const rocksdb::Slice& target) {
+ rocksdb::Slice end;
+ uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE];
+ uint size;
+ if (scan_forward) {
+ if (m_read_opts.iterate_upper_bound)
+ end = *m_read_opts.iterate_upper_bound;
+ else {
+ if (m_kd->m_is_reverse_cf)
+ m_kd->get_infimum_key(buf, &size);
+ else
+ m_kd->get_supremum_key(buf, &size);
+
+ DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
+ end = rocksdb::Slice((const char*)buf, size);
+ }
+ } else {
+ if (m_read_opts.iterate_lower_bound)
+ end = *m_read_opts.iterate_lower_bound;
+ else {
+ if (m_kd->m_is_reverse_cf)
+ m_kd->get_supremum_key(buf, &size);
+ else
+ m_kd->get_infimum_key(buf, &size);
+
+ DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
+ end = rocksdb::Slice((const char*)buf, size);
+ }
+ }
+ // This will set m_status accordingly
+ lock_up_to(scan_forward, target, end);
+}
+
+/*
+ Lock the range between [target, (current m_iter position)] and position
+ the iterator on the first record in it.
+
+ @param call_next false means current iterator position is achieved by
+ calling Seek(target).
+ true means one also needs to call Next()
+*/
+void LockingIterator::Scan(bool scan_forward,
+ const rocksdb::Slice& target, bool call_next) {
+ if (!m_iter->Valid()) {
+ m_status = m_iter->status();
+ m_valid = false;
+ if (m_status.ok()) {
+ // m_iter has reached EOF
+ lock_till_iterator_end(scan_forward, target);
+ }
+ return;
+ }
+
+ while (1) {
+
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.locking_iter_scan");
+
+ if (my_core::thd_killed(current_thd)) {
+ m_status = rocksdb::Status::Aborted();
+ m_valid = false;
+ return;
+ }
+
+ const int inv = scan_forward ? 1 : -1;
+ auto cmp= m_cfh->GetComparator();
+
+ auto end_key = m_iter->key();
+ std::string end_key_copy= end_key.ToString();
+
+ lock_up_to(scan_forward, target, end_key);
+ if (!m_status.ok()) {
+ // Failed to get a lock (most likely lock wait timeout)
+ m_valid = false;
+ return;
+ }
+
+ //Ok, now we have a lock which is inhibiting modifications in the range
+ // Somebody might have done external modifications, though:
+ // - removed the key we've found
+ // - added a key before that key.
+
+ // First, refresh the iterator:
+ delete m_iter;
+ m_iter = m_txn->GetIterator(m_read_opts, m_cfh);
+
+ // Then, try seeking to the same row
+ if (scan_forward)
+ m_iter->Seek(target);
+ else
+ m_iter->SeekForPrev(target);
+
+ if (call_next && m_iter->Valid() &&
+ !cmp->Compare(m_iter->key(), target)) {
+ if (scan_forward)
+ m_iter->Next();
+ else
+ m_iter->Prev();
+ }
+
+ if (m_iter->Valid()) {
+ if (cmp->Compare(m_iter->key(), rocksdb::Slice(end_key_copy))*inv <= 0) {
+ // Ok, the found key is within the locked range.
+ m_status = rocksdb::Status::OK();
+ m_valid= true;
+ break;
+ } else {
+ // We've got a key but it is outside the range we've locked.
+ // Re-try the lock-and-read step.
+ continue;
+ }
+ } else {
+ m_valid = false;
+ m_status = m_iter->status();
+ if (m_status.ok()) {
+ // m_iter has reached EOF
+ lock_till_iterator_end(scan_forward, target);
+ }
+ break;
+ }
+ }
+}
+
+/*
+ @detail
+ Ideally, this function should
+ - find the first key $first_key
+ - lock the range [-inf; $first_key]
+ - return, the iterator is positioned on $first_key
+
+ The problem here is that we cannot have "-infinity" bound.
+
+ Note: we don't have a practical use for this function - MyRocks always
+ searches within one index_name.table_name, which means we are only looking
+ at the keys with index_number as the prefix.
+*/
+
+void LockingIterator::SeekToFirst() {
+ DBUG_ASSERT(0);
+ m_status = rocksdb::Status::NotSupported("Not implemented");
+ m_valid = false;
+}
+
+/*
+ @detail
+ See SeekToFirst.
+*/
+
+void LockingIterator::SeekToLast() {
+ DBUG_ASSERT(0);
+ m_status = rocksdb::Status::NotSupported("Not implemented");
+ m_valid = false;
+}
+
+} // namespace myrocks
+
diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h
new file mode 100644
index 00000000000..3cd09143649
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.h
@@ -0,0 +1,120 @@
+
+/* MySQL header files */
+#include "sql/handler.h" /* handler */
+#include "sql/debug_sync.h"
+#include "./rdb_threads.h" /* for thd_get_current_thd */
+
+/* MyRocks header files */
+#include "./ha_rocksdb.h"
+#include "./rdb_datadic.h"
+
+namespace myrocks {
+
+//
+// LockingIterator is an iterator that locks the rows before returning, as well
+// as scanned gaps between the rows.
+//
+// Example:
+// lock_iter= trx->GetLockingIterator();
+// lock_iter->Seek('abc');
+// lock_iter->Valid()==true && lock_iter->key() == 'bcd';
+//
+// After the above, the returned record 'bcd' is locked by transaction trx.
+// Also, the range between ['abc'..'bcd'] is empty and locked by trx.
+//
+// lock_iter->Next();
+// lock_iter->Valid()==true && lock_iter->key() == 'efg'
+//
+// Now, the range ['bcd'.. 'efg'] (bounds inclusive) is also locked, and there are no
+// records between 'bcd' and 'efg'.
+//
+class LockingIterator : public rocksdb::Iterator {
+
+ rocksdb::Transaction *m_txn;
+ rocksdb::ColumnFamilyHandle* m_cfh;
+ const std::shared_ptr<Rdb_key_def> &m_kd;
+
+ rocksdb::ReadOptions m_read_opts;
+ rocksdb::Iterator *m_iter;
+ rocksdb::Status m_status;
+
+ // note: an iterator that has reached EOF has status()==OK && m_valid==false
+ bool m_valid;
+
+ ulonglong *m_lock_count;
+
+ // If true, m_locked_until has a valid key value.
+ bool m_have_locked_until;
+
+  // The key value up to which we've locked the range. That is, we hold a
+  // range lock on [current_position ... m_locked_until].
+  // This is used to avoid making extra GetRangeLock() calls.
+ std::string m_locked_until;
+ public:
+ LockingIterator(rocksdb::Transaction *txn,
+ rocksdb::ColumnFamilyHandle *cfh,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ const rocksdb::ReadOptions& opts,
+ ulonglong *lock_count=nullptr
+ ) :
+ m_txn(txn), m_cfh(cfh), m_kd(kd), m_read_opts(opts), m_iter(nullptr),
+ m_status(rocksdb::Status::InvalidArgument()), m_valid(false),
+ m_lock_count(lock_count), m_have_locked_until(false) {}
+
+ ~LockingIterator() {
+ delete m_iter;
+ }
+
+ virtual bool Valid() const override { return m_valid; }
+
+ // Note: MyRocks doesn't ever call these:
+ virtual void SeekToFirst() override;
+ virtual void SeekToLast() override;
+
+ virtual void Seek(const rocksdb::Slice& target) override;
+
+  // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ virtual void SeekForPrev(const rocksdb::Slice& target) override;
+
+ virtual void Next() override;
+ virtual void Prev() override;
+
+ virtual rocksdb::Slice key() const override {
+ assert(Valid());
+ return m_iter->key();
+ }
+
+ virtual rocksdb::Slice value() const override {
+ assert(Valid());
+ return m_iter->value();
+ }
+
+ virtual rocksdb::Status status() const override {
+ return m_status;
+ }
+
+ private:
+ void lock_up_to(bool scan_forward, const rocksdb::Slice& target,
+ const rocksdb::Slice& end_key);
+ void lock_till_iterator_end(bool scan_forward, const rocksdb::Slice& target);
+ void Scan(bool scan_forward, const rocksdb::Slice& target, bool call_next);
+
+ inline void ScanForward(const rocksdb::Slice& target, bool call_next) {
+ Scan(true, target, call_next);
+ }
+
+ inline void ScanBackward(const rocksdb::Slice& target, bool call_next) {
+ Scan(false, target, call_next);
+ }
+};
+
+rocksdb::Iterator*
+GetLockingIterator(rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ const std::shared_ptr<Rdb_key_def>& kd,
+ ulonglong *counter);
+
+} // namespace myrocks
diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc
index a723ac9e806..88695aa0539 100644
--- a/storage/rocksdb/rdb_utils.cc
+++ b/storage/rocksdb/rdb_utils.cc
@@ -259,6 +259,33 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
return str;
}
+/*
+ Print the range in hex, in "start_endpoint-end_endpoint" form
+*/
+
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& start,
+ const rocksdb::EndpointWithString& end) {
+ std::string res;
+ // For keys: :0 keys should look like point keys
+ if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) {
+ // This is a single-point range, show it like a key
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ } else {
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ if (start.inf_suffix)
+ res.append(":1");
+
+ res.append("-");
+
+ std::string key2 = rdb_hexdump(end.slice.c_str(), end.slice.length(),
+ FN_REFLEN);
+ if (end.inf_suffix)
+ key2.append(":1");
+ res.append(key2);
+ }
+ return res;
+}
+
/*
Attempt to access the database subdirectory to see if it exists
*/
diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h
index f49f102a08c..39f2096dd35 100644
--- a/storage/rocksdb/rdb_utils.h
+++ b/storage/rocksdb/rdb_utils.h
@@ -29,6 +29,7 @@
/* RocksDB header files */
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
/* MyRocks header files */
#include "./rdb_global.h"
@@ -290,6 +291,8 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
const std::size_t maxsize = 0)
MY_ATTRIBUTE((__nonnull__));
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& left,
+ const rocksdb::EndpointWithString& right);
/*
Helper function to see if a database exists
*/
1
0
[Commits] 168c97ea50f: This adds a my.cnf parameter, rocksdb_use_range_locking.
by psergey 14 Feb '22
by psergey 14 Feb '22
14 Feb '22
revision-id: 168c97ea50fe367956cb7a570660d2fdf8b2c5e6 ()
parent(s): 5098a1526a5021064ed7307e7e0ba30453723f66
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-02-14 23:36:26 +0300
message:
This adds a my.cnf parameter, rocksdb_use_range_locking.
When it is ON, MyRocks will:
- initialize RocksDB to use range-locking lock manager
- for all DML operations (including SELECT .. FOR UPDATE) will lock
the scanned range before reading/modifying rows.
- In range locking mode, there is no snapshot checking (cannot do that
for ranges). Instead, MyRocks will read and modify latest committed
data, just like InnoDB does (in the code, grep for (start|end)_ignore_
snapshot)
- Queries that do not have a finite range to scan, like
UPDATE t1 .... ORDER BY t1.key LIMIT n
will use a "Locking iterator" which will read rows, lock the range,
and re-read the rows. See class LockingIterator.
---
mysql-test/suite/rocksdb/combinations | 3 +
.../suite/rocksdb/include/have_range_locking.inc | 3 +
.../suite/rocksdb/include/not_range_locking.inc | 5 +
.../rocksdb/include/select_from_is_rowlocks.inc | 99 +++
.../suite/rocksdb/r/hermitage-range_locking.result | 652 +++++++++++++++++++
...issue243_transactionStatus-range_locking.result | 182 ++++++
.../r/level_repeatable_read-range_locking.result | 106 +++
mysql-test/suite/rocksdb/r/range_locking.result | 596 +++++++++++++++++
.../suite/rocksdb/r/range_locking_conc_test.result | 4 +
.../r/range_locking_deadlock_tracking.result | 453 +++++++++++++
.../rocksdb/r/range_locking_escalation.result | 27 +
.../rocksdb/r/range_locking_partial_index.result | 203 ++++++
.../rocksdb/r/range_locking_refresh_iter.result | 50 ++
.../suite/rocksdb/r/range_locking_rev_cf.result | 556 ++++++++++++++++
.../rocksdb/r/range_locking_seek_for_update.result | 291 +++++++++
.../r/range_locking_seek_for_update2.result | 142 ++++
.../r/range_locking_seek_for_update2_rev_cf.result | 142 ++++
.../rocksdb/r/range_locking_shared_locks.result | 251 ++++++++
mysql-test/suite/rocksdb/r/rocksdb.result | 3 +
.../suite/rocksdb/r/rocksdb_read_free_rpl.result | 2 +-
.../rocksdb/r/rocksdb_timeout_rollback.result | 3 +
.../suite/rocksdb/r/select_count_for_update.result | 4 +-
mysql-test/suite/rocksdb/r/trx_info.result | 7 +-
mysql-test/suite/rocksdb/r/unique_sec.result | 4 +
.../suite/rocksdb/r/unique_sec_rev_cf.result | 4 +
mysql-test/suite/rocksdb/t/deadlock_tracking.test | 7 +-
.../t/drop_cf_before_show_deadlock_info.test | 4 +
.../suite/rocksdb/t/hermitage-range_locking.test | 15 +
mysql-test/suite/rocksdb/t/hermitage.inc | 14 +-
mysql-test/suite/rocksdb/t/hermitage.test | 3 +
mysql-test/suite/rocksdb/t/i_s_deadlock.test | 4 +
mysql-test/suite/rocksdb/t/issue111.test | 4 +
.../issue243_transactionStatus-range_locking.test | 10 +
.../rocksdb/t/issue243_transactionStatus.test | 4 +
.../t/level_repeatable_read-range_locking.test | 9 +
.../suite/rocksdb/t/level_repeatable_read.test | 3 +
mysql-test/suite/rocksdb/t/lock_info.test | 3 +
mysql-test/suite/rocksdb/t/locking_issues.test | 3 +
mysql-test/suite/rocksdb/t/max_row_locks.test | 1 +
mysql-test/suite/rocksdb/t/range_locking.inc | 612 ++++++++++++++++++
mysql-test/suite/rocksdb/t/range_locking.test | 6 +
.../suite/rocksdb/t/range_locking_conc_test.py | 448 +++++++++++++
.../suite/rocksdb/t/range_locking_conc_test.test | 17 +
.../suite/rocksdb/t/range_locking_conc_test.txt | 91 +++
.../rocksdb/t/range_locking_deadlock_tracking.test | 196 ++++++
.../rocksdb/t/range_locking_escalation-master.opt | 1 +
.../suite/rocksdb/t/range_locking_escalation.test | 39 ++
.../rocksdb/t/range_locking_partial_index.test | 120 ++++
.../rocksdb/t/range_locking_refresh_iter.test | 70 ++
.../suite/rocksdb/t/range_locking_rev_cf.test | 12 +
.../rocksdb/t/range_locking_seek_for_update.test | 308 +++++++++
.../rocksdb/t/range_locking_seek_for_update2.inc | 55 ++
.../rocksdb/t/range_locking_seek_for_update2.test | 4 +
.../t/range_locking_seek_for_update2_rev_cf.test | 4 +
.../t/range_locking_seek_for_update_iter_end.inc | 41 ++
.../rocksdb/t/range_locking_shared_locks.test | 202 ++++++
mysql-test/suite/rocksdb/t/rocksdb.test | 3 +
.../suite/rocksdb/t/rocksdb_concurrent_delete.test | 4 +
mysql-test/suite/rocksdb/t/rocksdb_locks.test | 3 +
.../suite/rocksdb/t/rocksdb_read_free_rpl.test | 2 +-
.../suite/rocksdb/t/rocksdb_timeout_rollback.test | 2 +
mysql-test/suite/rocksdb/t/rpl_row_not_found.inc | 2 +
.../suite/rocksdb/t/select_count_for_update.test | 14 +
.../suite/rocksdb/t/select_lock_in_share_mode.test | 3 +
mysql-test/suite/rocksdb/t/skip_locked_nowait.test | 3 +
mysql-test/suite/rocksdb/t/trx_info.test | 6 +-
mysql-test/suite/rocksdb/t/unique_check.test | 5 +
mysql-test/suite/rocksdb/t/unique_sec.inc | 10 +-
mysql-test/suite/rocksdb/t/varbinary_format.test | 4 +
mysql-test/suite/rocksdb/t/varchar_format.test | 2 +
.../r/rocksdb_max_lock_memory_basic.result | 7 +
.../r/rocksdb_use_range_locking_basic.result | 7 +
.../t/rocksdb_max_lock_memory_basic.test | 5 +
.../t/rocksdb_use_range_locking_basic.test | 5 +
rocksdb | 2 +-
storage/rocksdb/CMakeLists.txt | 1 +
storage/rocksdb/get_rocksdb_files.sh | 2 +-
storage/rocksdb/ha_rocksdb.cc | 717 +++++++++++++++++++--
storage/rocksdb/ha_rocksdb.h | 31 +-
storage/rocksdb/locking-iterator-partial-index.txt | 120 ++++
storage/rocksdb/nosql_access.cc | 4 +-
storage/rocksdb/rdb_i_s.cc | 93 ++-
storage/rocksdb/rdb_iterator.cc | 62 +-
storage/rocksdb/rdb_iterator.h | 29 +-
storage/rocksdb/rdb_locking_iter.cc | 285 ++++++++
storage/rocksdb/rdb_locking_iter.h | 120 ++++
storage/rocksdb/rdb_utils.cc | 27 +
storage/rocksdb/rdb_utils.h | 3 +
88 files changed, 7587 insertions(+), 98 deletions(-)
diff --git a/mysql-test/suite/rocksdb/combinations b/mysql-test/suite/rocksdb/combinations
index acf2f49a0c3..5e3b56932c6 100644
--- a/mysql-test/suite/rocksdb/combinations
+++ b/mysql-test/suite/rocksdb/combinations
@@ -7,3 +7,6 @@ rocksdb_write_policy=write_prepared
[write_unprepared]
rocksdb_write_policy=write_unprepared
rocksdb_write_batch_flush_threshold=1
+
+[range_locking]
+rocksdb_use_range_locking=1
diff --git a/mysql-test/suite/rocksdb/include/have_range_locking.inc b/mysql-test/suite/rocksdb/include/have_range_locking.inc
new file mode 100644
index 00000000000..a8600daea77
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/have_range_locking.inc
@@ -0,0 +1,3 @@
+if (`select count(*) = 0 from performance_schema.session_variables where variable_name = 'rocksdb_use_range_locking' and variable_value = 'ON';`) {
+ --skip Test requires range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/not_range_locking.inc b/mysql-test/suite/rocksdb/include/not_range_locking.inc
new file mode 100644
index 00000000000..62c26b134bc
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/not_range_locking.inc
@@ -0,0 +1,5 @@
+--let $_use_range_locking= `select @@rocksdb_use_range_locking`
+if ($_use_range_locking == 1)
+{
+ --skip Test doesn't support range locking
+}
diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
new file mode 100644
index 00000000000..cc6a328e566
--- /dev/null
+++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
@@ -0,0 +1,99 @@
+--echo # select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+#
+# An include to print contents of I_S.ROCKSDB_LOCKS
+#
+# Implicit "parameters"
+# - Currently it prints locks on t1.PRIMARY
+#
+# Explicit "parameter" variables:
+# - $TRX1_ID - print this transaction as "TRX1"
+# - $TRX2_ID - print this transaction as "TRX2"
+#
+# - $select_from_is_rowlocks_current_trx_only
+# - $order_by_rowkey
+#
+# - $SECOND_INDEX_NAME
+
+--disable_query_log
+set @cf_id=(select column_family from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx
+ where thread_id=connection_id());
+set @indexnr= (select lower(lpad(hex(index_number),8,'0')) from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+set @indexnr_next= (select lower(lpad(hex(index_number+1),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='PRIMARY');
+
+let $extra_where = where 1;
+
+if ($select_from_is_rowlocks_current_trx_only)
+{
+ let $extra_where = where transaction_id=(select transaction_id from information_schema.rocksdb_trx where connection_id()=thread_id);
+}
+
+## transaction column
+
+# If TRX1_ID is not specified, get the current transaction:
+let $transaction_col= replace(transaction_id, @rtrx_id, "\$trx_id");
+if ($TRX1_ID)
+{
+ let $transaction_col = replace(transaction_id, '$TRX1_ID', "\$TRX1_ID");
+}
+
+if ($TRX2_ID)
+{
+ let $transaction_col = replace($transaction_col, '$TRX2_ID', "\$TRX2_ID");
+}
+
+## CF_ID column
+let $cf_id_col= column_family_id;
+
+if ($SECOND_INDEX_NAME)
+{
+ eval set @cf2_id=(select column_family from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='$SECOND_INDEX_NAME');
+
+ let $cf_id_col= replace($cf_id_col, @cf2_id, "\$cf2_id");
+}
+let $cf_id_col= replace($cf_id_col, @cf_id, "\$cf_id");
+
+## KEY column
+let $key_col= (`key`);
+if ($SECOND_INDEX_NAME)
+{
+ eval set @indexnr2= (select lower(lpad(hex(index_number),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='$SECOND_INDEX_NAME');
+
+ eval set @indexnr2_next= (select lower(lpad(hex(index_number+1),8,'0'))
+ from information_schema.rocksdb_ddl
+ where table_name='t1' and index_name='$SECOND_INDEX_NAME');
+
+ let $key_col = replace($key_col, @indexnr2, '\${indexnr2}');
+ let $key_col = replace($key_col, @indexnr2_next, '\${indexnr2+1}');
+}
+
+let $key_col = replace($key_col, @indexnr, '\${indexnr}');
+let $key_col = replace($key_col, @indexnr_next, '\${indexnr+1}');
+
+## ORDER BY
+if ($order_by_rowkey)
+{
+ let $extra_order_by = ORDER BY 3,2;
+}
+
+if (!$order_by_rowkey)
+{
+ --sorted_result
+}
+
+eval select
+ $cf_id_col as COLUMN_FAMILY_ID,
+ $transaction_col as TRANSACTION_ID,
+ $key_col as `KEY`,
+ mode
+from information_schema.rocksdb_locks $extra_where $extra_order_by;
+
+--enable_query_log
diff --git a/mysql-test/suite/rocksdb/r/hermitage-range_locking.result b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
new file mode 100644
index 00000000000..3938fa38b6c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/hermitage-range_locking.result
@@ -0,0 +1,652 @@
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 11
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 12
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 30
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+1 12
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 18
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
+DROP TABLE IF EXISTS test;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con3,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+create table test (id int primary key, value int) engine=rocksdb;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+rollback;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 101 where id = 1;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+commit;
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 22 where id = 2;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = 11 where id = 1;
+update test set value = 19 where id = 2;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+update test set value = 18 where id = 2;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+connection con2;
+commit;
+connection con3;
+select * from test;
+id value
+1 11
+2 19
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value = 30;
+id value
+connection con2;
+insert into test (id, value) values(3, 30);
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+update test set value = value + 10;
+connection con2;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_snapshot_conflict_errors';
+select * from test;
+id value
+1 10
+2 20
+delete from test where value = 20;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 12 where id = 1;
+connection con1;
+commit;
+connection con2;
+select * from test;
+id value
+1 12
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test where id = 1;
+id value
+1 10
+select * from test where id = 2;
+id value
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 5 = 0;
+id value
+1 10
+2 20
+connection con2;
+update test set value = 12 where value = 10;
+commit;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id = 1;
+id value
+1 10
+connection con2;
+select * from test;
+id value
+1 10
+2 20
+update test set value = 12 where id = 1;
+update test set value = 18 where id = 2;
+commit;
+connection con1;
+delete from test where value = 20;
+select * from test where id = 2;
+id value
+2 20
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con2;
+select * from test where id in (1,2);
+id value
+1 10
+2 20
+connection con1;
+update test set value = 11 where id = 1;
+connection con2;
+update test set value = 21 where id = 2;
+connection con1;
+commit;
+connection con2;
+commit;
+connection con1;
+truncate table test;
+insert into test (id, value) values (1, 10), (2, 20);
+begin;
+connection con2;
+begin;
+connection con3;
+begin;
+connection con1;
+select * from test where value % 3 = 0;
+id value
+connection con2;
+select * from test where value % 3 = 0;
+id value
+connection con1;
+insert into test (id, value) values(3, 30);
+connection con2;
+insert into test (id, value) values(4, 42);
+connection con1;
+commit;
+connection con2;
+commit;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection con1;
+select * from test where value % 3 = 0;
+id value
+3 30
+4 42
+connection default;
+drop table test;
+disconnect con1;
+disconnect con2;
+disconnect con3;
diff --git a/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
new file mode 100644
index 00000000000..b48535c5ee6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/issue243_transactionStatus-range_locking.result
@@ -0,0 +1,182 @@
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (
+id INT,
+val1 INT,
+val2 INT,
+PRIMARY KEY (id)
+) ENGINE=rocksdb;
+INSERT INTO t1 VALUES(1,1,1),(2,1,2);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 1 2
+UPDATE t1 SET val1=2 WHERE id=2;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t1 VALUES(20,1,1),(30,30,30);
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 1 1
+30 30 30
+UPDATE t1 SET val1=20, val2=20 WHERE id=20;
+SELECT * FROM t1;
+id val1 val2
+1 1 1
+2 2 2
+20 20 20
+30 30 30
+DELETE FROM t1 WHERE id=30;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+---SNAPSHOT, ACTIVE NUM sec
+MySQL thread id TID, OS thread handle PTR, query id QID localhost root ACTION
+SHOW ENGINE rocksdb TRANSACTION STATUS
+lock count 4, write count 4
+insert count 2, update count 1, delete count 1
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+START TRANSACTION;
+INSERT INTO t1 VALUES(40,40,40);
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+COMMIT;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+SET AUTOCOMMIT=1;
+DROP TABLE t1;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
+DROP TABLE IF EXISTS t2;
+CREATE TABLE t2 (
+id1 INT,
+id2 INT,
+value INT,
+PRIMARY KEY (id1),
+UNIQUE KEY (id2)
+) ENGINE=rocksdb;
+SET AUTOCOMMIT=0;
+START TRANSACTION;
+INSERT INTO t2 VALUES(1,2,0),(10,20,30);
+UPDATE t2 SET value=3 WHERE id2=2;
+DELETE FROM t2 WHERE id1=10;
+SHOW ENGINE rocksdb TRANSACTION STATUS;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+ROLLBACK;
+SET AUTOCOMMIT=1;
+DROP TABLE t2;
diff --git a/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
new file mode 100644
index 00000000000..0592b099238
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/level_repeatable_read-range_locking.result
@@ -0,0 +1,106 @@
+DROP TABLE IF EXISTS t1;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+CREATE TABLE t1 (a INT, pk INT AUTO_INCREMENT PRIMARY KEY) ENGINE=rocksdb;
+START TRANSACTION;
+SELECT a FROM t1;
+a
+connection con2;
+BEGIN;
+INSERT INTO t1 (a) VALUES(1);
+connection con1;
+SELECT a FROM t1;
+a
+connection con2;
+INSERT INTO t1 (a) VALUES (2);
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+100 FROM t1;
+SELECT a FROM t1;
+a
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+connection con1;
+SELECT a FROM t1;
+a
+INSERT INTO t1 (a) SELECT a+200 FROM t1;
+SELECT a FROM t1;
+a
+201
+202
+COMMIT;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection con2;
+SELECT a FROM t1;
+a
+1
+2
+201
+202
+connection default;
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=rocksdb;
+INSERT INTO t2 (a) VALUES (1);
+COMMIT;
+connection con1;
+BEGIN;
+SELECT a from t2;
+a
+1
+INSERT INTO t2 (a) VALUES (1), (3);
+ERROR 23000: Duplicate entry '1' for key 't2.PRIMARY'
+connection con2;
+INSERT INTO t2 (a) VALUES (2);
+COMMIT;
+connection con1;
+SELECT a from t2;
+a
+1
+COMMIT;
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t1;
+DROP TABLE t2;
+CREATE TABLE t3 (
+pk int unsigned PRIMARY KEY,
+count int unsigned DEFAULT '0'
+) ENGINE=ROCKSDB;
+connect con1,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connect con2,localhost,root,,;
+SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+connection con1;
+BEGIN;
+SELECT * FROM t3;
+pk count
+connection con2;
+BEGIN;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+connection con1;
+INSERT INTO t3 (pk) VALUES(1) ON DUPLICATE KEY UPDATE count=count+1;
+COMMIT;
+SELECT count FROM t3;
+count
+1
+connection default;
+disconnect con1;
+disconnect con2;
+DROP TABLE t3;
diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result
new file mode 100644
index 00000000000..4a2f99f86fc
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking.result
@@ -0,0 +1,596 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'default',
+unique key(a) comment 'default'
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000019-${indexnr}80000028:1 X
+rollback;
+begin;
+# The following will show a range lock on 2-9 and also a point lock on 10.
+# This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002-${indexnr}80000009:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'default'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2
+3 2223
+4 2223
+5 2223
+6 6
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Another no-snapshot-checking test, this time for single-statement
+# transaction
+#
+create table t1 (
+pk int,
+a int,
+name varchar(16),
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+connect con1,localhost,root,,;
+connection con1;
+select get_lock('row1', 100);
+get_lock('row1', 100)
+1
+connection default;
+# The following will read the first row (1,1,'row1'), and stop.
+update t1 set a=a+100 where get_lock(name, 1000)=1;
+connection con1;
+update t1 set a=5 where pk=2;
+select release_lock('row1');
+release_lock('row1')
+1
+connection default;
+# Look at the row with pk=2:
+# 2, 105, row2 - means the UPDATE was reading current data (Correct)
+# 2, 102, row - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+pk a name
+1 101 row1
+2 105 row2
+# Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+release_lock(name)
+1
+1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'default'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, will wait until we see con1 have state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'default'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000005-${indexnr}8000000480000008:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
+#
+# Range locking and READ-COMMITTED isolation level
+#
+connect con1,localhost,root,,;
+connection con1;
+set session transaction isolation level read committed;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+begin;
+select * from t1 where pk between 2 and 5 for update;
+pk a
+2 NULL
+3 NULL
+4 NULL
+5 NULL
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+begin;
+update t1 set a=a+1 where pk between 2 and 5;
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+drop table t1;
+disconnect con1;
+connection default;
+#
+# Range Locking and READ-COMMITTED, another test
+#
+create table t1 (
+pk int,
+a int,
+b int,
+primary key (pk),
+key(a)
+) engine=rocksdb;
+insert into t1 values
+(1, 100, 1000),
+(2, 200, 2000),
+(3, 300, 3000);
+set transaction isolation level repeatable read;
+begin;
+update t1 set b = b + 1 where a > 200;
+connect con1,localhost,root,,;
+connection con1;
+set transaction isolation level read committed;
+begin;
+insert into t1 values (4, 150, 1500);
+insert into t1 values (5, 250, 1500);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a
+rollback;
+disconnect con1;
+connection default;
+rollback;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_conc_test.result b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result
new file mode 100644
index 00000000000..a70152da808
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_conc_test.result
@@ -0,0 +1,4 @@
+set @save_rlwt=@@rocksdb_lock_wait_timeout;
+# Run range_locking_conc_test.py
+set global rocksdb_lock_wait_timeout= @save_rlwt;
+DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
new file mode 100644
index 00000000000..00fd1788dfd
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_deadlock_tracking.result
@@ -0,0 +1,453 @@
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+# Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+create table t (i int primary key) engine=rocksdb;
+insert into t values (1), (2), (3);
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #1
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #2
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 10;
+Deadlock #3
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+select * from t where i=2 for update;
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+rollback;
+i
+2
+rollback;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 1;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set rocksdb_deadlock_detect_depth = 2;
+# Range locking code will report deadlocks, because it doesn't honor
+# rocksdb_deadlock_detect_depth:
+Deadlock #4
+begin;
+select * from t where i=1 for update;
+i
+1
+begin;
+select * from t where i=2 for update;
+i
+2
+begin;
+select * from t where i=3 for update;
+i
+3
+select * from t where i=2 for update;
+select * from t where i=3 for update;
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+select * from t where i=1 for update;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+deadlocks
+true
+rollback;
+i
+3
+rollback;
+i
+2
+rollback;
+set global rocksdb_max_latest_deadlocks = 5;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: PRIMARY
+TABLE NAME: test.t
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+Deadlock #6
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+begin;
+update t1 set value=value+200 where id=3;
+update t1 set value=value+100 where id=3;
+update t1 set value=value+200 where id=1;
+ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
+select * from t1;
+id value
+1 101
+2 102
+3 103
+4 4
+5 5
+drop table t1;
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+
+*** DEADLOCK PATH
+=========================================
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+---------------WAITING FOR---------------
+TSTAMP
+TXN_ID
+COLUMN FAMILY NAME: default
+KEY
+LOCK TYPE: EXCLUSIVE
+INDEX NAME: NOT FOUND; IDX_ID
+TABLE NAME: NOT FOUND; IDX_ID
+
+--------TXN_ID GOT DEADLOCK---------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
+set global rocksdb_max_latest_deadlocks = 0;
+# Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+show engine rocksdb transaction status;
+Type Name Status
+rocksdb
+============================================================
+TIMESTAMP ROCKSDB TRANSACTION MONITOR OUTPUT
+============================================================
+---------
+SNAPSHOTS
+---------
+LIST OF SNAPSHOTS FOR EACH SESSION:
+----------LATEST DETECTED DEADLOCKS----------
+-----------------------------------------
+END OF ROCKSDB TRANSACTION MONITOR OUTPUT
+=========================================
+
diff --git a/mysql-test/suite/rocksdb/r/range_locking_escalation.result b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
new file mode 100644
index 00000000000..dd19d728ef2
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_escalation.result
@@ -0,0 +1,27 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+show variables like 'rocksdb_max_lock_memory';
+Variable_name Value
+rocksdb_max_lock_memory 1024
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 0
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select
+A.a + B.a*10 + C.a*100 + D.a*1000,
+12345
+from t0 A, t0 B, t0 C, t0 D;
+select count(*) from t1;
+count(*)
+10000
+show status like 'rocksdb_locktree_escalation_count';
+Variable_name Value
+rocksdb_locktree_escalation_count 127
+drop table t0,t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_partial_index.result b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result
new file mode 100644
index 00000000000..2e22d8b3206
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_partial_index.result
@@ -0,0 +1,203 @@
+create table t0(a int primary key) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int not null,
+b int,
+primary key (pk1, pk2),
+key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5'
+) engine=rocksdb;
+insert into t1 select
+1,
+A.a,
+100 + A.a,
+123456
+from t0 A;
+select * from t1 force index (key1) where pk1=1;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+insert into t1 select
+2,
+A.a,
+100 + A.a,
+123456
+from t0 A limit 3;
+insert into t1 select
+10000 + A.a +10 *B.a +100*C.a,
+A.a,
+100 + A.a,
+123456
+from t0 A, t0 B, t0 C;
+create table t3(pk int primary key);
+connect con2,localhost,root,,;
+connection con2;
+begin;
+insert into t3 values(3333333);
+connection default;
+#
+# First, test a query with range lock
+#
+explain
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range key1 key1 4 NULL # 100.00 Using index condition
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` FORCE INDEX (`key1`) where ((`test`.`t1`.`pk1` >= 1) and (`test`.`t1`.`pk1` <= 10))
+connect con1,localhost,root,,;
+connection con1;
+begin;
+# Allocate a snapshot
+select * from t0 where a=3;
+a
+3
+connection default;
+# Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+connection con1;
+# This doesn't see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+# This DOES see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+1 11 99999 99999
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+2 11 99999 99999
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}8000000a:1 X
+$cf_id $trx_id ${indexnr}8000000180000000 X
+$cf_id $trx_id ${indexnr}8000000180000001 X
+$cf_id $trx_id ${indexnr}8000000180000002 X
+$cf_id $trx_id ${indexnr}8000000180000003 X
+$cf_id $trx_id ${indexnr}8000000180000004 X
+$cf_id $trx_id ${indexnr}8000000180000005 X
+$cf_id $trx_id ${indexnr}8000000180000006 X
+$cf_id $trx_id ${indexnr}8000000180000007 X
+$cf_id $trx_id ${indexnr}8000000180000008 X
+$cf_id $trx_id ${indexnr}8000000180000009 X
+$cf_id $trx_id ${indexnr}800000018000000b X
+$cf_id $trx_id ${indexnr}8000000280000000 X
+$cf_id $trx_id ${indexnr}8000000280000001 X
+$cf_id $trx_id ${indexnr}8000000280000002 X
+$cf_id $trx_id ${indexnr}800000028000000b X
+rollback;
+#
+# Now, test a query with LockingIterator
+#
+delete from t1 where b=99999;
+begin;
+# Allocate a snapshot
+select * from t0 where a=3;
+a
+3
+connection default;
+# Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+connection con1;
+# This doesn't see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+10000 0 100 123456
+10001 1 101 123456
+# This DOES see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update;
+pk1 pk2 a b
+1 0 100 123456
+1 1 101 123456
+1 2 102 123456
+1 3 103 123456
+1 4 104 123456
+1 5 105 123456
+1 6 106 123456
+1 7 107 123456
+1 8 108 123456
+1 9 109 123456
+1 11 99999 99999
+2 0 100 123456
+2 1 101 123456
+2 2 102 123456
+2 11 99999 99999
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf2_id $trx_id ${indexnr2}80000001-${indexnr2}800000018000006480000000 X
+$cf2_id $trx_id ${indexnr2}800000018000006480000000-${indexnr2}800000018000006580000001 X
+$cf2_id $trx_id ${indexnr2}800000018000006580000001-${indexnr2}800000018000006680000002 X
+$cf2_id $trx_id ${indexnr2}800000018000006680000002-${indexnr2}800000018000006780000003 X
+$cf2_id $trx_id ${indexnr2}800000018000006780000003-${indexnr2}800000018000006880000004 X
+$cf2_id $trx_id ${indexnr2}800000018000006880000004-${indexnr2}800000018000006980000005 X
+$cf2_id $trx_id ${indexnr2}800000018000006980000005-${indexnr2}800000018000006a80000006 X
+$cf2_id $trx_id ${indexnr2}800000018000006a80000006-${indexnr2}800000018000006b80000007 X
+$cf2_id $trx_id ${indexnr2}800000018000006b80000007-${indexnr2}800000018000006c80000008 X
+$cf2_id $trx_id ${indexnr2}800000018000006c80000008-${indexnr2}800000018000006d80000009 X
+$cf2_id $trx_id ${indexnr2}800000018000006d80000009-${indexnr2}800000018001869f8000000b X
+$cf2_id $trx_id ${indexnr2}800000018001869f8000000b-${indexnr2+1} X
+$cf2_id $trx_id ${indexnr2}80000002-${indexnr2}80000002:1 X
+$cf2_id $trx_id ${indexnr2}80000002-${indexnr2}80000003 X
+$cf_id $trx_id ${indexnr}8000000180000000 X
+$cf_id $trx_id ${indexnr}8000000180000001 X
+$cf_id $trx_id ${indexnr}8000000180000002 X
+$cf_id $trx_id ${indexnr}8000000180000003 X
+$cf_id $trx_id ${indexnr}8000000180000004 X
+$cf_id $trx_id ${indexnr}8000000180000005 X
+$cf_id $trx_id ${indexnr}8000000180000006 X
+$cf_id $trx_id ${indexnr}8000000180000007 X
+$cf_id $trx_id ${indexnr}8000000180000008 X
+$cf_id $trx_id ${indexnr}8000000180000009 X
+$cf_id $trx_id ${indexnr}800000018000000b X
+$cf_id $trx_id ${indexnr}80000002-${indexnr}80000003 X
+rollback;
+disconnect con1;
+connection default;
+disconnect con2;
+drop table t0, t1,t3;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
new file mode 100644
index 00000000000..1067087e816
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_refresh_iter.result
@@ -0,0 +1,50 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+set debug_sync='RESET';
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+set debug_sync='now WAIT_FOR con1_stopped';
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+set debug_sync='now SIGNAL con1_cont';
+select * from t1 where pk<100;
+pk a
+0 100
+1 101
+2 102
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+40 5100
+41 5100
+43 5100
+44 5100
+45 5100
+46 146
+47 147
+48 148
+49 149
+commit;
+set debug_sync='RESET';
+drop table t1, ten, one_k;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
new file mode 100644
index 00000000000..e39c6bb3339
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_rev_cf.result
@@ -0,0 +1,556 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+insert into t1 values (15,15);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+pk a
+10 10
+20 20
+connection con2;
+begin;
+select * from t1 where pk between 15 and 35 for update;
+pk a
+20 20
+30 30
+rollback;
+connection con1;
+rollback;
+## Test that locks are not released when a statement inside
+## a transaction is rolled back
+create table t2 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1',
+unique key(a) comment ''
+) engine=rocksdb;
+insert into t2 values (1,1),(2,2);
+begin;
+insert into t2 values (3,3);
+insert into t2 values (10,2);
+ERROR 23000: Duplicate entry '2' for key 't2.a'
+connection con2;
+begin;
+select * from t2 where pk=3 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t2.PRIMARY
+rollback;
+connection con1;
+rollback;
+drop table t2;
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+#
+# Test INFORMATION_SCHEMA.lock_info in range-locking mode
+#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values
+(10,10),(20,20),(30,30);
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+delete from t1 where pk between 25 and 40;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000a X
+$cf_id $trx_id ${indexnr}80000028-${indexnr}80000019:1 X
+rollback;
+begin;
+# The following will show a range lock on 2-9 and also a point lock on 10.
+# This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+pk a
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000009-${indexnr}80000002:1 X
+rollback;
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+#
+# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+primary key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 0 1234
+2 1 1234
+2 2 1234
+2 3 1234
+2 4 1234
+2 5 1234
+2 6 1234
+2 7 1234
+2 8 1234
+2 9 1234
+connection default;
+# The lock on kp1=2 should inhibit the following INSERT:
+insert into t1 values ( 2,5,9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+#
+# Test that locks on ranges on non-unique secondary keys inhibit
+# modifications of the contents of these ranges
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+kp1 int not null,
+kp2 int not null,
+a int,
+key(kp1, kp2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+explain
+select * from t1 where kp1=2 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref kp1 kp1 4 const # 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`kp1` AS `kp1`,`test`.`t1`.`kp2` AS `kp2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`kp1` = 2)
+select * from t1 where kp1=2 for update;
+kp1 kp2 a
+2 3 1234
+2 5 1234
+2 7 1234
+connection default;
+begin;
+insert into t1 values (2, 9, 9999);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+delete from t1 where kp1=2 and kp2=5;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=333 where kp1=2 and kp2=3;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+update t1 set kp1=2 where kp1=1 and kp2=8;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.kp1
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# Transaction isolation test
+#
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+1 1
+2 2
+3 3
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+# Examine the result:
+# pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+# pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+# (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+pk a
+1 1
+2 2223
+3 3
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same test as above, but check the range scan
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+# TRX1: Now, make a change that would overwrite TRX2'x change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+# Examine the result:
+# pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+pk a
+6 6
+5 2223
+4 2223
+3 2223
+2 2
+1 1
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Same as above, but test SELECT FOR UPDATE.
+#
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+connect con1,localhost,root,,;
+# TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+pk a
+6 6
+5 5
+4 4
+3 3
+2 2
+1 1
+# TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+# TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+pk a
+2 2
+3 3
+select * from t1 where pk=2 for update;
+pk a
+2 222
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 222
+select * from t1 where pk=2;
+pk a
+2 2
+commit;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Check that I_S.processlist.state is set correctly now.
+#
+create table t1(
+pk int,
+a int,
+primary key(pk) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+begin;
+select * from t1 where pk=2 for update;
+pk a
+2 2
+connect con1,localhost,root,,;
+begin;
+set rocksdb_lock_wait_timeout=300;
+select * from t1 where pk=2 for update;
+connection default;
+# Now, will wait until we see con1 have state="Waiting for row lock"
+rollback;
+connection con1;
+pk a
+2 2
+rollback;
+disconnect con1;
+connection default;
+drop table t1;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST
+#
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk1 int,
+pk2 int,
+a int,
+primary key(pk1, pk2) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1
+select
+A.a, B.a, A.a*10+B.a
+from
+t0 A, t0 B;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+connection default;
+begin;
+# Should use ref access w/o filesort:
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL ref PRIMARY PRIMARY 4 const # 100.00 Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk1` = 3) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+3 9 39
+3 8 38
+3 7 37
+3 6 36
+3 5 35
+3 4 34
+3 3 33
+3 2 32
+3 1 31
+3 0 30
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000003-${indexnr}80000003:1 X
+rollback;
+#
+# Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+#
+begin;
+# Should use range access with 2 keyparts and w/o filesort:
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 8 NULL # 100.00 Using where; Backward index scan
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk1` AS `pk1`,`test`.`t1`.`pk2` AS `pk2`,`test`.`t1`.`a` AS `a` from `test`.`t1` where ((`test`.`t1`.`pk1` = 4) and (`test`.`t1`.`pk2` between 5 and 8)) order by `test`.`t1`.`pk1` desc,`test`.`t1`.`pk2` desc
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+pk1 pk2 a
+4 8 48
+4 7 47
+4 6 46
+4 5 45
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000480000008-${indexnr}8000000480000005:1 X
+rollback;
+connection con1;
+rollback;
+connection default;
+drop table t0, t1;
+#
+# A bug: range locking was not used when scan started at table start or end
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1 select a*2,a*2 from t10;
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+pk a
+0 0
+2 2
+4 4
+6 6
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}8000000a X
+rollback;
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+pk a
+1998 1998
+1996 1996
+1994 1994
+1992 1992
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800007c6-${indexnr+1} X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t10,t1;
+#
+# Range locking and READ-COMMITTED isolation level
+#
+connect con1,localhost,root,,;
+connection con1;
+set session transaction isolation level read committed;
+create table t1 (
+pk int not null,
+a int,
+primary key(pk)
+) engine=rocksdb;
+insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+begin;
+select * from t1 where pk between 2 and 5 for update;
+pk a
+2 NULL
+3 NULL
+4 NULL
+5 NULL
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+begin;
+update t1 set a=a+1 where pk between 2 and 5;
+# Below should show individual row locks, not locked range:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}80000003 X
+$cf_id $trx_id ${indexnr}80000004 X
+$cf_id $trx_id ${indexnr}80000005 X
+$cf_id $trx_id ${indexnr}80000006 X
+rollback;
+drop table t1;
+disconnect con1;
+connection default;
+#
+# Range Locking and READ-COMMITTED, another test
+#
+create table t1 (
+pk int,
+a int,
+b int,
+primary key (pk),
+key(a)
+) engine=rocksdb;
+insert into t1 values
+(1, 100, 1000),
+(2, 200, 2000),
+(3, 300, 3000);
+set transaction isolation level repeatable read;
+begin;
+update t1 set b = b + 1 where a > 200;
+connect con1,localhost,root,,;
+connection con1;
+set transaction isolation level read committed;
+begin;
+insert into t1 values (4, 150, 1500);
+insert into t1 values (5, 250, 1500);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.a
+rollback;
+disconnect con1;
+connection default;
+rollback;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
new file mode 100644
index 00000000000..bf1cb916a5b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update.result
@@ -0,0 +1,291 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+# Make another connection to get the lock tree out of the STO-mode
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+pk a
+10 10
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+# Now, we will just see locks on 10=0xA and 11=0xB:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+#
+# SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+#
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 500) order by `test`.`t1`.`pk` limit 3
+select * from t1 where pk>=500 order by pk limit 3 for update;
+pk a
+500 500
+501 501
+502 502
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}8000000b X
+$cf_id $trx_id ${indexnr}800001f4-${indexnr}800001f6 X
+rollback;
+begin;
+select * from t1 where pk=11 for update;
+pk a
+11 11
+explain
+select * from t1 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL 3 100.00 NULL
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` order by `test`.`t1`.`pk` limit 3
+select * from t1 order by pk limit 3 for update;
+pk a
+0 0
+1 1
+2 2
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}-${indexnr}80000002 X
+$cf_id $trx_id ${indexnr}8000000b X
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# Concurrent tests: let one thread do SeekForUpdate and the other
+# interfere by committing modifications
+#
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk)
+) engine=rocksdb;
+insert into t1 select
+A.a + B.a*10 + C.a*100,
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+select * from t1 where pk<10;
+pk a
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+pk a
+# Test what happens when another transaction commits a row
+# right before the range we are about to lock (nothing)
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 3
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 3 for update;
+connect con1,localhost,root,,;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+11 11
+12 12
+rollback;
+delete from t1 where pk=3;
+#
+# Now, repeat the test but let the other transaction insert the row into
+# the range we are locking
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range PRIMARY PRIMARY 4 NULL # 100.00 Using where
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`pk` >= 5) order by `test`.`t1`.`pk` limit 1
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+8 8
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}80000005-${indexnr}8000000a X
+rollback;
+delete from t1 where pk=8;
+#
+# Repeat the third time, this time deleting the row that SeekForUpdate saw
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+pk a
+10 10
+rollback;
+#
+# Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+# error. MyRocks code should now be prepared that data reads cause this
+# error
+#
+insert into t1 values (7,7);
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+connection default;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+connection default;
+#
+# Test the thd_killed check in the iterator
+#
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed';
+select * from t1 where pk >=5 order by pk limit 1 for update;
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+kill query CONN_ID;
+connection default;
+ERROR HY000: Got error 10 'Operation aborted: ' from ROCKSDB
+rollback;
+#
+# Backward scan test
+#
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+pk a
+500 500
+connection default;
+insert into t1 values
+(1001, 1001),
+(1005, 1005),
+(1007, 1007),
+(1010, 1010);
+begin;
+select * from t1 order by pk desc limit 2 for update;
+pk a
+1010 1010
+1007 1007
+# The below will lock from pk=1007 (0x3ef) till the end of the table:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003ef-${indexnr+1} X
+rollback;
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+pk a
+1005 1005
+1001 1001
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $trx_id ${indexnr}800003e9-${indexnr}800003ef X
+connection con1;
+rollback;
+connection default;
+rollback;
+#
+# Backward scan test 2: error condition
+#
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+pk a
+1010 1010
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+pk a
+1007 1007
+connection default;
+begin;
+select * from t1 order by pk desc limit 2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+#
+# A test: full table scan doesn't lock gaps
+#
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 values (10,10),(20,20),(30,30);
+connect con1,localhost,root,,;
+connect con2,localhost,root,,;
+connection con1;
+begin;
+select * from t1 for update;
+pk a
+10 10
+20 20
+30 30
+connection con2;
+insert into t1 values (5,5);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result
new file mode 100644
index 00000000000..2afb2eea589
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2.result
@@ -0,0 +1,142 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rlsfu_test'
+) engine=rocksdb;
+insert into t1 (pk)
+select
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+delete from t1 where pk<100;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 5 for update;
+connection default;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 (pk) values
+(10),(20),(30),(40),(50);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection con1;
+pk a
+10 NULL
+20 NULL
+30 NULL
+40 NULL
+50 NULL
+# This must return 1, not 5:
+select lock_count from information_schema.rocksdb_trx
+where thread_id=CONNECTION_ID();
+lock_count
+1
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=off;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=on;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result
new file mode 100644
index 00000000000..fef9e0ef6a0
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_seek_for_update2_rev_cf.result
@@ -0,0 +1,142 @@
+show variables like 'rocksdb_use_range_locking';
+Variable_name Value
+rocksdb_use_range_locking ON
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int,
+a int,
+primary key (pk) comment 'rev:rlsfu_test'
+) engine=rocksdb;
+insert into t1 (pk)
+select
+A.a + B.a*10 + C.a*100
+from
+t0 A, t0 B, t0 C;
+delete from t1 where pk<100;
+connect con1,localhost,root,,;
+connection con1;
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+select * from t1 where pk >=5 order by pk limit 5 for update;
+connection default;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 (pk) values
+(10),(20),(30),(40),(50);
+set debug_sync='now SIGNAL spoiler_inserted';
+connection con1;
+pk a
+10 NULL
+20 NULL
+30 NULL
+40 NULL
+50 NULL
+# This must return 1, not 5:
+select lock_count from information_schema.rocksdb_trx
+where thread_id=CONNECTION_ID();
+lock_count
+1
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rev:rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=off;
+#
+# A testcase for locking at the end of the scan
+#
+create table t1 (
+pk int,
+primary key (pk) comment 'rev:rlsfu_test'
+) engine=rocksdb;
+connect con1,localhost,root,,;
+connection con1;
+insert into t1 values (1), (10), (100);
+begin;
+select * from t1 for update;
+pk
+1
+10
+100
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (150);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+connection con1;
+rollback;
+begin;
+explain
+select * from t1 order by pk desc for update;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL index NULL PRIMARY 4 NULL # 100.00 Backward index scan; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk` from `test`.`t1` order by `test`.`t1`.`pk` desc
+select * from t1 order by pk desc for update;
+pk
+100
+10
+1
+connection default;
+select * from t1;
+pk
+1
+10
+100
+insert into t1 values (0);
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+disconnect con1;
+connection default;
+drop table t1;
+set global rocksdb_enable_iterate_bounds=on;
diff --git a/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
new file mode 100644
index 00000000000..580108de6f6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/range_locking_shared_locks.result
@@ -0,0 +1,251 @@
+select @@rocksdb_use_range_locking;
+@@rocksdb_use_range_locking
+1
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1 select a,a from t0;
+# A basic test for shared locks
+begin;
+select * from t1 where pk=3 for update;
+pk a
+3 3
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+connect con1,localhost,root,,;
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+pk a
+5 5
+# Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX2_ID ${indexnr}80000005 S
+rollback;
+# Now, TRX2_ID should be gone:
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+connection default;
+# Get a read lock on pk=3 (where we have a write lock).
+# The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+pk a
+3 3
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 S
+# Get a write lock on pk=5 (where we have a read lock).
+# The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+pk a
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr}80000003 X
+$cf_id $TRX1_ID ${indexnr}80000005 X
+connection default;
+rollback;
+#
+# Test if a read lock inhibits write locks
+#
+begin;
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+select * from t1 where pk=8 for update;
+pk a
+8 8
+connection con1;
+begin;
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+select * from t1 where pk between 0 and 4 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+delete from t1 where pk=2;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+# Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+pk a
+2 2
+# But this should still prevent us from acquiring a write lock on that value:
+select * from t1 where pk=2 for update;
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+rollback;
+connection default;
+rollback;
+drop table t1;
+create table t1 (
+pk int not null primary key,
+a int not null,
+key(a)
+) engine=rocksdb;
+insert into t1
+select
+A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+pk a
+900 900
+connection default;
+begin;
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+id select_type table partitions type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 NULL range a a 4 NULL # 100.00 Using where; Using index
+Warnings:
+Note 1003 /* select#1 */ select `test`.`t1`.`pk` AS `pk`,`test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` between 2 and 5)
+select * from t1 where a between 2 and 5 lock in share mode;
+pk a
+2 2
+3 3
+4 4
+5 5
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX1_ID ${indexnr+1}80000002-${indexnr+1}80000005:1 X
+$cf_id $TRX1_ID ${indexnr}80000002 S
+$cf_id $TRX1_ID ${indexnr}80000003 S
+$cf_id $TRX1_ID ${indexnr}80000004 S
+$cf_id $TRX1_ID ${indexnr}80000005 S
+$cf_id $TRX1_ID ${indexnr}80000006 S
+$cf_id $TRX2_ID ${indexnr}80000384 X
+rollback;
+disconnect con1;
+drop table t0,t1;
+#
+# Test shared point locks and lock escalation
+#
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1 (
+pk int primary key,
+a int
+) engine=rocksdb;
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 0
+connect con1,localhost,root,,;
+connection con1;
+begin;
+# CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+select * from t1 where pk=2500 lock in share mode;
+pk a
+connection default;
+begin;
+# DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+pk a
+1001 12345
+select * from t1 where pk=1100 lock in share mode;
+pk a
+1100 12345
+select * from t1 where pk=1200 lock in share mode;
+pk a
+1200 12345
+# DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+pk a
+# DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+connection con1;
+# CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000 X
+$cf_id $TRX2_ID ${indexnr}80000001 X
+$cf_id $TRX2_ID ${indexnr}80000002 X
+$cf_id $TRX2_ID ${indexnr}80000003 X
+$cf_id $TRX2_ID ${indexnr}80000004 X
+$cf_id $TRX2_ID ${indexnr}80000005 X
+$cf_id $TRX2_ID ${indexnr}80000006 X
+$cf_id $TRX2_ID ${indexnr}80000007 X
+$cf_id $TRX2_ID ${indexnr}80000008 X
+$cf_id $TRX2_ID ${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0 X
+$cf_id $TRX1_ID ${indexnr}800007d1 X
+$cf_id $TRX1_ID ${indexnr}800007d2 X
+$cf_id $TRX1_ID ${indexnr}800007d3 X
+$cf_id $TRX1_ID ${indexnr}800007d4 X
+$cf_id $TRX1_ID ${indexnr}800007d5 X
+$cf_id $TRX1_ID ${indexnr}800007d6 X
+$cf_id $TRX1_ID ${indexnr}800007d7 X
+$cf_id $TRX1_ID ${indexnr}800007d8 X
+$cf_id $TRX1_ID ${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+Variable_name Value
+rocksdb_locktree_current_lock_memory 8792
+set @save_mlm= @@rocksdb_max_lock_memory;
+# Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+variable_value
+from
+performance_schema.global_status
+where
+variable_name='rocksdb_locktree_current_lock_memory');
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+# select * from information_schema.rocksdb_locks; # With replacements by select_from_is_rowlocks.inc
+COLUMN_FAMILY_ID TRANSACTION_ID KEY mode
+$cf_id $TRX2_ID ${indexnr}80000000-${indexnr}80000009 X
+$cf_id $TRX1_ID ${indexnr}800003e9 S
+$cf_id $TRX2_ID ${indexnr}800003e9 S
+$cf_id $TRX1_ID ${indexnr}8000044c S
+$cf_id $TRX2_ID ${indexnr}8000044c S
+$cf_id $TRX1_ID ${indexnr}800004b0 S
+$cf_id $TRX2_ID ${indexnr}800004b0 S
+$cf_id $TRX1_ID ${indexnr}800007d0-${indexnr}800007d9 X
+$cf_id $TRX1_ID ${indexnr}800009c4 S
+$cf_id $TRX2_ID ${indexnr}800009ce S
+$cf_id $TRX1_ID ${indexnr}80000bb8 X
+$cf_id $TRX1_ID ${indexnr}80000bb9 X
+$cf_id $TRX1_ID ${indexnr}80000bba X
+$cf_id $TRX1_ID ${indexnr}80000bbb X
+$cf_id $TRX1_ID ${indexnr}80000bbc X
+$cf_id $TRX1_ID ${indexnr}80000bbd X
+$cf_id $TRX1_ID ${indexnr}80000bbe X
+$cf_id $TRX1_ID ${indexnr}80000bbf X
+$cf_id $TRX1_ID ${indexnr}80000bc0 X
+$cf_id $TRX1_ID ${indexnr}80000bc1 X
+connection con1;
+rollback;
+connection default;
+rollback;
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+drop table t0, t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index c6397708446..6c1e9a0d938 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -987,6 +987,7 @@ rocksdb_max_background_jobs 2
rocksdb_max_bottom_pri_background_compactions 0
rocksdb_max_compaction_history 64
rocksdb_max_latest_deadlocks 5
+rocksdb_max_lock_memory 1073741824
rocksdb_max_log_file_size 0
rocksdb_max_manifest_file_size 1073741824
rocksdb_max_manual_compactions 10
@@ -1056,6 +1057,8 @@ rocksdb_use_default_sk_cf OFF
rocksdb_use_direct_io_for_flush_and_compaction OFF
rocksdb_use_direct_reads OFF
rocksdb_use_fsync OFF
+rocksdb_use_range_lock_manager_as_point OFF
+rocksdb_use_range_locking OFF
rocksdb_validate_tables 1
rocksdb_verify_row_debug_checksums OFF
rocksdb_wal_bytes_per_sync 0
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
index 58329d03ebc..71e6ff4d30b 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_read_free_rpl.result
@@ -72,7 +72,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
include/sync_slave_sql_with_master.inc
[connection slave]
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
read_free
false
select * from t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
index 1e253a9974b..08a0a2f5942 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb_timeout_rollback.result
@@ -36,6 +36,9 @@ rocksdb_rollback_on_timeout OFF
begin work;
insert into t1 values (9);
insert into t1 values (10);
+# Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
+a
update t1 set a = a + 1 where a = 2;
begin work;
insert into t1 values (11);
diff --git a/mysql-test/suite/rocksdb/r/select_count_for_update.result b/mysql-test/suite/rocksdb/r/select_count_for_update.result
index 1107aa2f6cb..6672d43eb43 100644
--- a/mysql-test/suite/rocksdb/r/select_count_for_update.result
+++ b/mysql-test/suite/rocksdb/r/select_count_for_update.result
@@ -35,9 +35,9 @@ SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
COUNT(*)
3
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE;
-ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE;
-ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.PRIMARY
+ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: $FAILING_INDEX
connection con1;
COMMIT;
SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
diff --git a/mysql-test/suite/rocksdb/r/trx_info.result b/mysql-test/suite/rocksdb/r/trx_info.result
index ada2e127021..562c1d7022e 100644
--- a/mysql-test/suite/rocksdb/r/trx_info.result
+++ b/mysql-test/suite/rocksdb/r/trx_info.result
@@ -9,5 +9,10 @@ a
2
select * from information_schema.rocksdb_trx;
TRANSACTION_ID STATE NAME WRITE_COUNT LOCK_COUNT TIMEOUT_SEC WAITING_KEY WAITING_COLUMN_FAMILY_ID IS_REPLICATION SKIP_TRX_API READ_ONLY HAS_DEADLOCK_DETECTION NUM_ONGOING_BULKLOAD THREAD_ID QUERY
-_TRX_ID_ STARTED _NAME_ 0 2 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx
+_TRX_ID_ STARTED _NAME_ 0 2_or_3 1 _KEY_ 0 0 0 0 0 0 _THREAD_ID_ select * from information_schema.rocksdb_trx
+select
+if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT
+from information_schema.rocksdb_trx;
+LOCK_COUNT_IS_CORRECT
+1
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/r/unique_sec.result b/mysql-test/suite/rocksdb/r/unique_sec.result
index 1da78db24b1..d4ef2e0ff2e 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
index d6d06f6ece5..0e71e6481aa 100644
--- a/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
+++ b/mysql-test/suite/rocksdb/r/unique_sec_rev_cf.result
@@ -114,6 +114,10 @@ ERROR 23000: Duplicate entry '37' for key 't1.id5'
UPDATE t1 SET id5=34 WHERE id1=38;
ERROR HY000: Lock wait timeout exceeded; try restarting transaction: Timeout on index: test.t1.id5
# NULL values are unique
+# (Note: the following UPDATE reads through the whole table without
+# finding anything to update. With point locking, this is fine,
+# but with range locking it will time out while waiting on a row lock
+# that the other transaction is holding)
UPDATE t1 SET id5=NULL WHERE value1 > 37;
COMMIT;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/deadlock_tracking.test b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
index 42e46bb0f28..55e6502c079 100644
--- a/mysql-test/suite/rocksdb/t/deadlock_tracking.test
+++ b/mysql-test/suite/rocksdb/t/deadlock_tracking.test
@@ -1,3 +1,9 @@
+# Deadlock #5 uses SELECT ... LOCK IN SHARE MODE;
+# SHOW ENGINE ROCKSDB TRANSACTION status prints information about deadlocks.
+# A part of this test that works with range locking is in
+# range_locking_deadlock_tracking.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
@@ -137,7 +143,6 @@ rollback;
connection default;
--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
show engine rocksdb transaction status;
-
echo Deadlock #6;
connection con1;
create table t1 (id int primary key, value int) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
index f7eb8151f40..05ae30f2ddd 100644
--- a/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
+++ b/mysql-test/suite/rocksdb/t/drop_cf_before_show_deadlock_info.test
@@ -3,6 +3,10 @@
--source include/have_rocksdb.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because range locking
+# does not provide info in rocksdb_deadlock.
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_query_log
call mtr.add_suppression("Column family '[a-z_]+' not found");
--enable_query_log
diff --git a/mysql-test/suite/rocksdb/t/hermitage-range_locking.test b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
new file mode 100644
index 00000000000..55203af9cf8
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/hermitage-range_locking.test
@@ -0,0 +1,15 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+
+# Hermitage is an attempt to test transaction isolation levels.
+# https://github.com/ept/hermitage
+
+let $trx_isolation = READ COMMITTED;
+--source hermitage.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source hermitage.inc
diff --git a/mysql-test/suite/rocksdb/t/hermitage.inc b/mysql-test/suite/rocksdb/t/hermitage.inc
index 90f7d482533..83815a70459 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.inc
+++ b/mysql-test/suite/rocksdb/t/hermitage.inc
@@ -108,6 +108,8 @@ select * from test where value % 3 = 0;
commit;
--source hermitage_init.inc
+let $RC_OR_RANGE_LOCKING=`select @@tx_isolation='READ-COMMITTED' OR @@rocksdb_use_range_locking=1`;
+let $RR_AND_NOT_RANGE_LOCKING=`select @@tx_isolation='REPEATABLE-READ' AND @@rocksdb_use_range_locking=0`;
connection con1;
update test set value = value + 10;
connection con2;
@@ -117,13 +119,13 @@ send delete from test where value = 20;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 2 => 30
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -147,13 +149,13 @@ send update test set value = 12 where id = 1;
connection con1;
commit;
connection con2;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
reap;
# RC: Returns 1 => 12
select * from test;
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
reap;
@@ -200,12 +202,12 @@ update test set value = 12 where id = 1;
update test set value = 18 where id = 2;
commit;
connection con1;
-if ($trx_isolation == "READ COMMITTED")
+if ($RC_OR_RANGE_LOCKING)
{
delete from test where value = 20; # doesn't delete anything
select * from test where id = 2; # shows 2 => 18
}
-if ($trx_isolation == "REPEATABLE READ")
+if ($RR_AND_NOT_RANGE_LOCKING)
{
--error ER_LOCK_DEADLOCK
delete from test where value = 20;
diff --git a/mysql-test/suite/rocksdb/t/hermitage.test b/mysql-test/suite/rocksdb/t/hermitage.test
index e4138e8d89f..51f3f286a0e 100644
--- a/mysql-test/suite/rocksdb/t/hermitage.test
+++ b/mysql-test/suite/rocksdb/t/hermitage.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See hermitage-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
# Hermitage is an attempt to test transaction isolation levels.
# https://github.com/ept/hermitage
diff --git a/mysql-test/suite/rocksdb/t/i_s_deadlock.test b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
index e0479d6a337..82fa9fc6bbd 100644
--- a/mysql-test/suite/rocksdb/t/i_s_deadlock.test
+++ b/mysql-test/suite/rocksdb/t/i_s_deadlock.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# Uses LOCK IN SHARE MODE and so will hang in range-locking mode. The part that
+# doesn't hang is in rocksdb.range_locking_i_s_deadlock.test
+--source suite/rocksdb/include/not_range_locking.inc
+
set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
diff --git a/mysql-test/suite/rocksdb/t/issue111.test b/mysql-test/suite/rocksdb/t/issue111.test
index 671ea4708d6..3657e977a70 100644
--- a/mysql-test/suite/rocksdb/t/issue111.test
+++ b/mysql-test/suite/rocksdb/t/issue111.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+# The testcase here assumes key tracking is present
+# (and range locking uses InnoDB-like approach, "DMLs use Read Committed")
+--source suite/rocksdb/include/not_range_locking.inc
+
connect (con2,localhost,root,,);
connection default;
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
new file mode 100644
index 00000000000..465fb9099da
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus-range_locking.test
@@ -0,0 +1,10 @@
+#
+# A range-locking variant of issue243_transactionStatus.test
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $forced_range_locking=1;
+--source issue243_transactionStatus.test
+
+
diff --git a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
index 1e2f0b41226..5c1948ebe81 100644
--- a/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
+++ b/mysql-test/suite/rocksdb/t/issue243_transactionStatus.test
@@ -1,5 +1,9 @@
--source include/have_rocksdb.inc
+if (!$forced_range_locking) {
+--source suite/rocksdb/include/not_range_locking.inc
+}
+
--disable_warnings
DROP TABLE IF EXISTS t1;
--enable_warnings
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
new file mode 100644
index 00000000000..6c42c7be12c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read-range_locking.test
@@ -0,0 +1,9 @@
+--source include/have_rocksdb.inc
+
+# Range locking uses InnoDB-like transaction isolation, which
+# means the results differ from "true" Repeatable Read.
+--source suite/rocksdb/include/have_range_locking.inc
+
+let $trx_isolation = REPEATABLE READ;
+--source transaction_isolation.inc
+
diff --git a/mysql-test/suite/rocksdb/t/level_repeatable_read.test b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
index cf29073f69e..b81dcf31ab1 100644
--- a/mysql-test/suite/rocksdb/t/level_repeatable_read.test
+++ b/mysql-test/suite/rocksdb/t/level_repeatable_read.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# See level_repeatable_read-range_locking variant
+--source suite/rocksdb/include/not_range_locking.inc
+
let $trx_isolation = REPEATABLE READ;
--source transaction_isolation.inc
diff --git a/mysql-test/suite/rocksdb/t/lock_info.test b/mysql-test/suite/rocksdb/t/lock_info.test
index 1b624cf38c0..a277c1b8d8d 100644
--- a/mysql-test/suite/rocksdb/t/lock_info.test
+++ b/mysql-test/suite/rocksdb/t/lock_info.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range Locking supports I_S.lock_info but its printout is different (see range_locking.test)
+--source suite/rocksdb/include/not_range_locking.inc
+
--disable_warnings
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
diff --git a/mysql-test/suite/rocksdb/t/locking_issues.test b/mysql-test/suite/rocksdb/t/locking_issues.test
index 035046ae368..95a6676f78a 100644
--- a/mysql-test/suite/rocksdb/t/locking_issues.test
+++ b/mysql-test/suite/rocksdb/t/locking_issues.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# A lot of tests below assume point locking, not range.
+--source suite/rocksdb/include/not_range_locking.inc
+
let $isolation_level = REPEATABLE READ;
--source suite/rocksdb/include/locking_issues_case1_1.inc
diff --git a/mysql-test/suite/rocksdb/t/max_row_locks.test b/mysql-test/suite/rocksdb/t/max_row_locks.test
index 4b07f3d8492..d4b2604f1e3 100644
--- a/mysql-test/suite/rocksdb/t/max_row_locks.test
+++ b/mysql-test/suite/rocksdb/t/max_row_locks.test
@@ -1,4 +1,5 @@
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
create table t1 (id1 bigint, id2 bigint, c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint, c6 bigint, c7 bigint, primary key (id1, id2), index i(c1, c2));
--disable_query_log
diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc
new file mode 100644
index 00000000000..b3ec0ae7367
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.inc
@@ -0,0 +1,612 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+
+eval create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+--echo ### Test: check that range lock inhibits a point lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (15,15);
+
+connection con1;
+rollback;
+
+--echo ## Test: check that range lock inhibits another range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25 for update;
+
+connection con2;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test: check that regular read does not get a range lock
+connection con1;
+begin;
+select * from t1 where pk between 5 and 25;
+
+connection con2;
+begin;
+# This must not block
+select * from t1 where pk between 15 and 35 for update;
+rollback;
+
+connection con1;
+rollback;
+
+--echo ## Test that locks are not released when a statement inside
+--echo ## a transaction is rolled back
+eval
+create table t2 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf',
+ unique key(a) comment '$sk_cf'
+) engine=rocksdb;
+
+insert into t2 values (1,1),(2,2);
+
+begin;
+insert into t2 values (3,3);
+--error ER_DUP_ENTRY
+insert into t2 values (10,2);
+
+connection con2;
+begin;
+# This must time out:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t2 where pk=3 for update;
+
+rollback;
+connection con1;
+rollback;
+drop table t2;
+
+# cleanup
+connection default;
+disconnect con1;
+disconnect con2;
+drop table t1;
+
+--echo #
+--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode
+--echo #
+
+connect (con1,localhost,root,,);
+connection con1;
+eval create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 values
+(10,10),(20,20),(30,30);
+
+begin;
+select * from t1 where pk=10 for update;
+
+#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+delete from t1 where pk between 25 and 40;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+begin;
+--echo # The following will show a range lock on 2-9 and also a point lock on 10.
+--echo # This is how things currently work. (after MDEV-21314, not anymore)
+select * from t1 where pk between 2 and 9 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
+
+--echo #
+--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ primary key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 select 2, a, 1234 from t0;
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+select * from t1 where kp1=2 for update;
+
+connection default;
+--echo # The lock on kp1=2 should inhibit the following INSERT:
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values ( 2,5,9999);
+rollback;
+
+connection con1;
+rollback;
+connection default;
+disconnect con1;
+drop table t0,t1;
+
+--echo #
+--echo # Test that locks on ranges on non-unique secondary keys inhibit
+--echo # modifications of the contents of these ranges
+--echo #
+
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+eval
+create table t1 (
+ kp1 int not null,
+ kp2 int not null,
+ a int,
+ key(kp1, kp2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1 select 1, a, 1234 from t0;
+insert into t1 values (2, 3, 1234);
+insert into t1 values (2, 5, 1234);
+insert into t1 values (2, 7, 1234);
+insert into t1 select 3, a, 1234 from t0;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where kp1=2 for update;
+select * from t1 where kp1=2 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (2, 9, 9999);
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where kp1=2 and kp2=5;
+
+# Update that "moves a row away" from the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=333 where kp1=2 and kp2=3;
+
+# Update that "moves a row into" the locked range
+--error ER_LOCK_WAIT_TIMEOUT
+update t1 set kp1=2 where kp1=1 and kp2=8;
+
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # Transaction isolation test
+--echo #
+
+create table t1 (pk int primary key, a int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk=2;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk=2;
+commit;
+
+--echo # Examine the result:
+--echo # pk=2, a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+--echo # pk=2, a=3 means UPDATE in TRX1 silently overwrote TRX2
+--echo # (and with key tracking, one would get an error on the second UPDATE)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same test as above, but check the range scan
+--echo #
+
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=2222 where pk between 3 and 5;
+
+--echo # TRX1: Now, make a change that would overwrite TRX2's change and commit
+connection con1;
+update t1 set a=a+1 where pk between 3 and 5;
+commit;
+
+--echo # Examine the result:
+--echo # pk={3,4,5} a=2223 means UPDATE in TRX1 used "read committed" (InnoDB-like isolation)
+connection default;
+select * from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Same as above, but test SELECT FOR UPDATE.
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+
+connect (con1,localhost,root,,);
+
+--echo # TRX1: Start, Allocate a snapshot
+connection con1;
+begin;
+select * from t1;
+
+--echo # TRX2: Make a change that TRX1 will not see
+connection default;
+update t1 set a=222 where pk=2;
+update t1 set a=333 where pk=3;
+
+--echo # TRX1: Check what select [FOR UPDATE] sees
+connection con1;
+select * from t1 where pk in (2,3);
+select * from t1 where pk=2 for update;
+select * from t1 where pk=2 lock in share mode;
+select * from t1 where pk=2;
+
+commit;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+if (!$PK_USES_REVERSE_CF) {
+--echo #
+--echo # Another no-snapshot-checking test, this time for single-statement
+--echo # transaction
+--echo #
+eval
+create table t1 (
+ pk int,
+ a int,
+ name varchar(16),
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1, 'row1'), (2,2,'row2');
+
+connect (con1,localhost,root,,);
+connection con1;
+select get_lock('row1', 100);
+
+connection default;
+
+--echo # The following will read the first row (1,1,'row1'), and stop.
+
+send update t1 set a=a+100 where get_lock(name, 1000)=1;
+
+# Wait till the default connection has stopped:
+connection con1;
+
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "User lock"
+ AND INFO = "update t1 set a=a+100 where get_lock(name, 1000)=1";
+--source include/wait_condition.inc
+
+# Update the second row
+update t1 set a=5 where pk=2;
+
+select release_lock('row1');
+
+connection default;
+reap;
+
+--echo # Look at the row with pk=2:
+--echo # 2, 105, row2 - means the UPDATE was reading current data (Correct)
+--echo # 2, 102, row - means the UPDATE read the snapshot (incorrect)
+select * from t1;
+
+--echo # Try releasing both locks (in 5.6, we will be holding only the second one)
+select release_lock(name) from t1;
+
+disconnect con1;
+connection default;
+drop table t1;
+}
+
+--echo #
+--echo # Check that I_S.processlist.state is set correctly now.
+--echo #
+eval
+create table t1(
+ pk int,
+ a int,
+ primary key(pk) comment '$pk_cf'
+) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3);
+
+begin;
+select * from t1 where pk=2 for update;
+
+--connect (con1,localhost,root,,)
+begin;
+set rocksdb_lock_wait_timeout=300;
+send select * from t1 where pk=2 for update;
+
+connection default;
+--echo # Now, will wait until we see con1 have state="Waiting for row lock"
+let $wait_condition=
+ SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = "Waiting for row lock"
+ AND INFO = "select * from t1 where pk=2 for update";
+--source include/wait_condition.inc
+
+rollback;
+connection con1;
+--reap
+rollback;
+
+disconnect con1;
+connection default;
+drop table t1;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST
+--echo #
+create table t0(a int) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+eval
+create table t1 (
+ pk1 int,
+ pk2 int,
+ a int,
+ primary key(pk1, pk2) comment '$pk_cf'
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a, B.a, A.a*10+B.a
+from
+ t0 A, t0 B;
+
+
+# Get a lock in another connection so that the primary transaction is not using
+# STO optimization, and its locks can be seen in I_S.rocksdb_locks
+--connect (con1,localhost,root,,)
+connection con1;
+begin;
+insert into t1 values (0x1112222,0x1112222,0);
+
+connection default;
+begin;
+--echo # Should use ref access w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=3
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+--echo #
+--echo # Test range locking for ranges with HA_READ_PREFIX_LAST_OR_PREV
+--echo #
+
+begin;
+--echo # Should use range access with 2 keyparts and w/o filesort:
+--replace_column 10 #
+explain
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+select * from t1
+where pk1=4 and pk2 between 5 and 8
+order by pk1 desc, pk2 desc
+for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+
+connection default;
+drop table t0, t1;
+
+--echo #
+--echo # A bug: range locking was not used when scan started at table start or end
+--echo #
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t10(a int);
+insert into t10 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+
+create table t1 (
+ pk int not null,
+ a int,
+ primary key(pk)
+) engine=rocksdb;
+
+insert into t1 select a*2,a*2 from t10;
+
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+begin;
+select * from t1 where pk<10 order by pk limit 10 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+begin;
+select * from t1 where pk>1990 order by pk desc limit 10 for update;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+connection con1;
+rollback;
+disconnect con1;
+
+connection default;
+drop table t0,t10,t1;
+
+--echo #
+--echo # Range locking and READ-COMMITTED isolation level
+--echo #
+connect (con1,localhost,root,,);
+connection con1;
+set session transaction isolation level read committed;
+create table t1 (
+ pk int not null,
+ a int,
+ primary key(pk)
+) engine=rocksdb;
+
+insert into t1(pk) values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+begin;
+select * from t1 where pk between 2 and 5 for update;
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Below should show individual row locks, not locked range:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+begin;
+update t1 set a=a+1 where pk between 2 and 5;
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Below should show individual row locks, not locked range:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+drop table t1;
+disconnect con1;
+connection default;
+
+--echo #
+--echo # Range Locking and READ-COMMITTED, another test
+--echo #
+create table t1 (
+ pk int,
+ a int,
+ b int,
+ primary key (pk),
+ key(a)
+) engine=rocksdb;
+
+insert into t1 values
+(1, 100, 1000),
+(2, 200, 2000),
+(3, 300, 3000);
+
+set transaction isolation level repeatable read;
+begin;
+update t1 set b = b + 1 where a > 200;
+
+connect (con1,localhost,root,,);
+connection con1;
+set transaction isolation level read committed;
+begin;
+insert into t1 values (4, 150, 1500);
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (5, 250, 1500);
+
+rollback;
+
+disconnect con1;
+connection default;
+rollback;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking.test b/mysql-test/suite/rocksdb/t/range_locking.test
new file mode 100644
index 00000000000..5c599238a0a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking.test
@@ -0,0 +1,6 @@
+
+--let pk_cf=default
+--let sk_cf=default
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.py b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py
new file mode 100644
index 00000000000..4a71193b079
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.py
@@ -0,0 +1,448 @@
+"""
+This script tests Range Locking.
+Usage:
+
+ python3 suite/rocksdb/t/range_locking_conc_test.py \
+ [--progress] [--verbose] \
+ root 127.0.0.1 $MASTER_MYPORT test t1 \
+ num_inserts num_insert_threads \
+ num_group_ops num_group_threads
+
+
+ For Example:
+
+ time python3 suite/rocksdb/t/range_locking_conc_test.py --progress root 127.0.0.1 3314 test t1 2000 20 10000 40
+
+"""
+
+import hashlib
+import MySQLdb
+from MySQLdb.constants import ER
+import os
+import random
+import signal
+import sys
+import threading
+import time
+import string
+import traceback
+
+#MAX_PK_VAL = 1000*1000*1000
+MAX_PK_VAL = 1000*1000
+
+show_progress= False
+verbose_output= False
+
+counter_n_inserted = 0
+counter_n_deleted = 0
+counter_n_insert_failed = 0
+counter_n_groups_created = 0
+counter_n_group_create_fails = 0
+counter_n_groups_verified = 0
+counter_n_groups_deleted = 0
+
+def is_deadlock_error(exc):
+ error_code = exc.args[0]
+ return (error_code == MySQLdb.constants.ER.LOCK_DEADLOCK)
+
+#
+# Watcher prints the test progress and some status variables once per second
+#
+class Watcher(threading.Thread):
+ Instance = None
+ def __init__(self, con):
+ threading.Thread.__init__(self)
+ self.should_stop = False
+ self.finished = False
+ self.con = con
+ self.start()
+
+ def run(self):
+ global counter_n_inserted
+ global counter_n_deleted
+ global counter_n_insert_failed
+ global counter_n_groups_created
+ global counter_n_group_create_fails
+ global counter_n_groups_verified
+ global counter_n_groups_deleted
+ event = threading.Event()
+
+ save_counter_n_inserted=0
+ save_counter_n_deleted=0
+ save_counter_n_insert_failed=0
+ save_counter_n_groups_created=0
+ save_counter_n_group_create_fails=0
+ save_counter_n_groups_verified=0
+ save_counter_n_groups_deleted=0
+ save_wait_count=0
+ n=0
+ cur= self.con.cursor();
+ try:
+ while not self.should_stop:
+ event.wait(1)
+
+ cur.execute("show status like '%rocksdb_locktree_wait_count%'");
+ row = cur.fetchone()
+ wait_count= int(row[1])
+
+ print("== %d ========================" % n)
+ print("counter_n_inserted=%d" % (counter_n_inserted - save_counter_n_inserted))
+ print("counter_n_deleted=%d" % (counter_n_deleted - save_counter_n_deleted))
+ print("counter_n_insert_failed=%d" % (counter_n_insert_failed - save_counter_n_insert_failed))
+ print("counter_n_groups_created=%d" % (counter_n_groups_created - save_counter_n_groups_created))
+ print("counter_n_group_create_fails=%d" % (counter_n_group_create_fails - save_counter_n_group_create_fails))
+ print("counter_n_groups_verified=%d" % (counter_n_groups_verified - save_counter_n_groups_verified))
+ print("counter_n_groups_deleted=%d" % (counter_n_groups_deleted - save_counter_n_groups_deleted))
+ print("wait_count=%d" % (wait_count - save_wait_count))
+
+ save_counter_n_inserted=counter_n_inserted
+ save_counter_n_deleted=counter_n_deleted
+ save_counter_n_insert_failed=counter_n_insert_failed
+ save_counter_n_groups_created=counter_n_groups_created
+ save_counter_n_group_create_fails=counter_n_group_create_fails
+ save_counter_n_groups_verified=counter_n_groups_verified
+ save_counter_n_groups_deleted=counter_n_groups_deleted
+ save_wait_count= wait_count
+ n+=1
+
+ except Exception as e:
+ self.exception = traceback.format_exc()
+ print("Watcher caught (%s)" % (e))
+
+ finally:
+ self.finish()
+
+ def finish(self):
+ n=0
+ # Do nothing
+
+
+#
+# A worker is one client thread producing the benchmark workload
+#
+class Worker(threading.Thread):
+ Instance = None
+ def __init__(self, con, table_name, worker_type_arg, num_inserts):
+ threading.Thread.__init__(self)
+ self.finished = False
+ self.num_inserts = num_inserts
+ con.autocommit(False)
+ self.con = con
+ self.rand = random.Random()
+ self.exception = None
+ self.table_name = table_name
+ self.worker_type = worker_type_arg
+ Worker.Instance = self
+ self.start()
+
+ def run(self):
+ self.rand.seed(threading.get_ident())
+ my_id= threading.get_ident()
+ try:
+ self.cur= self.con.cursor();
+ if (self.worker_type == "insert"):
+ self.run_inserts()
+ if (self.worker_type == "join_group"):
+ self.run_create_groups()
+ except Exception as e:
+ print(e)
+ self.exception = traceback.format_exc()
+ print("THR %d caught (%s)" % (my_id, e))
+
+ finally:
+ self.finish()
+
+ #
+ # Insert one row, making sure this doesn't break any groups
+ #
+ def run_one_insert(self):
+ global counter_n_inserted
+ global counter_n_deleted
+ global counter_n_insert_failed
+ cur = self.cur
+ #pk = self.rand.randint(1, MAX_PK_VAL)
+ # Note: here the distribution is intentionally 2x wider than the grouping
+ # thread has.
+ pk = int(self.rand.normalvariate(MAX_PK_VAL/2, MAX_PK_VAL/50.))
+
+ cur.execute("begin")
+ do_commit = False
+ cur.execute("select pk,parent_id,group_list from t1 where pk>=%s limit 1 for update", (pk,));
+ row = cur.fetchone()
+ group_to_delete= None
+ if row is None:
+ #print("No row found, inserting %d" % (pk+1000*1000))
+ cur.execute("insert into t1 (pk) values(%s)", (pk+1000*1000,));
+ do_commit = True
+ else:
+ if ((row[0] - pk)>2 and row[1] is None):
+ #print("Row found, inserting into gap, %d" % pk)
+ cur.execute("insert into t1 (pk) values(%s)", (pk,));
+ do_commit = True
+ else:
+ #print("Row found, grouped or too tight")
+ if row[2]:
+ # if parent_id is present, use it
+ group_to_delete = row[0]
+ #print("About to delete %d" % group_to_delete)
+ do_commit = False
+
+ if (do_commit):
+ cur.execute("commit")
+ counter_n_inserted += 1
+ return 1
+ else:
+ counter_n_insert_failed += 1
+ if group_to_delete:
+ counter_n_deleted += 5
+ self.delete_group(group_to_delete, True)
+ cur.execute("commit")
+ else:
+ cur.execute("rollback")
+ return 0
+
+ def run_one_create_group(self):
+ global counter_n_groups_created
+ global counter_n_group_create_fails
+ global counter_n_groups_deleted
+ cur = self.cur
+ #pk = self.rand.randint(1, MAX_PK_VAL)
+ pk = int(self.rand.normalvariate(MAX_PK_VAL/2, MAX_PK_VAL/100))
+
+ n_rows = 0
+ n_groups_deleted= 0
+ first_pk= None
+ cur.execute("select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", (pk,));
+ row = cur.fetchone()
+ while row is not None:
+ if (first_pk is None):
+ first_pk = row[0]
+ group_list= str(first_pk)
+ else:
+ group_list = group_list + "," + str(row[0])
+
+ last_pk = row[0]
+ if row[1] is not None:
+ # Found a row in a group
+ # Continue until group end.
+ found_next_group = False
+ row = cur.fetchone()
+ while row is not None:
+ if row[1] is None:
+ found_next_group = True
+ first_pk = row[0]
+ group_list= str(first_pk)
+ break
+ row= cur.fetchone()
+
+ if not found_next_group:
+ break;
+
+ if row[2] is not None:
+ # Found a group leader row.
+ ungrouped_ids = self.delete_group(row[0], False)
+ n_groups_deleted += 1;
+ i = 1
+ n_rows += 1
+ while (n_rows < 5):
+ group_list = group_list + "," + str(ungrouped_ids[i])
+ last_pk= ungrouped_ids[i];
+ i+= 1
+ n_rows += 1
+ break;
+ n_rows += 1
+ row = cur.fetchone()
+
+ if (n_rows == 5 or n_groups_deleted>0):
+ # Ok we got 5 rows in a row and they are all standalone
+ # Create a group.
+ # print("Creating group %d" % first_pk)
+ cur.execute("update t1 set group_list=%s where pk=%s", (group_list,first_pk,))
+ cur.execute("update t1 set parent_id=%s where pk > %s and pk <=%s",
+ (first_pk,first_pk, last_pk))
+ cur.execute("commit")
+ counter_n_groups_created += 1
+ counter_n_groups_deleted += n_groups_deleted;
+ return 1
+ else:
+ # print("Failed to join a group")
+ counter_n_group_create_fails += 1
+ cur.execute("rollback")
+ return 0
+
+ #
+ # Verify and delete the group
+ # @return An array listing the deleted PKs
+ #
+ def delete_group(self, group_id, delete_rows):
+ global counter_n_groups_verified
+ cur = self.con.cursor();
+ cur.execute("select pk,parent_id,group_list from t1 where pk>=%s limit 5 for update", (group_id,));
+ first_pk = None
+ n_rows = 0
+
+ row = cur.fetchone()
+ while row is not None:
+ if (first_pk is None):
+ first_pk = row[0]
+ group_list= str(first_pk)
+ group_arr= [];
+ group_arr.append(first_pk)
+ group_list_base= row[2]
+ if (first_pk != group_id):
+ self.raise_error("First row is not the group leader!");
+ else:
+ group_list = group_list + "," + str(row[0])
+ group_arr.append(row[0])
+
+ last_pk = row[0]
+ if (row[0] != first_pk and row[1] != first_pk):
+ self.raise_error("Row in group has wrong parent_id (expect %s got %s)" % (first_pk, row[1]))
+ break;
+ if (row[0] != first_pk and row[2] is not None):
+ self.raise_error("Row in group is a group leader?")
+ break;
+ n_rows += 1
+ row = cur.fetchone()
+
+ if (n_rows != 5):
+ self.raise_error("Expected %d rows got %d" % (5, n_rows,))
+ if (group_list != group_list_base):
+ self.raise_error("Group contents mismatch: expected '%s' got '%s'" % (group_list_base, group_list))
+ # Ok, everything seems to be in order.
+ if delete_rows:
+ cur.execute("delete from t1 where pk>=%s and pk<=%s", (group_id,last_pk,));
+ else:
+ cur.execute("update t1 set parent_id=NULL, group_list=NULL where pk>=%s and pk<=%s", (group_id,last_pk,));
+
+ counter_n_groups_verified += 1
+ return group_arr
+
+ def raise_error(self, msg):
+ print("Data corruption detected: " + msg)
+ sys.exit("Failed!")
+
+ def run_inserts(self):
+ #print("Worker.run_inserts")
+ i = 0
+ while (i < self.num_inserts):
+ try:
+ i += self.run_one_insert()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ if not is_deadlock_error(e):
+ raise e
+
+ def run_create_groups(self):
+ #print("Worker.run_create_groups")
+ i = 0
+ while (i < self.num_inserts):
+ try:
+ i += self.run_one_create_group()
+ except MySQLdb.OperationalError as e:
+ self.con.rollback()
+ cur = self.con.cursor()
+ if not is_deadlock_error(e):
+ raise e
+
+ def finish(self):
+ self.finished = True
+
+
+if __name__ == '__main__':
+ if len(sys.argv) != 10 and len(sys.argv) != 11:
+ print("Usage: range_locking_conc_test.py " \
+ "[--progress] " \
+ "user host port db_name table_name " \
+ "num_inserts num_insert_threads " \
+ "num_grp_ops num_group_threads" \
+ )
+ sys.exit(1)
+ i=1;
+ if sys.argv[i] == "--progress":
+ show_progress = True
+ i+=1
+
+ if sys.argv[i] == "--verbose":
+ verbose_output = True
+ i+=1
+
+ user = sys.argv[i]
+ i+=1
+
+ host = sys.argv[i]
+ i+=1
+
+ port = int(sys.argv[i])
+ i+=1
+
+ db = sys.argv[i]
+ i+=1
+
+ table_name = sys.argv[i]
+ i+=1
+
+ num_inserts = int(sys.argv[i])
+ i+=1
+
+ num_insert_workers = int(sys.argv[i])
+ i+=1
+
+ num_group_ops = int(sys.argv[i])
+ i+=1
+
+ num_group_workers = int(sys.argv[i])
+ i+=1
+
+ con= MySQLdb.connect(user=user, host=host, port=port, db=db)
+ con.cursor().execute("set global rocksdb_lock_wait_timeout=20")
+ con.cursor().execute("drop table if exists t1")
+ con.cursor().execute("create table t1 ( " \
+ " pk bigint primary key, " \
+ " group_list varchar(128), " \
+ " parent_id bigint " \
+ ") engine=rocksdb;")
+
+ worker_failed = False
+ workers = []
+ worker_type = "insert"
+ num_loops = num_inserts
+ for i in range(num_insert_workers + num_group_workers):
+ worker = Worker(
+ MySQLdb.connect(user=user, host=host, port=port, db=db), table_name,
+ worker_type, num_loops)
+ workers.append(worker)
+ if (i == num_insert_workers - 1):
+ worker_type = "join_group"
+ num_loops = num_group_ops
+
+ # A watcher thread to print the statistics periodically
+ if show_progress:
+ watcher= Watcher(MySQLdb.connect(user=user, host=host, port=port, db=db))
+
+ for w in workers:
+ w.join()
+ if w.exception:
+ print("Worker hit an exception:\n%s\n" % w.exception)
+ worker_failed = True
+
+ # Stop the watcher
+ if show_progress:
+ watcher.should_stop= True;
+ watcher.join()
+
+ if verbose_output:
+ print("\n")
+ print("rows_inserted: %d" % counter_n_inserted)
+ print("rows_deleted: %d" % counter_n_deleted)
+ print("rows_insert_failed: %d" % counter_n_insert_failed)
+ print("groups_created: %d" % counter_n_groups_created)
+ print("groups_verified: %d" % counter_n_groups_verified)
+ print("groups_deleted: %d" % counter_n_groups_deleted)
+ print("group_create_fails: %d" % counter_n_group_create_fails)
+
+ if worker_failed:
+ sys.exit(1)
+
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.test b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test
new file mode 100644
index 00000000000..3b8f95dc12c
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.test
@@ -0,0 +1,17 @@
+
+#
+# Concurrent stress test for Range Locking (see range_locking_conc_test.txt).
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+set @save_rlwt=@@rocksdb_lock_wait_timeout;
+let $SERVER_PORT=`select @@port`;
+let $exec = /usr/bin/python3 suite/rocksdb/t/range_locking_conc_test.py root 127.0.0.1 $SERVER_PORT test t1 2000 20 30000 10;
+
+--echo # Run range_locking_conc_test.py
+exec $exec;
+
+set global rocksdb_lock_wait_timeout= @save_rlwt;
+DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt
new file mode 100644
index 00000000000..f10bfe33925
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_conc_test.txt
@@ -0,0 +1,91 @@
+
+
+A concurrent test for Range Locking code
+
+== Requirements ==
+
+The idea is to have a concurrent workload, where
+- Some clients make modifications to the database that use range locking (let's
+ denote these RL-UPDATES)
+
+- There is some concurrent data modification activity, which is likely to
+ make RL-UPDATES corrupt the data if transaction isolation does not perform
+ proper range locking
+
+- There is a way to detect this data corruption. First suggestion: have some
+ invariants that must remain true regardless of any action done by
+ RL-UPDATES. We can run the workload, then stop it and verify the invariants
+ still hold.
+
+== Basic idea ==
+
+Rows and groups.
+
+Consider a table:
+
+create table t1 (
+ pk bigint primary key,
+ group_list varchar(128),
+ parent_id bigint
+) engine=rocksdb;
+
+
+We allow the following to be stored:
+
+1. Individual rows. An individual row has group_list=NULL, parent_id=NULL.
+
+2. Groups.
+
+A group is 5 (potentially could be some other number) of rows with adjacent
+PK values.
+
+The row with the smallest PK value is the "group leader" and has
+group_list=(comma-separated-list-of-pks-of-group members).
+
+The rest of the rows are "followers" and have parent_id=$GROUP_LEADER.pk
+
+Example of a group:
+
+mysql> select * from t1 where pk>=720418192 order by pk limit 5;
++-----------+---------------------------------------------------+-----------+
+| pk | group_list | parent_id |
++-----------+---------------------------------------------------+-----------+
+| 720418192 | 720418192,721972360,730798130,741595383,742883456 | NULL |
+| 721972360 | NULL | 720418192 |
+| 730798130 | NULL | 720418192 |
+| 741595383 | NULL | 720418192 |
+| 742883456 | NULL | 720418192 |
++-----------+---------------------------------------------------+-----------+
+5 rows in set (0.01 sec)
+
+
+Operations:
+- Insert an individual row. It is obvious we may not insert a row into a group.
+
+- Convert 5 individual rows into a group. One needs range locking to prevent
+ other threads from deleting these rows or putting them into another group.
+
+- Disband a group (convert it back into 5 individual rows).
+ When we are disbanding a group, we verify it to be valid.
+
+- Delete a group. If we attempt to insert a row and hit a group leader, we
+ don't insert the row and delete the whole group we've hit, instead. (when
+ deleting the group, we also verify it to be valid)
+ This provides some balance between inserts and deletes.
+
+=== Making sure lock contention happens ===
+
+We use normal distribution (rand.normalvariate) to pick random PK values,
+which are then used to make an attempt to insert a row or create a group.
+
+This creates a much greater contention than the uniform distribution.
+
+With sufficiently small sigma parameter, the contention seems to be
+sufficiently high.
+
+=== Implementation ===
+
+range_locking_conc_test.py implements the above operations.
+
+
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
new file mode 100644
index 00000000000..2a5966b65c3
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_deadlock_tracking.test
@@ -0,0 +1,196 @@
+--source suite/rocksdb/include/have_range_locking.inc
+
+#
+# This is deadlock_tracking.test, variant for running with Range Locking:
+# - Deadlock #5 is disabled, it requires LOCK IN SHARE MODE tests
+# - In the result file, SHOW ENGINE ROCKSDB TRANSACTION STATUS does not print
+# deadlock information.
+#
+set @prior_lock_wait_timeout = @@rocksdb_lock_wait_timeout;
+set @prior_deadlock_detect = @@rocksdb_deadlock_detect;
+set @prior_max_latest_deadlocks = @@rocksdb_max_latest_deadlocks;
+set global rocksdb_deadlock_detect = on;
+set global rocksdb_lock_wait_timeout = 10000;
+--echo # Clears deadlock buffer of any prior deadlocks.
+set global rocksdb_max_latest_deadlocks = 0;
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+let $engine = rocksdb;
+
+--source include/count_sessions.inc
+connect (con1,localhost,root,,);
+let $con1= `SELECT CONNECTION_ID()`;
+
+connect (con2,localhost,root,,);
+let $con2= `SELECT CONNECTION_ID()`;
+
+connect (con3,localhost,root,,);
+let $con3= `SELECT CONNECTION_ID()`;
+
+connection default;
+eval create table t (i int primary key) engine=$engine;
+insert into t values (1), (2), (3);
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #1;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+echo Deadlock #2;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 10;
+
+echo Deadlock #3;
+--source include/simple_deadlock.inc
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 1;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+connection con3;
+set rocksdb_deadlock_detect_depth = 2;
+
+--echo # Range locking code will report deadlocks, because it doesn't honor
+--echo # rocksdb_deadlock_detect_depth:
+echo Deadlock #4;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 for update;
+
+connection con1;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+send select * from t where i=3 for update;
+
+connection con3;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con2 and waiting_key != "";
+--source include/wait_condition.inc
+
+select variable_value into @a from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 for update;
+select case when variable_value-@a = 1 then 'true' else 'false' end as deadlocks from performance_schema.global_status where variable_name='rocksdb_row_lock_deadlocks';
+rollback;
+
+connection con2;
+reap;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection default;
+set global rocksdb_max_latest_deadlocks = 5;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+
+--disable_testcase BUG#0000
+echo Deadlock #5;
+connection con1;
+begin;
+select * from t where i=1 for update;
+
+connection con2;
+begin;
+select * from t where i=2 for update;
+
+connection con3;
+begin;
+select * from t where i=3 lock in share mode;
+
+connection con1;
+select * from t where i=100 for update;
+select * from t where i=101 for update;
+send select * from t where i=2 for update;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+
+select * from t where i=3 lock in share mode;
+select * from t where i=200 for update;
+select * from t where i=201 for update;
+
+--error ER_LOCK_DEADLOCK
+select * from t where i=1 lock in share mode;
+rollback;
+
+connection con1;
+reap;
+rollback;
+
+connection con3;
+rollback;
+
+connection default;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--enable_testcase
+echo Deadlock #6;
+connection con1;
+create table t1 (id int primary key, value int) engine=rocksdb;
+insert into t1 values (1,1),(2,2),(3,3),(4,4),(5,5);
+begin;
+update t1 set value=value+100 where id=1;
+update t1 set value=value+100 where id=2;
+
+connection con2;
+begin;
+update t1 set value=value+200 where id=3;
+
+connection con1;
+send update t1 set value=value+100 where id=3;
+
+connection con2;
+let $wait_condition = select count(*) = 1 from information_schema.rocksdb_trx
+where thread_id = $con1 and waiting_key != "";
+--source include/wait_condition.inc
+--error ER_LOCK_DEADLOCK
+update t1 set value=value+200 where id=1;
+
+# con2 tx is automatically rolled back
+connection con1;
+reap;
+select * from t1;
+drop table t1;
+
+connection default;
+
+disconnect con1;
+disconnect con2;
+disconnect con3;
+
+set global rocksdb_lock_wait_timeout = @prior_lock_wait_timeout;
+set global rocksdb_deadlock_detect = @prior_deadlock_detect;
+drop table t;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+set global rocksdb_max_latest_deadlocks = 0;
+--echo # Clears deadlock buffer of any existent deadlocks.
+set global rocksdb_max_latest_deadlocks = @prior_max_latest_deadlocks;
+--replace_regex /[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/TIMESTAMP/ /WAITING KEY: [0-9a-f]{16}/KEY/ /TRANSACTION ID: [0-9]*/TXN_ID/ /INDEX_ID: [0-9a-f]*/IDX_ID/ /TIMESTAMP: [0-9]*/TSTAMP/
+show engine rocksdb transaction status;
+--source include/wait_until_count_sessions.inc
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
new file mode 100644
index 00000000000..d0087e2a77b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation-master.opt
@@ -0,0 +1 @@
+--rocksdb_use_range_locking=1 --rocksdb_max_lock_memory=1024
diff --git a/mysql-test/suite/rocksdb/t/range_locking_escalation.test b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
new file mode 100644
index 00000000000..5a6e9fa6616
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_escalation.test
@@ -0,0 +1,39 @@
+#
+# Range Locking - Lock Escalation Tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+
+show variables like 'rocksdb_use_range_locking';
+show variables like 'rocksdb_max_lock_memory';
+show status like 'rocksdb_locktree_escalation_count';
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+#begin;
+#insert into t1 values (1000111,100011);
+#connect (con1,localhost,root,,);
+#connection con1;
+
+insert into t1
+select
+ A.a + B.a*10 + C.a*100 + D.a*1000,
+ 12345
+from t0 A, t0 B, t0 C, t0 D;
+
+select count(*) from t1;
+
+#connection default;
+#disconnect con1;
+show status like 'rocksdb_locktree_escalation_count';
+
+drop table t0,t1;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_partial_index.test b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test
new file mode 100644
index 00000000000..ce49737b2f6
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_partial_index.test
@@ -0,0 +1,120 @@
+#
+# Range Locking and Partial Indexes
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--source include/have_debug_sync.inc
+--enable_connect_log
+
+create table t0(a int primary key) engine=rocksdb;
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk1 int,
+ pk2 int,
+ a int not null,
+ b int,
+ primary key (pk1, pk2),
+ key key1(pk1, a) comment 'partial_group_keyparts=1;partial_group_threshold=5'
+) engine=rocksdb;
+
+# pk1=1 IS materialized prefix (N=10)
+insert into t1 select
+ 1,
+ A.a,
+ 100 + A.a,
+ 123456
+from t0 A;
+# Run a SELECT so that it is actually materialized:
+select * from t1 force index (key1) where pk1=1;
+
+# pk1=2 IS NOT materialized (N=3)
+insert into t1 select
+ 2,
+ A.a,
+ 100 + A.a,
+ 123456
+from t0 A limit 3;
+
+# and some more rows
+insert into t1 select
+ 10000 + A.a +10 *B.a +100*C.a,
+ A.a,
+ 100 + A.a,
+ 123456
+from t0 A, t0 B, t0 C;
+
+create table t3(pk int primary key);
+
+connect (con2,localhost,root,,);
+connection con2;
+begin;
+insert into t3 values(3333333);
+connection default;
+
+--echo #
+--echo # First, test a query with range lock
+--echo #
+
+--replace_column 10 #
+explain
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+--echo # Allocate a snapshot
+select * from t0 where a=3;
+
+connection default;
+--echo # Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+
+connection con1;
+--echo # This doesn't see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10;
+--echo # This DOES see the modifications
+select * from t1 force index (key1) where pk1>=1 and pk1<=10 for update;
+
+let $order_by_rowkey=1;
+let $SECOND_INDEX_NAME=key1;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+--echo #
+--echo # Now, test a query with LockingIterator
+--echo #
+delete from t1 where b=99999;
+
+begin;
+--echo # Allocate a snapshot
+select * from t0 where a=3;
+
+connection default;
+--echo # Make some modifications not visible in the snapshot
+insert into t1 values (1,11, 99999, 99999);
+insert into t1 values (2,11, 99999, 99999);
+
+connection con1;
+--echo # This doesn't see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15;
+--echo # This DOES see the modifications:
+select * from t1 force index (key1) where pk1>=1 order by pk1 limit 15 for update;
+
+let $order_by_rowkey=1;
+let $SECOND_INDEX_NAME=key1;
+let $select_from_is_rowlocks_current_trx_only=1;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+disconnect con1;
+connection default;
+
+disconnect con2;
+
+drop table t0, t1,t3;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
new file mode 100644
index 00000000000..9bbb1b9b392
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_refresh_iter.test
@@ -0,0 +1,70 @@
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--source include/have_debug_sync.inc
+
+select @@rocksdb_use_range_locking;
+
+--disable_warnings
+set debug_sync='RESET';
+--enable_warnings
+#
+# Testcase for iterator snapshot refresh
+#
+create table ten(a int primary key);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int primary key);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 select a,a from ten;
+insert into t1 select a+40, a+40 from ten;
+insert into t1 select a+100, a+100 from one_k;
+delete from t1 where pk=44;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+# Ok, now the table has these PK ranges:
+# 0..9 40..49 100...1000
+# and all rows have pk=a
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+set debug_sync='rocksdb.check_flags_iri SIGNAL con1_stopped WAIT_FOR con1_cont';
+send
+update t1 set a=a+100 where pk < 3 or pk between 10 and 50;
+
+# The query is now stuck at the start of the second range.
+
+
+## con2>
+connection con2;
+set debug_sync='now WAIT_FOR con1_stopped';
+
+# Make some changes to check if the iterator is reading current data or
+# snapshot
+insert into t1 values (44,5000);
+delete from t1 where pk= 42;
+update t1 set a=5000 where pk between 40 and 45;
+set global rocksdb_force_flush_memtable_and_lzero_now=1;
+
+set debug_sync='now SIGNAL con1_cont';
+
+connection con1;
+#--error ER_GET_ERRMSG
+reap;
+select * from t1 where pk<100;
+
+commit;
+disconnect con1;
+disconnect con2;
+connection default;
+set debug_sync='RESET';
+
+drop table t1, ten, one_k;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
new file mode 100644
index 00000000000..8b993764235
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_rev_cf.test
@@ -0,0 +1,12 @@
+#
+# Range locking tests.
+#
+
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+
+--let pk_cf=rev:cf1
+--let PK_USES_REVERSE_CF=1
+
+--source range_locking.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
new file mode 100644
index 00000000000..542d76661e2
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update.test
@@ -0,0 +1,308 @@
+#
+# Range Locking : tests for SeekForUpdate feature
+#
+
+--source include/have_rocksdb.inc
+--source include/have_debug_sync.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+show variables like 'rocksdb_use_range_locking';
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+--echo # Make another connection to get the lock tree out of the STO-mode
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=10 for update;
+
+connection default;
+begin;
+select * from t1 where pk=11 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+--echo # Now, we will just see locks on 10=0xA and 11=0xB:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo #
+--echo # SeekForUpdate Test #1: A query with type=range (without upper bound) and LIMIT
+--echo #
+--replace_column 10 #
+explain
+select * from t1 where pk>=500 order by pk limit 3 for update;
+select * from t1 where pk>=500 order by pk limit 3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+rollback;
+
+
+begin;
+select * from t1 where pk=11 for update;
+explain
+select * from t1 order by pk limit 3 for update;
+select * from t1 order by pk limit 3 for update;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+connection con1;
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+
+
+--echo #
+--echo # Concurrent tests: let one thread do SeekForUpdate and the other
+--echo # interfere by committing modifications
+--echo #
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int,
+ a int,
+ primary key (pk)
+) engine=rocksdb;
+
+insert into t1 select
+ A.a + B.a*10 + C.a*100,
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+
+select * from t1 where pk<10;
+delete from t1 where pk<10;
+select * from t1 where pk<10;
+
+
+--echo # Test what happens when another transaction commits a row
+--echo # right before the range we are about to lock (nothing)
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 3 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send select * from t1 where pk >=5 order by pk limit 3 for update;
+
+connect (con1,localhost,root,,);
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (3,3);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+rollback;
+
+delete from t1 where pk=3;
+
+--echo #
+--echo # Now, repeat the test but let the other transaction insert the row into
+--echo # the range we are locking
+
+--replace_column 10 #
+explain
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 values (8,8);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+delete from t1 where pk=8;
+
+--echo #
+--echo # Repeat the third time, this time deleting the row that SeekForUpdate saw
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+reap;
+
+rollback;
+
+--echo #
+--echo # Repeat the above test, but let the read fail with ER_LOCK_WAIT_TIMEOUT
+--echo # error. MyRocks code should now be prepared that data reads cause this
+--echo # error
+--echo #
+insert into t1 values (7,7);
+
+begin;
+
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+begin;
+delete from t1 where pk=7;
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection default;
+--error ER_LOCK_WAIT_TIMEOUT
+reap;
+
+rollback;
+
+connection con1;
+rollback;
+connection default;
+
+--echo #
+--echo # Test the thd_killed check in the iterator
+--echo #
+let $conn_id=`select connection_id()`;
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR go_get_killed';
+send
+select * from t1 where pk >=5 order by pk limit 1 for update;
+
+connection con1;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+--disable_query_log
+eval kill query $conn_id;
+--enable_query_log
+--echo kill query CONN_ID;
+connection default;
+--error ER_GET_ERRMSG
+reap;
+rollback;
+
+--echo #
+--echo # Backward scan test
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=500 for update;
+connection default;
+
+insert into t1 values
+ (1001, 1001),
+ (1005, 1005),
+ (1007, 1007),
+ (1010, 1010);
+
+begin;
+select * from t1 order by pk desc limit 2 for update;
+
+let $select_from_is_rowlocks_current_trx_only=1;
+
+--echo # The below will lock from pk=1007 (0x3ef) till the end of the table:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+begin;
+select * from t1 where pk <1007 order by pk desc limit 2 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+
+connection default;
+rollback;
+
+--echo #
+--echo # Backward scan test 2: error condition
+--echo #
+connection con1;
+begin;
+select * from t1 where pk=1010 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+begin;
+select * from t1 where pk=1007 for update;
+
+connection default;
+begin;
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 order by pk desc limit 2 for update;
+rollback;
+
+connection con1;
+rollback;
+
+disconnect con1;
+connection default;
+drop table t0,t1;
+
+--echo #
+--echo # A test: full table scan doesn't lock gaps
+--echo #
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1 values (10,10),(20,20),(30,30);
+
+connect (con1,localhost,root,,);
+connect (con2,localhost,root,,);
+
+connection con1;
+begin;
+
+select * from t1 for update;
+
+connection con2;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (5,5);
+
+connection con1;
+rollback;
+
+disconnect con1;
+disconnect con2;
+connection default;
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc
new file mode 100644
index 00000000000..6ab2f31cae5
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.inc
@@ -0,0 +1,55 @@
+
+
+--source include/have_rocksdb.inc
+--source include/have_debug_sync.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+show variables like 'rocksdb_use_range_locking';
+
+
+create table t0(a int primary key);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+eval create table t1 (
+ pk int,
+ a int,
+ primary key (pk) comment '$cf'
+) engine=rocksdb;
+
+insert into t1 (pk)
+select
+ A.a + B.a*10 + C.a*100
+from
+ t0 A, t0 B, t0 C;
+delete from t1 where pk<100;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+set debug_sync='rocksdb.locking_iter_scan SIGNAL about_to_lock_range WAIT_FOR spoiler_inserted';
+send
+select * from t1 where pk >=5 order by pk limit 5 for update;
+
+connection default;
+set debug_sync='now WAIT_FOR about_to_lock_range';
+insert into t1 (pk) values
+(10),(20),(30),(40),(50);
+set debug_sync='now SIGNAL spoiler_inserted';
+
+connection con1;
+reap;
+--echo # This must return 1, no 5:
+select lock_count from information_schema.rocksdb_trx
+where thread_id=CONNECTION_ID();
+
+rollback;
+disconnect con1;
+connection default;
+drop table t0, t1;
+
+--source range_locking_seek_for_update_iter_end.inc
+set global rocksdb_enable_iterate_bounds=off;
+--source range_locking_seek_for_update_iter_end.inc
+set global rocksdb_enable_iterate_bounds=on;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test
new file mode 100644
index 00000000000..703331cab9a
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2.test
@@ -0,0 +1,4 @@
+
+--let cf=rlsfu_test
+--source range_locking_seek_for_update2.inc
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test
new file mode 100644
index 00000000000..0620593b4e7
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update2_rev_cf.test
@@ -0,0 +1,4 @@
+
+--let cf=rev:rlsfu_test
+
+--source range_locking_seek_for_update2.inc
diff --git a/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc
new file mode 100644
index 00000000000..3b0fb6c53b3
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_seek_for_update_iter_end.inc
@@ -0,0 +1,41 @@
+--echo #
+--echo # A testcase for locking at the end of the scan
+--echo #
+eval create table t1 (
+ pk int,
+ primary key (pk) comment '$cf'
+) engine=rocksdb;
+
+connect (con1,localhost,root,,);
+connection con1;
+
+insert into t1 values (1), (10), (100);
+
+begin;
+select * from t1 for update;
+
+connection default;
+select * from t1;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (150);
+
+connection con1;
+rollback;
+
+begin;
+--replace_column 10 #
+explain
+select * from t1 order by pk desc for update;
+select * from t1 order by pk desc for update;
+
+connection default;
+select * from t1;
+
+--error ER_LOCK_WAIT_TIMEOUT
+insert into t1 values (0);
+
+disconnect con1;
+connection default;
+drop table t1;
+
diff --git a/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
new file mode 100644
index 00000000000..c6e4e457897
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/range_locking_shared_locks.test
@@ -0,0 +1,202 @@
+#
+# Test for shared lock support for range locking
+#
+--source include/have_rocksdb.inc
+--source suite/rocksdb/include/have_range_locking.inc
+--enable_connect_log
+
+select @@rocksdb_use_range_locking;
+
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+
+insert into t1 select a,a from t0;
+
+--echo # A basic test for shared locks
+
+begin;
+select * from t1 where pk=3 for update;
+select * from t1 where pk=5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connect (con1,localhost,root,,);
+connection con1;
+begin;
+select * from t1 where pk=5 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+--echo # Now for pk=5 we should see two locks by TRX1 and TRX2 with mode=S:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+--echo # Now, TRX2_ID should be gone:
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+
+--echo # Get a read lock on pk=3 (where we have a write lock).
+--echo # The result should be that we will still have a write lock
+select * from t1 where pk=3 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+--echo # Get a write lock on pk=5 (where we have a read lock).
+--echo # The result should be that we will have a write lock.
+select * from t1 where pk=5 for update;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+rollback;
+
+--echo #
+--echo # Test if a read lock inhibits write locks
+--echo #
+
+begin;
+select * from t1 where pk=2 lock in share mode;
+select * from t1 where pk=8 for update;
+
+connection con1;
+begin;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk between 0 and 4 for update;
+
+--error ER_LOCK_WAIT_TIMEOUT
+delete from t1 where pk=2;
+
+--echo # Get a shared lock
+select * from t1 where pk=2 lock in share mode;
+
+--echo # But this should still prevent us from acquiring a write lock on that value:
+--error ER_LOCK_WAIT_TIMEOUT
+select * from t1 where pk=2 for update;
+
+rollback;
+connection default;
+rollback;
+
+drop table t1;
+create table t1 (
+ pk int not null primary key,
+ a int not null,
+ key(a)
+) engine=rocksdb;
+
+insert into t1
+select
+ A.a+10*B.a+100*C.a+1000*D.a, A.a+10*B.a+100*C.a+1000*D.a
+from
+ t0 A, t0 B, t0 C, t0 D;
+set global rocksdb_force_flush_memtable_now=1;
+
+connection con1;
+begin;
+select * from t1 where pk=900 for update;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--replace_column 10 #
+explain
+select * from t1 where a between 2 and 5 lock in share mode;
+select * from t1 where a between 2 and 5 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+rollback;
+
+disconnect con1;
+
+drop table t0,t1;
+
+--echo #
+--echo # Test shared point locks and lock escalation
+--echo #
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t1 (
+ pk int primary key,
+ a int
+) engine=rocksdb;
+
+insert into t1
+select 1000 + 100*A.a + 10*B.a + C.a, 12345 from t0 A, t0 B, t0 C;
+
+show status like 'rocksdb_locktree_current_lock_memory';
+
+connect (con1,localhost,root,,);
+connection con1;
+
+begin;
+--echo # CON1: get some shared locks
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+select * from t1 where pk=2500 lock in share mode;
+let $TRX1_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+connection default;
+begin;
+--echo # DEFAULT: get the same locks so we have locks with multiple owners
+select * from t1 where pk=1001 lock in share mode;
+select * from t1 where pk=1100 lock in share mode;
+select * from t1 where pk=1200 lock in share mode;
+
+--echo # DEFAULT: get shared locks with one owner:
+select * from t1 where pk=2510 lock in share mode;
+let $TRX2_ID=`select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id()`;
+
+
+--echo # DEFAULT: exclusive locks on 0-10:
+insert into t1 select A.a, 0 from t0 A;
+
+connection con1;
+--echo # CON1: exclusive locks on 2000-2010:
+insert into t1 select 2000+A.a, 0 from t0 A;
+
+let $order_by_rowkey=1;
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection default;
+show status like 'rocksdb_locktree_current_lock_memory';
+set @save_mlm= @@rocksdb_max_lock_memory;
+
+--echo # Set the limit to cause lock escalation:
+set @cur_mem_usage= (select
+ variable_value
+ from
+ performance_schema.global_status
+ where
+ variable_name='rocksdb_locktree_current_lock_memory');
+
+set global rocksdb_max_lock_memory = cast(@cur_mem_usage+4 as SIGNED);
+
+connection con1;
+insert into t1 select 3000+A.a, 0 from t0 A;
+
+#select * from information_schema.rocksdb_locks;
+--source suite/rocksdb/include/select_from_is_rowlocks.inc
+
+connection con1;
+rollback;
+connection default;
+rollback;
+
+disconnect con1;
+set global rocksdb_max_lock_memory= cast(@save_mlm as SIGNED);
+
+drop table t0, t1;
+
+
diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test
index c063d8c7ccb..0544214b8c9 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb.test
@@ -2,6 +2,9 @@
--source suite/rocksdb/include/have_write_committed.inc
--source include/count_sessions.inc
+# Does SHOW WARNINGS and SHOW STATUS which change in Range Locking mode
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# RocksDB Storage Engine tests
#
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
index 47818bfdbe1..3aa51b7be80 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_concurrent_delete.test
@@ -27,6 +27,10 @@
# In all cases, RR gets snapshot conflict errors if non-first rows get
# deleted by another transaction after scanning.
+# The tests do not work with range locking as it locks it is about to
+# read, first.
+--source suite/rocksdb/include/not_range_locking.inc
+
--source include/have_rocksdb.inc
--source include/have_debug_sync.inc
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_locks.test b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
index ff092773737..8b3975723df 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_locks.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_locks.test
@@ -5,6 +5,9 @@
#
--source include/have_debug.inc
+# Range locking requests locks before doing snapshot checking.
+--source suite/rocksdb/include/not_range_locking.inc
+
--enable_connect_log
create table t1 (pk int not null primary key) engine=rocksdb;
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
index 6848459c445..2affb66f7af 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_read_free_rpl.test
@@ -62,7 +62,7 @@ update t1 set c2=100 where c1=3;
delete from t1 where c1 <= 2;
--source include/sync_slave_sql_with_master.inc
--source include/rpl_connection_slave.inc
-select case when variable_value-@up > 0 then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
+select case when (@@rocksdb_use_range_locking=1 OR variable_value-@up > 0) then 'false' else 'true' end as read_free from performance_schema.global_status where variable_name='rocksdb_num_get_for_update_calls';
select * from t1;
--echo
diff --git a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
index 694594efd70..1273a2b6f70 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb_timeout_rollback.test
@@ -46,6 +46,8 @@ begin work;
insert into t1 values (9);
insert into t1 values (10);
+--echo # Fix for Range Locking: force a snapshot to be taken:
+select * from t1 where a=100;
update t1 set a = a + 1 where a = 2;
connection con1;
diff --git a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
index 5a78979f048..63b72ce5c5a 100644
--- a/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
+++ b/mysql-test/suite/rocksdb/t/rpl_row_not_found.inc
@@ -3,6 +3,8 @@
--source include/have_debug.inc
--source include/have_debug_sync.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
connection master;
--disable_warnings
drop table if exists t1;
diff --git a/mysql-test/suite/rocksdb/t/select_count_for_update.test b/mysql-test/suite/rocksdb/t/select_count_for_update.test
index 2c6f5d474a1..aa7059dfc7e 100644
--- a/mysql-test/suite/rocksdb/t/select_count_for_update.test
+++ b/mysql-test/suite/rocksdb/t/select_count_for_update.test
@@ -52,9 +52,23 @@ SET lock_wait_timeout = 1;
SELECT COUNT(*) FROM t1 FORCE INDEX (sk);
# ... but not with LOCK IN SHARE MODE / FOR UPDATE
+let $uses_range_locking=`select @@rocksdb_use_range_locking`;
+
+if ($uses_range_locking == "0") {
+--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/
+}
+if ($uses_range_locking == "1") {
+--replace_regex /test.t1.sk/$FAILING_INDEX/
+}
--error ER_LOCK_WAIT_TIMEOUT
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) LOCK IN SHARE MODE;
+if ($uses_range_locking == "0") {
+--replace_regex /test.t1.PRIMARY/$FAILING_INDEX/
+}
+if ($uses_range_locking == "1") {
+--replace_regex /test.t1.sk/$FAILING_INDEX/
+}
--error ER_LOCK_WAIT_TIMEOUT
SELECT COUNT(*) FROM t1 FORCE INDEX (sk) FOR UPDATE;
diff --git a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
index 23ce6d45234..cf9d53ff88a 100644
--- a/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
+++ b/mysql-test/suite/rocksdb/t/select_lock_in_share_mode.test
@@ -1,5 +1,8 @@
--source include/have_rocksdb.inc
+# Range locking only supports exclusive locks currently.
+--source suite/rocksdb/include/not_range_locking.inc
+
#
# SELECT .. LOCK IN SHARE MODE
#
diff --git a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
index bfa36714816..3b8bcb033c0 100644
--- a/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
+++ b/mysql-test/suite/rocksdb/t/skip_locked_nowait.test
@@ -2,5 +2,8 @@
# wl#8919 Implement NOWAIT and SKIP LOCKED
#
+# Range locking cannot support SKIP LOCKED? (TODO: but can support NOWAIT)
+--source suite/rocksdb/include/not_range_locking.inc
+
--let $engine=ROCKSDB
--source include/skip_locked_nowait.inc
diff --git a/mysql-test/suite/rocksdb/t/trx_info.test b/mysql-test/suite/rocksdb/t/trx_info.test
index 975bed6132c..009c0ce67d4 100644
--- a/mysql-test/suite/rocksdb/t/trx_info.test
+++ b/mysql-test/suite/rocksdb/t/trx_info.test
@@ -11,7 +11,11 @@ insert into t1 values (2);
set autocommit=0;
select * from t1 for update;
---replace_column 1 _TRX_ID_ 3 _NAME_ 7 _KEY_ 14 _THREAD_ID_
+--replace_column 1 _TRX_ID_ 3 _NAME_ 5 2_or_3 7 _KEY_ 14 _THREAD_ID_
select * from information_schema.rocksdb_trx;
+select
+ if(@@rocksdb_use_range_locking=1, LOCK_COUNT=3, LOCK_COUNT=2) as LOCK_COUNT_IS_CORRECT
+from information_schema.rocksdb_trx;
+
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/t/unique_check.test b/mysql-test/suite/rocksdb/t/unique_check.test
index 47ca74d0e5e..9814d89448d 100644
--- a/mysql-test/suite/rocksdb/t/unique_check.test
+++ b/mysql-test/suite/rocksdb/t/unique_check.test
@@ -2,6 +2,11 @@
--source include/have_debug_sync.inc
--source include/count_sessions.inc
+# Doesn't work with range locking because lock tree waits do not set
+# state="Waiting for row lock" in I_S.PROCESSLIST. See MDEV-17873 for
+# details.
+--source suite/rocksdb/include/not_range_locking.inc
+
# For GitHub issue#167 -- Unique key check doesn't work
connect (con1, localhost, root,,);
diff --git a/mysql-test/suite/rocksdb/t/unique_sec.inc b/mysql-test/suite/rocksdb/t/unique_sec.inc
index ce0bb1e39a9..508816e6ace 100644
--- a/mysql-test/suite/rocksdb/t/unique_sec.inc
+++ b/mysql-test/suite/rocksdb/t/unique_sec.inc
@@ -144,8 +144,16 @@ UPDATE t1 SET id5=37 WHERE id1=38;
UPDATE t1 SET id5=34 WHERE id1=38;
--echo # NULL values are unique
+--echo # (Note: the following UPDATE reads through the whole table without
+--echo # finding anything to update. With point locking, this is fine,
+--echo # but with range locking it will time out while waiting on a row lock
+--echo # that the other transaction is holding)
+if (`select @@rocksdb_use_range_locking=0`) {
UPDATE t1 SET id5=NULL WHERE value1 > 37;
-
+}
+if (`select @@rocksdb_use_range_locking=1`) {
+-- echo UPDATE t1 SET id5=NULL WHERE value1 > 37;
+}
connection con1;
COMMIT;
diff --git a/mysql-test/suite/rocksdb/t/varbinary_format.test b/mysql-test/suite/rocksdb/t/varbinary_format.test
index fbebfeac85a..0d8a35a1321 100644
--- a/mysql-test/suite/rocksdb/t/varbinary_format.test
+++ b/mysql-test/suite/rocksdb/t/varbinary_format.test
@@ -1,6 +1,10 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+# The test uses SELECT .. FOR UPDATE and examines which locks it acquires
+# Range Locking will use different locks from point locking
+--source suite/rocksdb/include/not_range_locking.inc
+
# Create a table with a varbinary key with the current format and validate
# that it sorts correctly
CREATE TABLE t1(
diff --git a/mysql-test/suite/rocksdb/t/varchar_format.test b/mysql-test/suite/rocksdb/t/varchar_format.test
index 3ea1a1a60b3..985b2c0c8e7 100644
--- a/mysql-test/suite/rocksdb/t/varchar_format.test
+++ b/mysql-test/suite/rocksdb/t/varchar_format.test
@@ -1,6 +1,8 @@
--source include/have_debug.inc
--source include/have_rocksdb.inc
+--source suite/rocksdb/include/not_range_locking.inc
+
####################
# Create a table with a varchar key with the current format and validate
# that it sorts correctly
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_max_lock_memory_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
new file mode 100644
index 00000000000..614737fcfbc
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/r/rocksdb_use_range_locking_basic.result
@@ -0,0 +1,7 @@
+SET @start_global_value = @@global.ROCKSDB_USE_RANGE_LOCKING;
+SELECT @start_global_value;
+@start_global_value
+0
+"Trying to set variable @@global.ROCKSDB_USE_RANGE_LOCKING to 444. It should fail because it is readonly."
+SET @@global.ROCKSDB_USE_RANGE_LOCKING = 444;
+ERROR HY000: Variable 'rocksdb_use_range_locking' is a read only variable
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_max_lock_memory_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
new file mode 100644
index 00000000000..ee185aba660
--- /dev/null
+++ b/mysql-test/suite/rocksdb_sys_vars/t/rocksdb_use_range_locking_basic.test
@@ -0,0 +1,5 @@
+--source include/have_rocksdb.inc
+--let $sys_var=ROCKSDB_USE_RANGE_LOCKING
+--let $read_only=1
+--let $session=0
+--source ../include/rocksdb_sys_var.inc
diff --git a/rocksdb b/rocksdb
index fdb3125547b..c9042db6196 160000
--- a/rocksdb
+++ b/rocksdb
@@ -1 +1 @@
-Subproject commit fdb3125547b12883fedc31ef111ffb33d8934d26
+Subproject commit c9042db61962babbe0f2697a157feeeb3b8fb746
diff --git a/storage/rocksdb/CMakeLists.txt b/storage/rocksdb/CMakeLists.txt
index 714f29cfa92..3e111ae2c52 100644
--- a/storage/rocksdb/CMakeLists.txt
+++ b/storage/rocksdb/CMakeLists.txt
@@ -135,6 +135,7 @@ SET(ROCKSDB_SOURCES
logger.h
rdb_datadic.cc rdb_datadic.h
rdb_iterator.cc rdb_iterator.h
+ rdb_locking_iter.cc rdb_locking_iter.h
rdb_cf_options.cc rdb_cf_options.h
rdb_cf_manager.cc rdb_cf_manager.h
rdb_converter.cc rdb_converter.h
diff --git a/storage/rocksdb/get_rocksdb_files.sh b/storage/rocksdb/get_rocksdb_files.sh
index e0fa66b54fa..07fee252a1e 100755
--- a/storage/rocksdb/get_rocksdb_files.sh
+++ b/storage/rocksdb/get_rocksdb_files.sh
@@ -4,7 +4,7 @@ MKFILE=`mktemp`
# include rocksdb make file relative to the path of this script
echo "include rocksdb/src.mk
all:
- @echo \$(LIB_SOURCES)" > $MKFILE
+ @echo \$(LIB_SOURCES) \$(RANGE_TREE_SOURCES)" > $MKFILE
for f in `make --makefile $MKFILE`
do
echo ../../rocksdb/$f
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 37b5b414e6e..2b48cc04dd0 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -104,6 +104,9 @@
#include "./ObjectFactory.h"
#endif
+#include "./rdb_locking_iter.h"
+
+
// Internal MySQL APIs not exposed in any header.
extern "C" {
/**
@@ -667,6 +670,10 @@ static void rocksdb_set_delayed_write_rate(THD *thd, struct SYS_VAR *var,
static void rocksdb_set_max_latest_deadlocks(THD *thd, struct SYS_VAR *var,
void *var_ptr, const void *save);
+static void rocksdb_set_max_lock_memory(THD *thd,
+ struct SYS_VAR *var,
+ void *var_ptr, const void *save);
+
static void rdb_set_collation_exception_list(const char *exception_list);
static void rocksdb_set_collation_exception_list(THD *thd, struct SYS_VAR *var,
void *var_ptr,
@@ -822,6 +829,16 @@ static bool rocksdb_alter_column_default_inplace = false;
static bool rocksdb_instant_ddl = false;
bool rocksdb_enable_tmp_table = false;
+// Range Locking: how much memory can be used used for the lock data structure
+// (which hold the locks acquired by all clients).
+static ulonglong rocksdb_max_lock_memory;
+
+bool rocksdb_use_range_locking = 0;
+static bool rocksdb_use_range_lock_manager_as_point = 0;
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
+std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr_used_as_point;
+
std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
@@ -1522,6 +1539,13 @@ static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
nullptr, rocksdb_set_max_latest_deadlocks,
rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
+static MYSQL_SYSVAR_ULONGLONG(max_lock_memory, rocksdb_max_lock_memory,
+ PLUGIN_VAR_RQCMDARG,
+ "Range-locking mode: Maximum amount of memory "
+ "that locks from all transactions can use at a time",
+ nullptr, rocksdb_set_max_lock_memory,
+ /*initial*/1073741824, 0, UINT64_MAX, 0);
+
static MYSQL_SYSVAR_ENUM(
info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
"Filter level for info logs to be written mysqld error log. "
@@ -2371,6 +2395,19 @@ static MYSQL_SYSVAR_BOOL(table_stats_use_table_scan,
rocksdb_update_table_stats_use_table_scan,
rocksdb_table_stats_use_table_scan);
+static MYSQL_SYSVAR_BOOL(use_range_locking, rocksdb_use_range_locking,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Locking",
+ nullptr, nullptr,
+ rocksdb_use_range_locking);
+
+static MYSQL_SYSVAR_BOOL(use_range_lock_manager_as_point,
+ rocksdb_use_range_lock_manager_as_point,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Use Range Lock Manager as point",
+ nullptr, nullptr,
+ rocksdb_use_range_lock_manager_as_point);
+
static MYSQL_SYSVAR_BOOL(
large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
"Support large index prefix length of 3072 bytes. If off, the maximum "
@@ -2694,7 +2731,9 @@ static struct SYS_VAR *rocksdb_system_variables[] = {
MYSQL_SYSVAR(manual_compaction_threads),
MYSQL_SYSVAR(manual_compaction_bottommost_level),
MYSQL_SYSVAR(rollback_on_timeout),
-
+ MYSQL_SYSVAR(use_range_locking),
+ MYSQL_SYSVAR(use_range_lock_manager_as_point),
+ MYSQL_SYSVAR(max_lock_memory),
MYSQL_SYSVAR(enable_insert_with_update_caching),
MYSQL_SYSVAR(trace_block_cache_access),
MYSQL_SYSVAR(trace_queries),
@@ -2920,6 +2959,7 @@ class Rdb_transaction {
ulonglong m_update_count = 0;
ulonglong m_delete_count = 0;
// per row data
+ // (with range locking, each locked range is counted here, too)
ulonglong m_row_lock_count = 0;
std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;
@@ -2993,8 +3033,41 @@ class Rdb_transaction {
virtual rocksdb::Status do_pop_savepoint() = 0;
virtual void do_rollback_to_savepoint() = 0;
+ private:
+ /*
+ If true, the current statement should not use a snapshot for reading.
+ Note that in a multi-statement transaction, the snapshot may have been
+ allocated by another statement.
+ */
+ bool m_stmt_ignores_snapshot = false;
+
+  /* Snapshot-ignore mode will put away m_read_opts.snapshot here: */
+ const rocksdb::Snapshot *m_saved_snapshot= nullptr;
+
public:
+
+ void start_ignore_snapshot() {
+ // note: this may be called several times for the same statement
+ if (!m_stmt_ignores_snapshot) {
+ m_saved_snapshot = m_read_opts.snapshot;
+ m_read_opts.snapshot = nullptr;
+ m_stmt_ignores_snapshot= true;
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+ }
+
+ void end_ignore_snapshot_if_needed() {
+ if (m_stmt_ignores_snapshot) {
+ m_stmt_ignores_snapshot = false;
+ m_read_opts.snapshot = m_saved_snapshot;
+ m_saved_snapshot = nullptr;
+ }
+ }
+ bool in_snapshot_ignore_mode() const { return m_stmt_ignores_snapshot; }
+
rocksdb::ReadOptions m_read_opts;
+
const char *m_mysql_log_file_name;
my_off_t m_mysql_log_offset;
const char *m_mysql_gtid;
@@ -3175,6 +3248,19 @@ class Rdb_transaction {
virtual void release_lock(const Rdb_key_def &key_descr,
const std::string &rowkey, bool force = false) = 0;
+ virtual
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start,
+ const rocksdb::Endpoint &end) = 0;
+
+ rocksdb::Status lock_singlepoint_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Slice &point) {
+ // Normally, one needs to "flip" the endpoint type for reverse-ordered CFs.
+ // But here we are locking just one point so this is not necessary.
+ rocksdb::Endpoint endp(point, false);
+ return lock_range(cf, endp, endp);
+ }
+
virtual bool prepare() = 0;
bool commit_or_rollback() {
@@ -3233,10 +3319,17 @@ class Rdb_transaction {
m_is_delayed_snapshot = false;
}
+ void locking_iter_created() {
+ if (!m_snapshot_timestamp)
+ rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
+ }
+
virtual void acquire_snapshot(bool acquire_now) = 0;
virtual void release_snapshot() = 0;
- bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }
+ bool has_snapshot() const {
+ return m_read_opts.snapshot != nullptr || m_saved_snapshot;
+ }
private:
// The Rdb_sst_info structures we are currently loading. In a partitioned
@@ -3886,7 +3979,9 @@ class Rdb_transaction {
virtual rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *column_family) = 0;
+ rocksdb::ColumnFamilyHandle *column_family,
+ const std::shared_ptr<Rdb_key_def>&,
+ bool use_locking_iterator=false) = 0;
virtual void multi_get(rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
@@ -3895,10 +3990,13 @@ class Rdb_transaction {
const bool sorted_input) const = 0;
rocksdb::Iterator *get_iterator(
- rocksdb::ColumnFamilyHandle *const column_family, bool skip_bloom_filter,
+ rocksdb::ColumnFamilyHandle *const column_family,
+ const std::shared_ptr<Rdb_key_def>& kd,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound, bool read_current = false,
- bool create_snapshot = true) {
+ bool create_snapshot = true,
+ bool use_locking_iterator=false) {
// Make sure we are not doing both read_current (which implies we don't
// want a snapshot) and create_snapshot which makes sure we create
// a snapshot
@@ -3928,12 +4026,13 @@ class Rdb_transaction {
if (read_current) {
options.snapshot = nullptr;
}
- return get_iterator(options, column_family);
+ return get_iterator(options, column_family, kd, use_locking_iterator);
}
virtual bool is_tx_started() const = 0;
virtual void start_tx() = 0;
- virtual void start_stmt() = 0;
+ virtual void start_stmt(bool is_dml_statement) = 0;
+ virtual void start_autocommit_stmt(bool /*is_dml_statement*/){}
virtual void set_name() = 0;
protected:
@@ -4131,6 +4230,13 @@ class Rdb_transaction_impl : public Rdb_transaction {
virtual bool is_writebatch_trx() const override { return false; }
+ // Lock the range between two specified endpoints
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const cf,
+ const rocksdb::Endpoint &start_endp,
+ const rocksdb::Endpoint &end_endp) override {
+ ++m_row_lock_count;
+ return m_rocksdb_tx->GetRangeLock(cf, start_endp, end_endp);
+ }
private:
void release_tx(void) {
// We are done with the current active transaction object. Preserve it
@@ -4237,7 +4343,7 @@ class Rdb_transaction_impl : public Rdb_transaction {
}
void acquire_snapshot(bool acquire_now) override {
- if (m_read_opts.snapshot == nullptr) {
+ if (m_read_opts.snapshot == nullptr && !in_snapshot_ignore_mode()) {
const auto thd_ss = std::static_pointer_cast<Rdb_explicit_snapshot>(
m_thd->get_explicit_snapshot());
if (thd_ss) {
@@ -4393,9 +4499,17 @@ class Rdb_transaction_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const column_family) override {
+ rocksdb::ColumnFamilyHandle *const column_family,
+ const std::shared_ptr<Rdb_key_def>& kd,
+ bool use_locking_iterator) override {
global_stats.queries[QUERIES_RANGE].inc();
- return m_rocksdb_tx->GetIterator(options, column_family);
+ if (use_locking_iterator) {
+ locking_iter_created();
+ return GetLockingIterator(m_rocksdb_tx, options, column_family,
+ kd, &m_row_lock_count);
+ }
+ else
+ return m_rocksdb_tx->GetIterator(options, column_family);
}
const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
@@ -4469,17 +4583,35 @@ class Rdb_transaction_impl : public Rdb_transaction {
/*
Start a statement inside a multi-statement transaction.
- @todo: are we sure this is called once (and not several times) per
- statement start?
+ @note: If a statement uses N tables, this function will be called N times,
+  once for each TABLE object that is used.
For hooking to start of statement that is its own transaction, see
ha_rocksdb::external_lock().
*/
- void start_stmt() override {
+ void start_stmt(bool is_dml_statement) override {
+
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ /*
+ In Range Locking mode, RocksDB does not do "key tracking".
+ Use InnoDB-like concurrency mode: make the DML statements always read
+ the latest data (instead of using transaction's snapshot).
+ This "downgrades" the transaction isolation to READ-COMMITTED on the
+ primary, but in return the actions can be replayed on the replica.
+ */
+ start_ignore_snapshot();
+ }
+
// Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
acquire_snapshot(false);
}
+ void start_autocommit_stmt(bool is_dml_statement) override {
+ if (rocksdb_use_range_locking && is_dml_statement) {
+ start_ignore_snapshot();
+ }
+ }
+
/*
This must be called when last statement is rolled back, but the transaction
continues
@@ -4608,6 +4740,12 @@ class Rdb_writebatch_impl : public Rdb_transaction {
// Nothing to do here since we don't hold any row locks.
}
+ rocksdb::Status lock_range(rocksdb::ColumnFamilyHandle *const,
+ const rocksdb::Endpoint&,
+ const rocksdb::Endpoint&) override {
+ return rocksdb::Status::OK();
+ }
+
void rollback() override {
on_rollback();
m_write_count = 0;
@@ -4707,7 +4845,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
rocksdb::Iterator *get_iterator(
const rocksdb::ReadOptions &options,
- rocksdb::ColumnFamilyHandle *const /* column_family */) override {
+ rocksdb::ColumnFamilyHandle *const /* column_family */,
+ const std::shared_ptr<Rdb_key_def>&,
+ bool /*use_locking_iterator*/) override {
const auto it = rdb->NewIterator(options);
return m_batch->NewIteratorWithBase(it);
}
@@ -4725,9 +4865,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
set_initial_savepoint();
}
+ void start_stmt(bool /*is_dml_statement*/) override {}
void set_name() override {}
- void start_stmt() override {}
void rollback_stmt() override {
if (m_batch) rollback_to_stmt_savepoint();
@@ -5051,6 +5191,7 @@ static int rocksdb_prepare(handlerton *const hton MY_ATTRIBUTE((__unused__)),
DEBUG_SYNC(thd, "rocksdb.prepared");
} else {
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
return HA_EXIT_SUCCESS;
@@ -5278,6 +5419,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- For a COMMIT statement that finishes a multi-statement transaction
- For a statement that has its own transaction
*/
+ tx->end_ignore_snapshot_if_needed();
if (tx->commit()) {
DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
}
@@ -5287,6 +5429,7 @@ static int rocksdb_commit(handlerton *const hton MY_ATTRIBUTE((__unused__)),
*/
tx->set_tx_failed(false);
tx->make_stmt_savepoint_permanent();
+ tx->end_ignore_snapshot_if_needed();
}
if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
@@ -5324,6 +5467,7 @@ static int rocksdb_rollback(handlerton *const hton MY_ATTRIBUTE((__unused__)),
- a statement inside a transaction is rolled back
*/
+ tx->end_ignore_snapshot_if_needed();
tx->rollback_stmt();
tx->set_tx_failed(true);
}
@@ -5428,8 +5572,9 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
"=========================================\n";
}
+ template<class PathStruct>
static Rdb_deadlock_info::Rdb_dl_trx_info get_dl_txn_info(
- const rocksdb::DeadlockInfo &txn, const GL_INDEX_ID &gl_index_id) {
+ const PathStruct &txn, const GL_INDEX_ID &gl_index_id) {
Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
txn_data.trx_id = txn.m_txn_id;
@@ -5454,23 +5599,49 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
? cfh->GetName()
: "NOT FOUND; CF_ID: " + std::to_string(txn.m_cf_id);
- txn_data.waiting_key =
- rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
+ txn_data.waiting_key = format_wait_key(txn);
txn_data.exclusive_lock = txn.m_exclusive;
return txn_data;
}
+ // Get the key to use to find the index number (and then, index name)
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::DeadlockInfo& info) {
+ return info.m_waiting_key;
+ }
+ static const std::string& get_key_for_indexnr(
+ const rocksdb::RangeDeadlockInfo& info) {
+ // Range locks do not span across indexes, so take the left bound
+ return info.m_start.slice;
+ }
+
+ // Print the locked key (or range) in hex
+ // Two functions with matching signatures so get_dl_path_trx_info() template
+ // can be used with both point and range locking.
+ static std::string format_wait_key(const rocksdb::DeadlockInfo& info) {
+ return rdb_hexdump(info.m_waiting_key.c_str(), info.m_waiting_key.length());
+ }
+ static std::string format_wait_key(const rocksdb::RangeDeadlockInfo& info) {
+ return rdb_hexdump_range(info.m_start, info.m_end);
+ }
+
+ // Get deadlock path info. A templated function so one can use it with both
+ // point and range locking.
+ template<class PathStruct>
static Rdb_deadlock_info get_dl_path_trx_info(
- const rocksdb::DeadlockPath &path_entry) {
+ const PathStruct &path_entry) {
Rdb_deadlock_info deadlock_info;
for (auto it = path_entry.path.begin(); it != path_entry.path.end(); it++) {
const auto &txn = *it;
+ auto waiting_key = get_key_for_indexnr(txn);
const GL_INDEX_ID gl_index_id = {
txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
- txn.m_waiting_key.c_str()))};
+ waiting_key.c_str()))};
deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
}
DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
@@ -5495,7 +5666,7 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
/* Calculate the duration the snapshot has existed */
int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
- if (snapshot_timestamp != 0) {
+ if (snapshot_timestamp != 0 && tx->has_snapshot()) {
int64_t curr_time;
rdb->GetEnv()->GetCurrentTime(&curr_time);
@@ -5514,8 +5685,8 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
- void populate_deadlock_buffer() {
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ template<class PathStruct>
+ void populate_deadlock_buffer_tmpl(PathStruct &dlock_buffer) {
m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
for (const auto &path_entry : dlock_buffer) {
@@ -5555,12 +5726,32 @@ class Rdb_snapshot_status : public Rdb_tx_list_walker {
}
}
+ void populate_deadlock_buffer() {
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ populate_deadlock_buffer_tmpl(dlock_buffer);
+ }
+ }
+
std::vector<Rdb_deadlock_info> get_deadlock_info() {
std::vector<Rdb_deadlock_info> deadlock_info;
- auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
- for (const auto &path_entry : dlock_buffer) {
- if (!path_entry.limit_exceeded) {
- deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+
+ if (range_lock_mgr) {
+ auto dlock_buffer = range_lock_mgr->GetRangeDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
+ }
+ } else {
+ auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
+ for (const auto &path_entry : dlock_buffer) {
+ if (!path_entry.limit_exceeded) {
+ deadlock_info.push_back(get_dl_path_trx_info(path_entry));
+ }
}
}
return deadlock_info;
@@ -5976,9 +6167,13 @@ static bool rocksdb_collect_hton_log_info(handlerton *const /* unused */,
return ret_val;
}
+/*
+  @param is_dml_statement If true, we are in a DML statement
+*/
static inline void rocksdb_register_tx(
handlerton *const hton MY_ATTRIBUTE((__unused__)), THD *const thd,
- Rdb_transaction *const tx) {
+ Rdb_transaction *const tx,
+ bool is_dml_stmt) {
DBUG_ASSERT(tx != nullptr);
trans_register_ha(thd, false, rocksdb_hton, NULL);
@@ -5993,8 +6188,10 @@ static inline void rocksdb_register_tx(
}
}
if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
- tx->start_stmt();
+ tx->start_stmt(is_dml_stmt);
trans_register_ha(thd, true, rocksdb_hton, NULL);
+ } else {
+ tx->start_autocommit_stmt(is_dml_stmt);
}
}
@@ -6079,7 +6276,7 @@ static int rocksdb_start_tx_and_assign_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
return HA_EXIT_SUCCESS;
@@ -6136,7 +6333,7 @@ static int rocksdb_start_tx_with_shared_read_view(
DBUG_ASSERT(!tx->has_snapshot());
tx->set_tx_read_only(true);
- rocksdb_register_tx(hton, thd, tx);
+ rocksdb_register_tx(hton, thd, tx, false);
tx->acquire_snapshot(true);
// case: an explicit snapshot was not assigned to this transaction
@@ -6815,6 +7012,25 @@ static int rocksdb_init_internal(void *const p) {
tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
tx_db_options.write_policy =
static_cast<rocksdb::TxnDBWritePolicy>(rocksdb_write_policy);
+
+ if (rocksdb_use_range_locking && rocksdb_use_range_lock_manager_as_point) {
+ //rdb_log_status_error(
+ // status, "Can't have both range_locking and range_lock_manager_as_point");
+ //DBUG_RETURN(HA_EXIT_FAILURE);
+ rocksdb_use_range_lock_manager_as_point= 0;
+ }
+
+
+ if (rocksdb_use_range_locking) {
+ range_lock_mgr.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr;
+ }
+ if (rocksdb_use_range_lock_manager_as_point) {
+ range_lock_mgr_used_as_point.reset(
+ rocksdb::NewRangeLockManager(tx_db_options.custom_mutex_factory));
+ tx_db_options.lock_mgr_handle = range_lock_mgr_used_as_point;
+ }
status =
check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
@@ -6849,6 +7065,15 @@ static int rocksdb_init_internal(void *const p) {
DBUG_RETURN(HA_EXIT_FAILURE);
}
+ if (range_lock_mgr)
+ {
+ range_lock_mgr->SetMaxLockMemory(rocksdb_max_lock_memory);
+ sql_print_information("RocksDB: USING RANGE LOCKING");
+ sql_print_information("RocksDB: Max lock memory=%llu", rocksdb_max_lock_memory);
+ }
+ else
+ sql_print_information("RocksDB: USING POINT LOCKING");
+
// NO_LINT_DEBUG
sql_print_information("RocksDB:Init column families...");
if (st_rdb_exec_time.exec("cf_manager::init", [&]() {
@@ -7580,6 +7805,7 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
m_sk_packed_tuple_old(nullptr),
m_pack_buffer(nullptr),
m_lock_rows(RDB_LOCK_NONE),
+ m_use_range_locking(false),
m_keyread_only(false),
m_iteration_only(false),
m_insert_with_update(false),
@@ -7902,6 +8128,7 @@ int ha_rocksdb::open(const char *const name,
}
m_lock_rows = RDB_LOCK_NONE;
+ m_use_range_locking = false;
m_locked_row_action = THR_WAIT;
m_key_descr_arr = m_tbl_def->m_key_descr_arr;
@@ -9453,6 +9680,15 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
}
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
+
+ bool use_locking_iter= false;
+
+ if ((rc = set_range_lock(tx, kd, find_flag, slice, end_range,
+ &use_locking_iter)))
+ DBUG_RETURN(rc);
+ if (use_locking_iter)
+ m_iterator->set_use_locking();
+
const bool is_new_snapshot = !tx->has_snapshot();
// Loop as long as we get a deadlock error AND we end up creating the
@@ -9501,6 +9737,253 @@ int ha_rocksdb::index_read_intern(uchar *const buf, const uchar *const key,
DBUG_RETURN(rc);
}
+
+/*
+ @brief
+ Compute the range lock endpoints and set the range lock, if necessary
+
+ @param use_locking_iter OUT If true, locks are not set and LockingIterator
+ should be used instead
+
+ @detail
+ If the scanned range doesn't have the endpoint we're scanning towards,
+ don't set the lock, it will be too coarse. Indicate that LockingIterator
+ should be used, instead.
+
+ == RangeFlagsShouldBeFlippedForRevCF ==
+ When using reverse column families, the value of Endpoint::inf_suffix has
+ the reverse meaning.
+
+ Let's consider a forward-ordered CF and some keys and endpoints in it:
+
+ key=a, inf_suffix=false
+ key=ab
+ key=az
+ key=a, inf_suffix=true
+
+ Now, let's put the same data and endpoints into a reverse-ordered CF. The
+ physical order of the data will be the reverse of the above:
+
+ key=a, inf_suffix=true
+ key=az
+ key=ab
+ key=a, inf_suffix=false
+
+ Note that inf_suffix=false comes *before* any values with the same prefix.
+ And inf_suffix=true comes *after* all values with the same prefix.
+
+ The Endpoint comparison function in RocksDB doesn't "know" if the CF is
+ reverse-ordered or not. It uses the Key Comparator for key values, and
+ then it assumes that Endpoint(key=$VAL, inf_suffix=false) comes before
+ the row with key=$VAL.
+
+ The only way to achieve the required ordering is to flip the endpoint
+ flag value before passing class Endpoint to RocksDB. This function does
+ it before the lock_range() call.
+
+ @return
+ 0 Ok
+ Other Error acquiring the lock (wait timeout, deadlock, etc)
+*/
+
+int ha_rocksdb::set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice_arg,
+ const key_range *const end_key,
+ bool *use_locking_iterator)
+{
+ rocksdb::Slice end_slice;
+ uchar end_slice_buf[MAX_KEY_LENGTH];
+ bool start_has_inf_suffix = false, end_has_inf_suffix = false;
+ rocksdb::Slice slice(slice_arg);
+ *use_locking_iterator= false;
+
+ if (!m_use_range_locking) {
+ return 0;
+ }
+ bool big_range= false;
+
+ /*
+ The 'slice' has the left endpoint of the range to lock.
+ Figure out the right endpoint.
+ */
+
+ if (find_flag == HA_READ_KEY_EXACT ||
+ find_flag == HA_READ_PREFIX_LAST) {
+ if (slice.size() == Rdb_key_def::INDEX_NUMBER_SIZE) {
+ // This is a full table/index scan
+ // (in case of HA_READ_PREFIX_LAST, a reverse-ordered one)
+ start_has_inf_suffix= false;
+ big_range = true;
+ } else {
+ /*
+ HA_READ_KEY_EXACT:
+ This is "key_part= const" interval. We need to lock this range:
+ (lookup_value, -inf) < key < (lookup_value, +inf)
+ HA_READ_PREFIX_LAST:
+ We get here for queries like:
+
+ select * from t1 where pk1=const order by pk1 desc for update
+
+ assuming this uses an index on (pk1, ...).
+ We get end_key=nullptr.
+ */
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ end_slice= slice;
+ }
+ } else if (find_flag == HA_READ_PREFIX_LAST_OR_PREV) {
+ /*
+ We get here for queries like:
+
+ select * from t1 where pk1=const1 and pk2 between const2 and const3
+ order by pk1 desc
+ for update
+
+ assuming this uses an index on (pk1, pk2).
+ The slice has the right endpoint: {const1, const3}
+ the end_key has the left endpoint: {const1, const2}.
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ // Pack the left endpoint and make "slice" point to it
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ start_has_inf_suffix= false;
+ end_has_inf_suffix= true;
+ } else if (find_flag == HA_READ_BEFORE_KEY) {
+ /*
+ We get here for queries like
+ select * from t1
+ where pk <1007 order by pk desc limit 2 for update
+ select * from t1
+ where pk >=800 and pk <1007 order by pk desc limit 2 for update
+ */
+
+ // Move the right endpoint from slice to end_slice
+ end_slice= slice;
+
+ if (end_key) {
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size=
+ kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key, end_key->keypart_map);
+
+ slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+
+      // end_has_inf_suffix is false, because we're looking for key < const
+ } else {
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+
+ big_range= true;
+ }
+ } else if (end_key) {
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else {
+ // Unknown type of range, shouldn't happen
+ DBUG_ASSERT(0);
+ big_range = true;
+ }
+
+ // Known end range bounds: HA_READ_AFTER_KEY, HA_READ_BEFORE_KEY
+ if (end_key->flag == HA_READ_AFTER_KEY) {
+ // this is "key_part <= const".
+ end_has_inf_suffix= true;
+ } else if (end_key->flag == HA_READ_BEFORE_KEY) {
+ // this is "key_part < const", non-inclusive.
+ end_has_inf_suffix= false;
+ } else {
+ // Unknown type of range, shouldn't happen
+ DBUG_ASSERT(0);
+ big_range = true;
+ }
+
+ uchar pack_buffer[MAX_KEY_LENGTH];
+ uint end_slice_size= kd.pack_index_tuple(table, pack_buffer, end_slice_buf,
+ end_key->key,
+ end_key->keypart_map);
+
+ end_slice= rocksdb::Slice(reinterpret_cast<char *>(end_slice_buf),
+ end_slice_size);
+ } else {
+ big_range= true;
+#if 0
+ // The below is code to handle this without LockingIterator:
+ // No end key
+ // Known start range bounds: HA_READ_KEY_OR_NEXT, HA_READ_AFTER_KEY
+ if (find_flag == HA_READ_KEY_OR_NEXT)
+ start_has_inf_suffix= false;
+ else if (find_flag == HA_READ_AFTER_KEY)
+ start_has_inf_suffix= true;
+ else
+ DBUG_ASSERT(0);
+
+ uint end_slice_size;
+ kd.get_infimum_key(end_slice_buf, &end_slice_size);
+ end_slice= rocksdb::Slice((char*)end_slice_buf, end_slice_size);
+ end_has_inf_suffix= true;
+#endif
+ }
+
+ if (big_range) {
+ *use_locking_iterator= true;
+ return 0;
+ }
+
+ rocksdb::Endpoint start_endp;
+ rocksdb::Endpoint end_endp;
+
+ if (kd.m_is_reverse_cf) {
+ // Flip the endpoint flag values, as explained in the
+ // RangeFlagsShouldBeFlippedForRevCF comment above.
+ start_endp =rocksdb::Endpoint(end_slice, !end_has_inf_suffix);
+ end_endp = rocksdb::Endpoint(slice, !start_has_inf_suffix);
+ } else {
+ start_endp= rocksdb::Endpoint(slice, start_has_inf_suffix);
+ end_endp= rocksdb::Endpoint(end_slice, end_has_inf_suffix);
+ }
+
+ /*
+ RocksDB's iterator is reading the snapshot of the data that was taken at
+ the time the iterator was created.
+
+ After we've got a lock on the range, we'll need to refresh the iterator
+ to read the latest contents. (If we use the iterator created before the
+ lock_range() call, we may miss the changes that were made/committed after
+ the iterator was created but before the lock_range() call was made).
+
+ RocksDB has Iterator::Refresh() method, but alas, it is not implemented for
+ the iterator returned by Transaction object (Transaction object returns
+ BaseDeltaIterator which allows one to see the transactions's own changes).
+
+ Our solution to this is to release the iterator and create the new one.
+ We release it here, it will be created as soon as there's a need to read
+ records.
+ */
+ m_iterator->reset();
+
+ auto s= tx->lock_range(kd.get_cf(), start_endp, end_endp);
+ if (!s.ok()) {
+ return (tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ return 0;
+}
+
/*
See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
index navigation commands are converted into RocksDB lookup commands.
@@ -10015,7 +10498,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
}
}
- if (rc) {
+ if (rc != HA_EXIT_SUCCESS) {
break;
}
@@ -10026,7 +10509,7 @@ int ha_rocksdb::index_next_with_direction_intern(uchar *const buf,
table->m_status = 0;
rc = 0;
} else if (active_index == table->s->primary_key) {
- if (m_lock_rows != RDB_LOCK_NONE) {
+ if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) {
DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
/* We need to put a lock and re-read */
bool skip_row = false;
@@ -10222,7 +10705,9 @@ int ha_rocksdb::index_read_intern(uchar *const buf, bool first) {
void ha_rocksdb::unlock_row() {
DBUG_ENTER_FUNC();
- if (m_lock_rows != RDB_LOCK_NONE) {
+ // Don't release the lock when using range locking.
+ // This breaks m_row_lock_count
+ if (m_lock_rows != RDB_LOCK_NONE && !m_use_range_locking) {
Rdb_transaction *const tx = get_or_create_tx(table->in_use);
tx->release_lock(*m_pk_descr,
std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
@@ -10694,6 +11179,8 @@ int ha_rocksdb::check_and_lock_sk(
lock for this key.
*/
if (!(key_info->flags & HA_NOSAME)) {
+ if (rocksdb_use_range_locking)
+ return check_and_lock_non_unique_sk(key_id, row_info);
return HA_EXIT_SUCCESS;
}
@@ -10810,6 +11297,57 @@ int ha_rocksdb::check_and_lock_sk(
return rc;
}
+
+/**
+ @brief
+ Lock the non-unique sk for range locking
+*/
+int ha_rocksdb::check_and_lock_non_unique_sk(
+ const uint key_id, const struct update_row_info &row_info) {
+
+ DBUG_ASSERT(rocksdb_use_range_locking);
+ const Rdb_key_def &kd = *m_key_descr_arr[key_id];
+ bool store_row_debug_checksums = should_store_row_debug_checksums();
+
+ if (row_info.old_data != nullptr) {
+ rocksdb::Slice old_key_slice;
+ int old_packed_size;
+
+ old_packed_size = kd.pack_record(
+ table, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
+ &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
+ nullptr, m_ttl_bytes);
+
+ old_key_slice = rocksdb::Slice(
+ reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);
+
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), old_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+ }
+
+ int new_packed_size;
+ rocksdb::Slice new_key_slice;
+ rocksdb::Slice new_value_slice;
+ new_packed_size =
+ kd.pack_record(table, m_pack_buffer, row_info.new_data,
+ m_sk_packed_tuple, &m_sk_tails, 0,
+ row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);
+ new_key_slice = rocksdb::Slice(
+ reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
+
+ auto s= row_info.tx->lock_singlepoint_range(kd.get_cf(), new_key_slice);
+ if (!s.ok()) {
+ return (row_info.tx->set_status_error(table->in_use, s, kd,
+ m_tbl_def, m_table_handler));
+ }
+
+ return HA_EXIT_SUCCESS;
+}
+
+
/**
Enumerate all keys to check their uniquess and also lock it
@@ -11606,6 +12144,19 @@ int ha_rocksdb::delete_row(const uchar *const buf) {
nullptr, false, hidden_pk_id);
rocksdb::Slice secondary_key_slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
+
+ /*
+      For point locking, deleting a secondary key entry doesn't need any locks.
+ Range locking must get a lock.
+ */
+ if (rocksdb_use_range_locking) {
+ auto s= tx->lock_singlepoint_range(kd.get_cf(), secondary_key_slice);
+ if (!s.ok()) {
+ DBUG_RETURN(tx->set_status_error(table->in_use, s, kd, m_tbl_def,
+ m_table_handler));
+ }
+ }
+
tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
secondary_key_slice);
bytes_written += secondary_key_slice.size();
@@ -12189,8 +12740,15 @@ int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
}
}
tx->m_n_mysql_tables_in_use++;
- rocksdb_register_tx(rocksdb_hton, thd, tx);
+ rocksdb_register_tx(rocksdb_hton, thd, tx, (m_lock_rows != RDB_LOCK_NONE));
tx->io_perf_start(&m_io_perf);
+
+ m_use_range_locking= false;
+ if (rocksdb_use_range_locking &&
+ m_lock_rows != RDB_LOCK_NONE &&
+ my_core::thd_tx_isolation(thd) >= ISO_REPEATABLE_READ) {
+ m_use_range_locking= true;
+ }
}
DBUG_RETURN(res);
@@ -12217,7 +12775,7 @@ int ha_rocksdb::start_stmt(THD *const thd,
Rdb_transaction *const tx = get_or_create_tx(thd);
read_thd_vars(thd);
- rocksdb_register_tx(ht, thd, tx);
+ rocksdb_register_tx(ht, thd, tx, (m_lock_rows != RDB_LOCK_NONE));
tx->io_perf_start(&m_io_perf);
DBUG_RETURN(HA_EXIT_SUCCESS);
@@ -14713,6 +15271,36 @@ static int show_rocksdb_stall_vars(THD *thd MY_ATTRIBUTE((unused)),
return 0;
}
+//
+// Lock Tree Status variables
+//
+static longlong rocksdb_locktree_escalation_count=1234;
+static longlong rocksdb_locktree_current_lock_memory=0;
+
+static SHOW_VAR rocksdb_locktree_status_variables[] = {
+ DEF_STATUS_VAR_FUNC("escalation_count",
+ &rocksdb_locktree_escalation_count, SHOW_LONGLONG),
+ DEF_STATUS_VAR_FUNC("current_lock_memory",
+ &rocksdb_locktree_current_lock_memory, SHOW_LONGLONG),
+ // end of the array marker
+ {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
+
+static SHOW_VAR rocksdb_empty_status_variables[] = {
+ {NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
+
+static void show_rocksdb_locktree_vars(THD*, SHOW_VAR *var, char*) {
+ var->type = SHOW_ARRAY;
+ if (range_lock_mgr)
+ {
+ auto status = range_lock_mgr->GetStatus();
+ rocksdb_locktree_escalation_count = status.escalation_count;
+ rocksdb_locktree_current_lock_memory = status.current_lock_memory;
+ var->value = reinterpret_cast<char *>(&rocksdb_locktree_status_variables);
+ }
+ else
+ var->value = reinterpret_cast<char *>(&rocksdb_empty_status_variables);
+}
+
static SHOW_VAR rocksdb_status_vars[] = {
DEF_STATUS_VAR(block_cache_miss),
DEF_STATUS_VAR(block_cache_hit),
@@ -14842,6 +15430,8 @@ static SHOW_VAR rocksdb_status_vars[] = {
SHOW_SCOPE_GLOBAL},
{"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
SHOW_FUNC, SHOW_SCOPE_GLOBAL},
+ {"rocksdb_locktree", reinterpret_cast<char *>(show_rocksdb_locktree_vars),
+ SHOW_FUNC, SHOW_SCOPE_GLOBAL},
{NullS, NullS, SHOW_LONG, SHOW_SCOPE_GLOBAL}};
/*
@@ -15834,6 +16424,23 @@ void rocksdb_set_delayed_write_rate(THD *thd MY_ATTRIBUTE((unused)),
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
+void rocksdb_set_max_lock_memory(THD *thd, struct SYS_VAR*,
+ void* /*var_ptr*/, const void *save) {
+ const uint64_t new_val = *static_cast<const uint64_t *>(save);
+ if (rocksdb_max_lock_memory != new_val) {
+ if (range_lock_mgr->SetMaxLockMemory(new_val)) {
+ /* NO_LINT_DEBUG */
+ sql_print_warning("MyRocks: failed to set max_lock_memory");
+ push_warning_printf(thd, Sql_condition::SL_WARNING,
+ ER_ERROR_WHEN_EXECUTING_COMMAND,
+ "Cannot set max_lock_memory to size below currently used");
+ } else {
+ // Succeeded
+ rocksdb_max_lock_memory = new_val;
+ }
+ }
+}
+
void rocksdb_set_max_latest_deadlocks(
THD *thd MY_ATTRIBUTE((unused)), struct SYS_VAR *var MY_ATTRIBUTE((unused)),
void *var_ptr MY_ATTRIBUTE((unused)), const void *save) {
@@ -15841,7 +16448,13 @@ void rocksdb_set_max_latest_deadlocks(
const uint32_t new_val = *static_cast<const uint32_t *>(save);
if (rocksdb_max_latest_deadlocks != new_val) {
rocksdb_max_latest_deadlocks = new_val;
- rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+ if (range_lock_mgr) {
+ auto n= rocksdb_max_latest_deadlocks;
+ range_lock_mgr->SetRangeDeadlockInfoBufferSize(n);
+ }
+ else
+ rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
+
}
RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
@@ -16556,11 +17169,24 @@ const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx) {
}
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ Rdb_transaction *tx, rocksdb::ColumnFamilyHandle *const column_family,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ bool skip_bloom_filter, const rocksdb::Slice &lower_bound_slice,
+ const rocksdb::Slice &upper_bound_slice, bool read_current,
+ bool create_snapshot) {
+ return tx->get_iterator(column_family, kd, skip_bloom_filter, lower_bound_slice,
+ upper_bound_slice, read_current, create_snapshot);
+}
+
+
+rocksdb::Iterator *rdb_tx_get_iterator(
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current,
- bool create_snapshot) {
+ bool create_snapshot, bool use_locking_iter) {
if (commit_in_the_middle(thd)) {
DBUG_ASSERT(snapshot && *snapshot == nullptr);
if (snapshot) {
@@ -16575,8 +17201,8 @@ rocksdb::Iterator *rdb_tx_get_iterator(
}
} else {
Rdb_transaction *tx = get_tx_from_thd(thd);
- return tx->get_iterator(cf, skip_bloom_filter, eq_cond_lower_bound,
- eq_cond_upper_bound, read_current, create_snapshot);
+ return tx->get_iterator(cf, kd, skip_bloom_filter, eq_cond_lower_bound,
+ eq_cond_upper_bound, read_current, create_snapshot, use_locking_iter);
}
}
@@ -16610,6 +17236,15 @@ void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
tx->release_lock(kd, std::string(key.data(), key.size()), force);
}
+
+rocksdb::Status rdb_tx_lock_range(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const rocksdb::Endpoint &start_key,
+ const rocksdb::Endpoint &end_key)
+{
+ return tx->lock_range(kd.get_cf(), start_key, end_key);
+}
+
void rdb_tx_multi_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
const size_t num_keys, const rocksdb::Slice *keys,
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 2af4013bf72..bdb4ba3bab9 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -90,6 +90,8 @@ extern PSI_rwlock_key key_rwlock_read_free_rpl_tables;
#endif
extern Regex_list_handler rdb_read_free_regex_handler;
+extern bool rocksdb_use_range_locking;
+
/**
@brief
Rdb_table_handler is a reference-counted structure storing information for
@@ -274,6 +276,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
/* Type of locking to apply to rows */
Rdb_lock_type m_lock_rows;
+ bool m_use_range_locking;
+
thr_locked_row_action m_locked_row_action;
/* true means we're doing an index-only read. false means otherwise. */
@@ -338,6 +342,13 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
const Rdb_key_def &kd,
const rocksdb::Slice &key) const;
+ int set_range_lock(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const enum ha_rkey_function &find_flag,
+ const rocksdb::Slice &slice,
+ const key_range *const end_key,
+ bool *use_locking_iterator);
+
int get_row_by_rowid(uchar *const buf, const char *const rowid,
const uint rowid_size, bool *skip_row = nullptr,
const bool skip_lookup = false,
@@ -792,6 +803,10 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
const struct update_row_info &row_info,
bool *const found, const bool skip_unique_check)
MY_ATTRIBUTE((__warn_unused_result__));
+
+ int check_and_lock_non_unique_sk(const uint key_id,
+ const struct update_row_info &row_info)
+ MY_ATTRIBUTE((__warn_unused_result__));
int check_uniqueness_and_lock(const struct update_row_info &row_info,
bool pk_changed, const bool skip_unique_check)
MY_ATTRIBUTE((__warn_unused_result__));
@@ -988,6 +1003,8 @@ class ha_rocksdb : public my_core::handler, public blob_buffer {
/* Need to build decoder on next read operation */
bool m_need_build_decoder;
+
+ int iter_status_to_retval(rocksdb::Iterator *it, const Rdb_key_def &kd, int not_found_code);
};
/*
@@ -1125,11 +1142,14 @@ Rdb_transaction *get_tx_from_thd(THD *const thd);
const rocksdb::ReadOptions &rdb_tx_acquire_snapshot(Rdb_transaction *tx);
rocksdb::Iterator *rdb_tx_get_iterator(
- THD *thd, rocksdb::ColumnFamilyHandle *const cf, bool skip_bloom_filter,
+ THD *thd, rocksdb::ColumnFamilyHandle *const cf,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ bool skip_bloom_filter,
const rocksdb::Slice &eq_cond_lower_bound,
const rocksdb::Slice &eq_cond_upper_bound,
const rocksdb::Snapshot **snapshot, bool read_current = false,
- bool create_snapshot = true);
+ bool create_snapshot = true,
+ bool use_locking_iter= false);
rocksdb::Status rdb_tx_get(Rdb_transaction *tx,
rocksdb::ColumnFamilyHandle *const column_family,
@@ -1142,6 +1162,11 @@ rocksdb::Status rdb_tx_get_for_update(Rdb_transaction *tx,
rocksdb::PinnableSlice *const value,
bool exclusive, bool skip_wait);
+rocksdb::Status rdb_tx_lock_range(Rdb_transaction *tx,
+ const Rdb_key_def &kd,
+ const rocksdb::Endpoint &start_key,
+ const rocksdb::Endpoint &end_key);
+
void rdb_tx_release_lock(Rdb_transaction *tx, const Rdb_key_def &kd,
const rocksdb::Slice &key, bool force);
@@ -1203,4 +1228,6 @@ extern std::atomic<uint64_t> rocksdb_partial_index_rows_sorted;
extern std::atomic<uint64_t> rocksdb_partial_index_rows_materialized;
extern bool rocksdb_enable_tmp_table;
+extern std::shared_ptr<rocksdb::RangeLockManagerHandle> range_lock_mgr;
+
} // namespace myrocks
diff --git a/storage/rocksdb/locking-iterator-partial-index.txt b/storage/rocksdb/locking-iterator-partial-index.txt
new file mode 100644
index 00000000000..d3ffb41d8f0
--- /dev/null
+++ b/storage/rocksdb/locking-iterator-partial-index.txt
@@ -0,0 +1,120 @@
+
+This document describes how Locking Reads are implemented with partial index
+iterator (Rdb_iterator_partial)
+
+== Closed ranges ==
+ha_rocksdb will make the lock_range() call before making any reads.
+
+We just need to make sure that after that, the iterator reads the latest
+committed data (not the data from the read snapshot).
+
+== Open ranges and LockingIterator ==
+
+With Open Ranges, regular indexes use LockingIterator.
+
+How does one make Rdb_iterator_partial use the LockingIterator?
+
+=== Background info: Rdb_iterator_partial ===
+
+Partial iterator is used for partial secondary keys.
+
+PK and SK share the same key prefix (see 'partial_group_keyparts=N' parameter).
+
+For a given value of key prefix:
+
+* if the number of rows exceeds 'partial_group_threshold' parameter, one reads
+ the secondary index, as usual (the SK is "materialized" for this prefix)
+
+* if the number of rows is less than 'partial_group_threshold', then the SK is
+ "not materialized" and has no rows for the prefix. The rows are obtained by
+ reading the records using PK, buffering+sorting them in memory, and then
+ returning them.
+
+* Switching from non-materialized to materialized is done "lazily" on-read.
+ The write locks the group prefix in SK (see
+ Rdb_iterator_partial::materialize_prefix())
+
+* Update/Delete operations also lock the group prefix in SK (see
+ ha_rocksdb::acquire_prefix_lock)
+
+* De-materialization is currently not performed.
+
+=== Regular reads with Rdb_iterator_partial ===
+
+Regular (that is, non locking) reads are done as follows:
+
+Step #1: first, we need to figure out the key prefix we're going to read.
+There are two possibilities:
+
+A. It can be inferred from the lookup key value.
+B. It cannot be inferred (e.g. we scan from the start of the table).
+ In this case, we read from the PK to find out the first key prefix.
+
+See Rdb_iterator_partial::get_prefix_from_start().
+
+Step #2 is to read rows within this prefix.
+
+We first try to read through the SK. If it has a row within the prefix, it
+means this prefix is materialized and we continue to read from the SK within
+the bounds of the key prefix.
+
+If the SK has no data we read all rows through the PK, buffer, sort them, and
+return. (See Rdb_iterator_partial::read_prefix_from_pk())
+
+Step #3:
+When we have exhausted rows in this key prefix, we check if we need to continue
+the scan.
+If we do, we take the current prefix value and try to get the next row after
+it using the approach in Step #1. See Rdb_iterator_partial::get_next_prefix().
+
+=== Locking Reads with Rdb_iterator_partial ===
+
+This section describes how one can perform locking reads for Steps 1-3 from the
+previous section.
+
+Step #1:
+for Case A, there's no need to lock anything.
+
+for Case B, we need to lock the range from the lookup key to the start of
+the first key prefix. This can be achieved by making m_iterator_pk use
+a LockingIterator.
+
+(PK iterator uses a prefix of the lookup key, so we may end up with a
+coarser-grained lock, but this is still correct).
+
+Step #2:
+
+Quoting the previous section:
+
+> We first try to read through the SK.
+> If it has a row within the prefix, it means this prefix is materialized and we
+> continue to read from the SK within the bounds of the key prefix.
+
+If we use a LockingIterator for scanning the SK, we are doing a locking read
+and have achieved our goal.
+
+> If the SK has no data we read all rows through the PK
+
+Suppose we use a LockingIterator to try read through the SK. The read is done
+with this call in Rdb_iterator_partial::seek_next_prefix:
+
+ rc = Rdb_iterator_base::seek(
+ direction ? HA_READ_KEY_EXACT : HA_READ_PREFIX_LAST, cur_prefix_key,
+ false, empty_end_key);
+
+If the SK has no data for the cur_prefix_key, the LockingIterator will lock the
+range before returning from this call.
+Note that it will lock just this key prefix: this is achieved by use of iterator
+bounds.
+
+If there is a Materialization operation in progress, the locking read will
+eventually come into conflict with it. To avoid hitting the conflict at the
+end, Rdb_iterator_partial::materialize_prefix() is made to acquire a lock
+on the whole prefix when Range Locking is used.
+
+Step #3: The same approach as in step #1. If we use LockingIterator for the PK
+it will correctly lock the gap between the prefixes (or just the gap after the
+last prefix if there are no further prefixes).
+
+This way, one can provide LockingIterator semantics with Rdb_iterator_partial.
+
diff --git a/storage/rocksdb/nosql_access.cc b/storage/rocksdb/nosql_access.cc
index 37da5741cb4..2cdadf4caf2 100644
--- a/storage/rocksdb/nosql_access.cc
+++ b/storage/rocksdb/nosql_access.cc
@@ -727,10 +727,11 @@ class select_exec {
}
rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle *cf,
+ const std::shared_ptr<Rdb_key_def> &kd,
bool use_bloom,
const rocksdb::Slice &lower_bound,
const rocksdb::Slice &upper_bound) {
- return rdb_tx_get_iterator(m_thd, cf, !use_bloom, lower_bound,
+ return rdb_tx_get_iterator(m_thd, cf, kd, !use_bloom, lower_bound,
upper_bound, nullptr);
}
@@ -1569,6 +1570,7 @@ bool INLINE_ATTR select_exec::setup_iterator(THD *thd) {
} else {
m_iterator.reset(
new Rdb_iterator_base(thd, m_key_def, m_pk_def, m_tbl_def));
+// m_lower_bound_slice, m_upper_bound_slice);
}
return m_iterator == nullptr;
diff --git a/storage/rocksdb/rdb_i_s.cc b/storage/rocksdb/rdb_i_s.cc
index 3c5f6ef7b38..13c205b8908 100644
--- a/storage/rocksdb/rdb_i_s.cc
+++ b/storage/rocksdb/rdb_i_s.cc
@@ -1772,6 +1772,61 @@ static ST_FIELD_INFO rdb_i_s_lock_info_fields_info[] = {
ROCKSDB_FIELD_INFO("MODE", 32, MYSQL_TYPE_STRING, 0),
ROCKSDB_FIELD_INFO_END};
+
+// Dump the locked key (or range) into a string. A template and its
+// specializations
+template <typename LockInfo> std::string dump_key(const LockInfo& info);
+
+// Specialization for point lock manager.
+template<>
+std::string dump_key<rocksdb::KeyLockInfo>(const rocksdb::KeyLockInfo& info) {
+ return rdb_hexdump(info.key.c_str(), info.key.length(), FN_REFLEN);
+}
+
+// Specialization for Range Lock manager.
+template<>
+std::string dump_key<rocksdb::RangeLockInfo>(
+ const rocksdb::RangeLockInfo &info) {
+ // todo: Do we need to enforce the total length limit of FN_REFLEN?
+ return rdb_hexdump_range(info.start, info.end);
+}
+
+
+//
+// A template that walks the Lock info data structure and dumps its contents.
+//
+template <typename LockInfo>
+int dump_locks(my_core::THD *thd, my_core::TABLE *table, LockInfo& lock_info) {
+ for (const auto &lock : lock_info) {
+ const uint32_t cf_id = lock.first;
+ const auto &one_lock_info = lock.second;
+
+ // Call the templated function to get string representation of the acquired
+ // lock
+ std::string key_hexstr = dump_key(one_lock_info);
+
+ for (const auto &id : one_lock_info.ids) {
+ table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id, true);
+ table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
+
+ table->field[RDB_LOCKS_FIELD::KEY]->store(key_hexstr.c_str(),
+ key_hexstr.size(),
+ system_charset_info);
+ table->field[RDB_LOCKS_FIELD::MODE]->store(
+ one_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
+
+ /* Tell MySQL about this row in the virtual table */
+ int ret = static_cast<int>(
+ my_core::schema_table_store_record(thd, table));
+
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
/* Fill the information_schema.rocksdb_locks virtual table */
static int rdb_i_s_lock_info_fill_table(
my_core::THD *const thd, my_core::TABLE_LIST *const tables,
@@ -1791,36 +1846,16 @@ static int rdb_i_s_lock_info_fill_table(
DBUG_RETURN(ret);
}
- /* cf id -> rocksdb::KeyLockInfo */
- std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo> lock_info =
- rdb->GetLockStatusData();
-
- for (const auto &lock : lock_info) {
- const uint32_t cf_id = lock.first;
- const auto &key_lock_info = lock.second;
- const auto key_hexstr = rdb_hexdump(key_lock_info.key.c_str(),
- key_lock_info.key.length(), FN_REFLEN);
-
- for (const auto &id : key_lock_info.ids) {
- tables->table->field[RDB_LOCKS_FIELD::COLUMN_FAMILY_ID]->store(cf_id,
- true);
- tables->table->field[RDB_LOCKS_FIELD::TRANSACTION_ID]->store(id, true);
-
- tables->table->field[RDB_LOCKS_FIELD::KEY]->store(
- key_hexstr.c_str(), key_hexstr.size(), system_charset_info);
- tables->table->field[RDB_LOCKS_FIELD::MODE]->store(
- key_lock_info.exclusive ? "X" : "S", 1, system_charset_info);
-
- /* Tell MySQL about this row in the virtual table */
- ret = static_cast<int>(
- my_core::schema_table_store_record(thd, tables->table));
-
- if (ret != 0) {
- break;
- }
- }
+ if (range_lock_mgr) {
+ // Use Range Lock Manager's interface for obtaining more specific
+ // information about the acquired locks
+ auto lock_info = range_lock_mgr->GetRangeLockStatusData();
+ ret= dump_locks(thd, tables->table, lock_info);
+ } else {
+ // Obtain information from point lock manager
+ auto lock_info = rdb->GetLockStatusData();
+ ret= dump_locks(thd, tables->table, lock_info);
}
-
DBUG_RETURN(ret);
}
diff --git a/storage/rocksdb/rdb_iterator.cc b/storage/rocksdb/rdb_iterator.cc
index 11e538370fd..bfe38c23199 100644
--- a/storage/rocksdb/rdb_iterator.cc
+++ b/storage/rocksdb/rdb_iterator.cc
@@ -36,6 +36,8 @@ Rdb_iterator_base::Rdb_iterator_base(THD *thd,
m_tbl_def(tbl_def),
m_thd(thd),
m_scan_it(nullptr),
+ m_iter_uses_locking(false),
+ m_iter_should_use_locking(false),
m_scan_it_skips_bloom(false),
m_scan_it_snapshot(nullptr),
m_scan_it_lower_bound(nullptr),
@@ -83,7 +85,7 @@ int Rdb_iterator_base::read_before_key(const bool full_key_match,
return HA_EXIT_SUCCESS;
}
- return HA_ERR_END_OF_FILE;
+ return iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
@@ -98,12 +100,15 @@ int Rdb_iterator_base::read_after_key(const rocksdb::Slice &key_slice) {
*/
rocksdb_smart_seek(m_kd->m_is_reverse_cf, m_scan_it, key_slice);
- return is_valid_iterator(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_END_OF_FILE;
+ return is_valid_iterator(m_scan_it) ?
+ HA_EXIT_SUCCESS :
+ iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
}
void Rdb_iterator_base::release_scan_iterator() {
delete m_scan_it;
m_scan_it = nullptr;
+ m_iter_uses_locking = false;
if (m_scan_it_snapshot) {
auto rdb = rdb_get_rocksdb_db();
@@ -154,7 +159,8 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
and
re-create Iterator.
*/
- if (m_scan_it_skips_bloom != skip_bloom) {
+ if (m_scan_it_skips_bloom != skip_bloom ||
+ m_iter_uses_locking != m_iter_should_use_locking) {
release_scan_iterator();
}
@@ -164,10 +170,12 @@ void Rdb_iterator_base::setup_scan_iterator(const rocksdb::Slice *const slice,
*/
if (!m_scan_it) {
m_scan_it = rdb_tx_get_iterator(
- m_thd, m_kd->get_cf(), skip_bloom, m_scan_it_lower_bound_slice,
+ m_thd, m_kd->get_cf(), m_kd, skip_bloom,
+ m_scan_it_lower_bound_slice,
m_scan_it_upper_bound_slice, &m_scan_it_snapshot, read_current,
- !read_current);
+ !read_current, m_iter_should_use_locking);
m_scan_it_skips_bloom = skip_bloom;
+ m_iter_uses_locking = m_iter_should_use_locking;
}
}
@@ -208,6 +216,20 @@ int Rdb_iterator_base::calc_eq_cond_len(enum ha_rkey_function find_flag,
return Rdb_key_def::INDEX_NUMBER_SIZE;
}
+int Rdb_iterator_base::iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code) {
+ if (it->Valid())
+ return HA_EXIT_SUCCESS;
+
+ rocksdb::Status s= it->status();
+ if (s.ok() || s.IsNotFound())
+ return not_found_code;
+
+ Rdb_transaction *tx = get_tx_from_thd(m_thd);
+ return rdb_tx_set_status_error(tx, s, *kd, m_tbl_def);
+}
+
int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
int rc = 0;
const auto &kd = *m_kd;
@@ -237,7 +259,7 @@ int Rdb_iterator_base::next_with_direction(bool move_forward, bool skip_next) {
}
if (!is_valid_iterator(m_scan_it)) {
- rc = HA_ERR_END_OF_FILE;
+ rc = iter_status_to_retval(m_scan_it, m_kd, HA_ERR_END_OF_FILE);
break;
}
@@ -617,8 +639,16 @@ int Rdb_iterator_partial::materialize_prefix() {
const char *old_proc_info = m_thd->get_proc_info();
thd_proc_info(m_thd, "Materializing group in partial index");
- auto s =
- rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, true, false);
+ rocksdb::Status s;
+ if (rocksdb_use_range_locking) {
+ rocksdb::Endpoint start_endp(cur_prefix_key, false);
+ rocksdb::Endpoint end_endp(cur_prefix_key, true);
+ s = rdb_tx_lock_range(tx, *m_kd, start_endp, end_endp);
+ } else {
+ s = rdb_tx_get_for_update(tx, *m_kd, cur_prefix_key, nullptr, true,
+ false);
+ }
+
if (!s.ok()) {
thd_proc_info(m_thd, old_proc_info);
return rdb_tx_set_status_error(tx, s, *m_kd, m_tbl_def);
@@ -815,7 +845,14 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
return HA_ERR_INTERNAL_ERROR;
}
+ // Save the value because reset() clears it:
+ auto save_iter_should_use_locking= m_iter_should_use_locking;
reset();
+ m_iter_should_use_locking= save_iter_should_use_locking;
+ // Range Locking: when the iterator does a locking read,
+ // both secondary and primary key iterators should use locking reads.
+ if (m_iter_should_use_locking)
+ m_iterator_pk.set_use_locking();
bool direction = (find_flag == HA_READ_KEY_EXACT) ||
(find_flag == HA_READ_AFTER_KEY) ||
@@ -831,6 +868,15 @@ int Rdb_iterator_partial::seek(enum ha_rkey_function find_flag,
m_cur_prefix_key_len);
m_kd->get_infimum_key(m_cur_prefix_key, &tmp);
+ //
+ // Range Locking note: When using a locking iterator (i.e.
+ // m_iter_should_use_locking=true), this will read (and lock)
+      // all the value space up to the next prefix.
+ // If the next prefix is not materialized, it will lock the whole
+ // prefix in the secondary key. It will not lock more than that,
+      // because the iterator uses the iterator bounds to limit the scan
+ // to the prefix specified.
+ //
rc = Rdb_iterator_base::seek(find_flag, start_key, true, end_key,
read_current);
diff --git a/storage/rocksdb/rdb_iterator.h b/storage/rocksdb/rdb_iterator.h
index 7fd1ff470c7..0a516c384f5 100644
--- a/storage/rocksdb/rdb_iterator.h
+++ b/storage/rocksdb/rdb_iterator.h
@@ -69,6 +69,8 @@ class Rdb_iterator {
virtual rocksdb::Slice key() = 0;
virtual rocksdb::Slice value() = 0;
virtual void reset() = 0;
+
+ virtual void set_use_locking()=0;
};
class Rdb_iterator_base : public Rdb_iterator {
@@ -107,8 +109,12 @@ class Rdb_iterator_base : public Rdb_iterator {
rocksdb::Slice value() override { return m_scan_it->value(); }
- void reset() override { release_scan_iterator(); }
+ void reset() override {
+ m_iter_should_use_locking = false;
+ release_scan_iterator();
+ }
+ void set_use_locking() override { m_iter_should_use_locking = true; }
protected:
friend class Rdb_iterator;
const std::shared_ptr<Rdb_key_def> m_kd;
@@ -123,6 +129,18 @@ class Rdb_iterator_base : public Rdb_iterator {
/* Iterator used for range scans and for full table/index scans */
rocksdb::Iterator *m_scan_it;
+ /* Whether m_scan_it is a locking iterator */
+ bool m_iter_uses_locking;
+
+ /*
+ Whether the iterator should use locking. The intended workflow is:
+
+ iter.set_use_locking() // sets m_iter_should_use_locking=true
+ iter.seek() // this will re-create m_scan_it to be the right kind
+ // of iterator
+ */
+ bool m_iter_should_use_locking;
+
/* Whether m_scan_it was created with skip_bloom=true */
bool m_scan_it_skips_bloom;
@@ -136,8 +154,17 @@ class Rdb_iterator_base : public Rdb_iterator {
uchar *m_prefix_buf;
rocksdb::Slice m_prefix_tuple;
+
+ int iter_status_to_retval(rocksdb::Iterator *it,
+ const std::shared_ptr<Rdb_key_def> kd,
+ int not_found_code);
};
+/*
+ Iterator for reading partial secondary indexes
+
+  It can do locking reads, see locking-iterator-partial-index.txt for details.
+*/
class Rdb_iterator_partial : public Rdb_iterator_base {
private:
TABLE *m_table;
diff --git a/storage/rocksdb/rdb_locking_iter.cc b/storage/rocksdb/rdb_locking_iter.cc
new file mode 100644
index 00000000000..35b3763f7f3
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.cc
@@ -0,0 +1,285 @@
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+
+/* This C++ file's header file */
+#include "./rdb_locking_iter.h"
+
+namespace myrocks {
+
+rocksdb::Iterator* GetLockingIterator(
+ rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ ulonglong *counter) {
+ return new LockingIterator(trx, column_family, kd, read_options,
+ counter);
+}
+
+/*
+ @brief
+ Seek to the first key K that is equal or greater than target,
+ locking the range [target; K].
+*/
+
+void LockingIterator::Seek(const rocksdb::Slice& target) {
+ m_have_locked_until = false;
+ m_iter = m_txn->GetIterator(m_read_opts, m_cfh);
+ m_iter->Seek(target);
+ ScanForward(target, false);
+}
+
+void LockingIterator::SeekForPrev(const rocksdb::Slice& target) {
+ m_have_locked_until = false;
+ m_iter = m_txn->GetIterator(m_read_opts, m_cfh);
+ m_iter->SeekForPrev(target);
+ ScanBackward(target, false);
+}
+
+/*
+ @brief
+ Move the iterator to the next key, locking the range between the current
+ and the next key.
+
+ @detail
+ Implementation is similar to Seek(next_key). Since we don't know what the
+ next_key is, we reach it by calling { Seek(current_key); Next(); }
+*/
+void LockingIterator::Next() {
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.LockingIterator.Next");
+ assert(Valid());
+ // Save the current key value. We need it as the left endpoint
+ // of the range lock we're going to acquire
+ std::string current_key = m_iter->key().ToString();
+
+ m_iter->Next();
+ ScanForward(rocksdb::Slice(current_key), true);
+}
+
+/*
+ @brief
+ Move the iterator to the previous key, locking the range between the current
+ and the previous key.
+*/
+
+void LockingIterator::Prev() {
+ assert(Valid());
+
+ std::string current_key = m_iter->key().ToString();
+ m_iter->Prev();
+ ScanBackward(rocksdb::Slice(current_key), true);
+}
+
+/*
+ @brief
+ Lock range from target to end_key.
+
+ @detail
+ In forward-ordered scan, target < end_key. In backward-ordered scan, it's
+ other way around.
+
+ We might have already locked a subset of this range, a subrange that
+ starts from target and extends to some point between target and end_key.
+*/
+void LockingIterator::lock_up_to(bool scan_forward,
+ const rocksdb::Slice& target,
+ const rocksdb::Slice& end_key) {
+ const int inv = scan_forward ? 1 : -1;
+ auto cmp= m_cfh->GetComparator();
+ bool endp_arg= m_kd->m_is_reverse_cf;
+
+ if (m_have_locked_until &&
+ cmp->Compare(end_key, rocksdb::Slice(m_locked_until))*inv <= 0) {
+ // We've already locked this range. The following has happened:
+ // - m_iter->key() returned $KEY
+ // - other transaction(s) have inserted row $ROW before the $KEY.
+ // - we got a range lock on [range_start, $KEY]
+ // - we've read $ROW and returned.
+ // Now, we're looking to lock [$ROW, $KEY] but we don't need to,
+ // we already have a lock on this range.
+ } else {
+ if (scan_forward) {
+ m_status = m_txn->GetRangeLock(m_cfh,
+ rocksdb::Endpoint(target, endp_arg),
+ rocksdb::Endpoint(end_key, endp_arg));
+ } else {
+ m_status = m_txn->GetRangeLock(m_cfh,
+ rocksdb::Endpoint(end_key, endp_arg),
+ rocksdb::Endpoint(target, endp_arg));
+ }
+
+ if (!m_status.ok())
+ return;
+
+ // Save the bound where we locked until:
+ m_have_locked_until= true;
+ m_locked_until.assign(end_key.data(), end_key.size());
+ if (m_lock_count) (*m_lock_count)++;
+ }
+}
+
+/*
+  Lock the range from target till the iterator end point that we are scanning
+ towards. If there's no iterator bound, use index start (or end, depending
+ on the scan direction)
+*/
+void LockingIterator::lock_till_iterator_end(bool scan_forward,
+ const rocksdb::Slice& target) {
+ rocksdb::Slice end;
+ uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE];
+ uint size;
+ if (scan_forward) {
+ if (m_read_opts.iterate_upper_bound)
+ end = *m_read_opts.iterate_upper_bound;
+ else {
+ if (m_kd->m_is_reverse_cf)
+ m_kd->get_infimum_key(buf, &size);
+ else
+ m_kd->get_supremum_key(buf, &size);
+
+ DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
+ end = rocksdb::Slice((const char*)buf, size);
+ }
+ } else {
+ if (m_read_opts.iterate_lower_bound)
+ end = *m_read_opts.iterate_lower_bound;
+ else {
+ if (m_kd->m_is_reverse_cf)
+ m_kd->get_supremum_key(buf, &size);
+ else
+ m_kd->get_infimum_key(buf, &size);
+
+ DBUG_ASSERT(size == Rdb_key_def::INDEX_NUMBER_SIZE);
+ end = rocksdb::Slice((const char*)buf, size);
+ }
+ }
+ // This will set m_status accordingly
+ lock_up_to(scan_forward, target, end);
+}
+
+/*
+ Lock the range between [target, (current m_iter position)] and position
+ the iterator on the first record in it.
+
+ @param call_next false means current iterator position is achieved by
+ calling Seek(target).
+ true means one also needs to call Next()
+*/
+void LockingIterator::Scan(bool scan_forward,
+ const rocksdb::Slice& target, bool call_next) {
+ if (!m_iter->Valid()) {
+ m_status = m_iter->status();
+ m_valid = false;
+ if (m_status.ok()) {
+ // m_iter has reached EOF
+ lock_till_iterator_end(scan_forward, target);
+ }
+ return;
+ }
+
+ while (1) {
+
+ DEBUG_SYNC(my_core::thd_get_current_thd(), "rocksdb.locking_iter_scan");
+
+ if (my_core::thd_killed(current_thd)) {
+ m_status = rocksdb::Status::Aborted();
+ m_valid = false;
+ return;
+ }
+
+ const int inv = scan_forward ? 1 : -1;
+ auto cmp= m_cfh->GetComparator();
+
+ auto end_key = m_iter->key();
+ std::string end_key_copy= end_key.ToString();
+
+ lock_up_to(scan_forward, target, end_key);
+ if (!m_status.ok()) {
+ // Failed to get a lock (most likely lock wait timeout)
+ m_valid = false;
+ return;
+ }
+
+    // Ok, now we have a lock that inhibits modifications in the range.
+ // Somebody might have done external modifications, though:
+ // - removed the key we've found
+ // - added a key before that key.
+
+ // First, refresh the iterator:
+ delete m_iter;
+ m_iter = m_txn->GetIterator(m_read_opts, m_cfh);
+
+ // Then, try seeking to the same row
+ if (scan_forward)
+ m_iter->Seek(target);
+ else
+ m_iter->SeekForPrev(target);
+
+ if (call_next && m_iter->Valid() &&
+ !cmp->Compare(m_iter->key(), target)) {
+ if (scan_forward)
+ m_iter->Next();
+ else
+ m_iter->Prev();
+ }
+
+ if (m_iter->Valid()) {
+ if (cmp->Compare(m_iter->key(), rocksdb::Slice(end_key_copy))*inv <= 0) {
+ // Ok, the found key is within the locked range.
+ m_status = rocksdb::Status::OK();
+ m_valid= true;
+ break;
+ } else {
+ // We've got a key but it is outside the range we've locked.
+ // Re-try the lock-and-read step.
+ continue;
+ }
+ } else {
+ m_valid = false;
+ m_status = m_iter->status();
+ if (m_status.ok()) {
+ // m_iter has reached EOF
+ lock_till_iterator_end(scan_forward, target);
+ }
+ break;
+ }
+ }
+}
+
+/*
+ @detail
+ Ideally, this function should
+ - find the first key $first_key
+ - lock the range [-inf; $first_key]
+ - return, the iterator is positioned on $first_key
+
+ The problem here is that we cannot have "-infinity" bound.
+
+ Note: we don't have a practical use for this function - MyRocks always
+ searches within one index_name.table_name, which means we are only looking
+ at the keys with index_number as the prefix.
+*/
+
+void LockingIterator::SeekToFirst() {
+ DBUG_ASSERT(0);
+ m_status = rocksdb::Status::NotSupported("Not implemented");
+ m_valid = false;
+}
+
+/*
+ @detail
+ See SeekToFirst.
+*/
+
+void LockingIterator::SeekToLast() {
+ DBUG_ASSERT(0);
+ m_status = rocksdb::Status::NotSupported("Not implemented");
+ m_valid = false;
+}
+
+} // namespace myrocks
+
diff --git a/storage/rocksdb/rdb_locking_iter.h b/storage/rocksdb/rdb_locking_iter.h
new file mode 100644
index 00000000000..3cd09143649
--- /dev/null
+++ b/storage/rocksdb/rdb_locking_iter.h
@@ -0,0 +1,120 @@
+
+/* MySQL header files */
+#include "sql/handler.h" /* handler */
+#include "sql/debug_sync.h"
+#include "./rdb_threads.h" /* for thd_get_current_thd */
+
+/* MyRocks header files */
+#include "./ha_rocksdb.h"
+#include "./rdb_datadic.h"
+
+namespace myrocks {
+
+//
+// LockingIterator is an iterator that locks the rows before returning, as well
+// as scanned gaps between the rows.
+//
+// Example:
+// lock_iter= trx->GetLockingIterator();
+// lock_iter->Seek('abc');
+// lock_iter->Valid()==true && lock_iter->key() == 'bcd';
+//
+// After the above, the returned record 'bcd' is locked by transaction trx.
+// Also, the range between ['abc'..'bcd'] is empty and locked by trx.
+//
+// lock_iter->Next();
+// lock_iter->Valid()==true && lock_iter->key() == 'efg'
+//
+// Now, the range ['bcd'.. 'efg'] (bounds inclusive) is also locked, and there are no
+// records between 'bcd' and 'efg'.
+//
+class LockingIterator : public rocksdb::Iterator {
+
+ rocksdb::Transaction *m_txn;
+ rocksdb::ColumnFamilyHandle* m_cfh;
+ const std::shared_ptr<Rdb_key_def> &m_kd;
+
+ rocksdb::ReadOptions m_read_opts;
+ rocksdb::Iterator *m_iter;
+ rocksdb::Status m_status;
+
+ // note: an iterator that has reached EOF has status()==OK && m_valid==false
+ bool m_valid;
+
+ ulonglong *m_lock_count;
+
+ // If true, m_locked_until has a valid key value.
+ bool m_have_locked_until;
+
+ // The key value up to which we've locked the range. That is, we have a
+ // range lock on [current_position ... m_locked_until].
+ // This is used to avoid making extra GetRangeLock() calls.
+ std::string m_locked_until;
+ public:
+ LockingIterator(rocksdb::Transaction *txn,
+ rocksdb::ColumnFamilyHandle *cfh,
+ const std::shared_ptr<Rdb_key_def> &kd,
+ const rocksdb::ReadOptions& opts,
+ ulonglong *lock_count=nullptr
+ ) :
+ m_txn(txn), m_cfh(cfh), m_kd(kd), m_read_opts(opts), m_iter(nullptr),
+ m_status(rocksdb::Status::InvalidArgument()), m_valid(false),
+ m_lock_count(lock_count), m_have_locked_until(false) {}
+
+ ~LockingIterator() {
+ delete m_iter;
+ }
+
+ virtual bool Valid() const override { return m_valid; }
+
+ // Note: MyRocks doesn't ever call these:
+ virtual void SeekToFirst() override;
+ virtual void SeekToLast() override;
+
+ virtual void Seek(const rocksdb::Slice& target) override;
+
+ // Position at the last key in the source that is at or before target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or before target.
+ virtual void SeekForPrev(const rocksdb::Slice& target) override;
+
+ virtual void Next() override;
+ virtual void Prev() override;
+
+ virtual rocksdb::Slice key() const override {
+ assert(Valid());
+ return m_iter->key();
+ }
+
+ virtual rocksdb::Slice value() const override {
+ assert(Valid());
+ return m_iter->value();
+ }
+
+ virtual rocksdb::Status status() const override {
+ return m_status;
+ }
+
+ private:
+ void lock_up_to(bool scan_forward, const rocksdb::Slice& target,
+ const rocksdb::Slice& end_key);
+ void lock_till_iterator_end(bool scan_forward, const rocksdb::Slice& target);
+ void Scan(bool scan_forward, const rocksdb::Slice& target, bool call_next);
+
+ inline void ScanForward(const rocksdb::Slice& target, bool call_next) {
+ Scan(true, target, call_next);
+ }
+
+ inline void ScanBackward(const rocksdb::Slice& target, bool call_next) {
+ Scan(false, target, call_next);
+ }
+};
+
+rocksdb::Iterator*
+GetLockingIterator(rocksdb::Transaction *trx,
+ const rocksdb::ReadOptions& read_options,
+ rocksdb::ColumnFamilyHandle* column_family,
+ const std::shared_ptr<Rdb_key_def>& kd,
+ ulonglong *counter);
+
+} // namespace myrocks
diff --git a/storage/rocksdb/rdb_utils.cc b/storage/rocksdb/rdb_utils.cc
index a723ac9e806..88695aa0539 100644
--- a/storage/rocksdb/rdb_utils.cc
+++ b/storage/rocksdb/rdb_utils.cc
@@ -259,6 +259,33 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
return str;
}
+/*
+ Print the range in hex, in "start_endpoint-end_endpoint" form
+*/
+
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& start,
+ const rocksdb::EndpointWithString& end) {
+ std::string res;
+ // For keys: :0 keys should look like point keys
+ if (!start.inf_suffix && !end.inf_suffix && (start.slice == end.slice)) {
+ // This is a single-point range, show it like a key
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ } else {
+ res = rdb_hexdump(start.slice.c_str(), start.slice.length(), FN_REFLEN);
+ if (start.inf_suffix)
+ res.append(":1");
+
+ res.append("-");
+
+ std::string key2 = rdb_hexdump(end.slice.c_str(), end.slice.length(),
+ FN_REFLEN);
+ if (end.inf_suffix)
+ key2.append(":1");
+ res.append(key2);
+ }
+ return res;
+}
+
/*
Attempt to access the database subdirectory to see if it exists
*/
diff --git a/storage/rocksdb/rdb_utils.h b/storage/rocksdb/rdb_utils.h
index f49f102a08c..39f2096dd35 100644
--- a/storage/rocksdb/rdb_utils.h
+++ b/storage/rocksdb/rdb_utils.h
@@ -29,6 +29,7 @@
/* RocksDB header files */
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
/* MyRocks header files */
#include "./rdb_global.h"
@@ -290,6 +291,8 @@ std::string rdb_hexdump(const char *data, const std::size_t data_len,
const std::size_t maxsize = 0)
MY_ATTRIBUTE((__nonnull__));
+std::string rdb_hexdump_range(const rocksdb::EndpointWithString& left,
+ const rocksdb::EndpointWithString& right);
/*
Helper function to see if a database exists
*/
1
0
[Commits] 57fe50eab9f: MDEV-25636: Bug report: abortion in sql/sql_parse.cc:6294
by psergey 10 Feb '22
by psergey 10 Feb '22
10 Feb '22
revision-id: 57fe50eab9ff01590ec3396dca0de9081ef0ef00 (mariadb-10.2.42-2-g57fe50eab9f)
parent(s): 941bc7053616d5ca8c9e6538828c4a65802e8c3d
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2022-02-10 14:25:07 +0300
message:
MDEV-25636: Bug report: abortion in sql/sql_parse.cc:6294
The assertion failure was caused by this query
select /*id=1*/ from t1
where
col= ( select /*id=2*/ from ... where corr_cond1
union
select /*id=4*/ from ... where corr_cond2)
Here,
- select with id=2 was correlated due to corr_cond1.
- select with id=4 was initially correlated due to corr_cond2, but then
the optimizer optimized away the correlation, making the select with id=4
uncorrelated.
However, since select with id=2 remained correlated, the execution had to
re-compute the whole UNION. When it tried to execute select with id=4, it
hit an assertion (join buffer already free'd).
This is because select with id=4 has freed its execution structures after
it has been executed once. The select is uncorrelated, so it did not expect
it would need to be executed for the second time.
Fixed this by adding this logic in
st_select_lex::optimize_unflattened_subqueries():
If a member of a UNION is correlated, mark all its members as
correlated, so that they are prepared to be executed multiple times.
---
mysql-test/r/subselect4.result | 4 +++-
mysql-test/r/union_innodb.result | 34 ++++++++++++++++++++++++++++++
mysql-test/t/subselect4.test | 2 ++
mysql-test/t/union_innodb.test | 45 ++++++++++++++++++++++++++++++++++++++++
sql/sql_lex.cc | 16 +++++++++++++-
5 files changed, 99 insertions(+), 2 deletions(-)
diff --git a/mysql-test/r/subselect4.result b/mysql-test/r/subselect4.result
index b1db309ec18..2657977dae7 100644
--- a/mysql-test/r/subselect4.result
+++ b/mysql-test/r/subselect4.result
@@ -1358,6 +1358,8 @@ INSERT IGNORE INTO t2 VALUES (8,0,0),(5,0,0);
CREATE TABLE t3 (f4 int,KEY (f4)) ;
INSERT IGNORE INTO t3 VALUES (0),(0);
set @@optimizer_switch='semijoin=off';
+# NOTE: the following should have 'SUBQUERY', not 'DEPENDENT SUBQUERY'
+# for line with id=2, see MDEV-27794.
EXPLAIN
SELECT * FROM t1 WHERE
(SELECT f2 FROM t2
@@ -1367,7 +1369,7 @@ FROM t3 AS SQ1_t1 JOIN t3 AS SQ1_t3 ON SQ1_t3.f4
GROUP BY SQ1_t1.f4));
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1 system NULL NULL NULL NULL 1
-2 SUBQUERY t2 ALL NULL NULL NULL NULL 2 Using where
+2 DEPENDENT SUBQUERY t2 ALL NULL NULL NULL NULL 2 Using where
3 SUBQUERY SQ1_t1 index NULL f4 5 NULL 2 Using index; Using temporary
3 SUBQUERY SQ1_t3 index f4 f4 5 NULL 2 Using where; Using index; Using join buffer (flat, BNL join)
SELECT * FROM t1 WHERE
diff --git a/mysql-test/r/union_innodb.result b/mysql-test/r/union_innodb.result
new file mode 100644
index 00000000000..876897fa865
--- /dev/null
+++ b/mysql-test/r/union_innodb.result
@@ -0,0 +1,34 @@
+#
+# MDEV-25636: Bug report: abortion in sql/sql_parse.cc:6294
+#
+CREATE TABLE t1 (i1 int)engine=innodb;
+INSERT INTO `t1` VALUES (62),(66);
+CREATE TABLE t2 (i1 int) engine=innodb;
+SELECT 1 FROM t1
+WHERE t1.i1 =( SELECT t1.i1 FROM t2
+UNION SELECT i1 FROM (t1 AS dt1 natural JOIN t2)
+window w1 as (partition by t1.i1));
+1
+drop table t1,t2;
+# Another testcase
+CREATE TABLE t1 (i3 int NOT NULL, i1 int , i2 int , i4 int , PRIMARY key(i2));
+INSERT INTO t1 VALUES (6,72,98,98),(46,1,6952,0);
+SELECT i1 FROM t1
+WHERE t1.i3 =
+(SELECT ref_4.i2 FROM t1 AS ref_4
+WHERE t1.i2 > (SELECT i3 FROM t1 ORDER BY i3 LIMIT 1 OFFSET 4)
+UNION
+SELECT ref_6.i2
+FROM (t1 AS ref_5 JOIN t1 AS ref_6 ON ((ref_6.i1 > ref_6.i2) OR (ref_5.i4 < ref_5.i4)))
+WHERE (t1.i2 >= t1.i2));
+i1
+drop table t1;
+#
+# MDEV-25761: Assertion `aggr != __null' failed in sub_select_postjoin_aggr
+#
+CREATE TABLE t1 ( a int NOT NULL PRIMARY KEY) engine=innodb;
+INSERT INTO t1 VALUES (0),(4),(31);
+CREATE TABLE t2 (i int) engine=innodb;
+DELETE FROM t1 WHERE t1.a =
+(SELECT t1.a FROM t2 UNION SELECT DISTINCT 52 FROM t2 r WHERE t1.a = t1.a);
+DROP TABLE t1,t2;
diff --git a/mysql-test/t/subselect4.test b/mysql-test/t/subselect4.test
index bd1e20cb5d6..93389571c5c 100644
--- a/mysql-test/t/subselect4.test
+++ b/mysql-test/t/subselect4.test
@@ -1039,6 +1039,8 @@ INSERT IGNORE INTO t3 VALUES (0),(0);
set @@optimizer_switch='semijoin=off';
+--echo # NOTE: the following should have 'SUBQUERY', not 'DEPENDENT SUBQUERY'
+--echo # for line with id=2, see MDEV-27794.
EXPLAIN
SELECT * FROM t1 WHERE
(SELECT f2 FROM t2
diff --git a/mysql-test/t/union_innodb.test b/mysql-test/t/union_innodb.test
new file mode 100644
index 00000000000..cb805a30bb4
--- /dev/null
+++ b/mysql-test/t/union_innodb.test
@@ -0,0 +1,45 @@
+--source include/have_innodb.inc
+
+--echo #
+--echo # MDEV-25636: Bug report: abortion in sql/sql_parse.cc:6294
+--echo #
+
+CREATE TABLE t1 (i1 int)engine=innodb;
+INSERT INTO `t1` VALUES (62),(66);
+CREATE TABLE t2 (i1 int) engine=innodb;
+
+SELECT 1 FROM t1
+WHERE t1.i1 =( SELECT t1.i1 FROM t2
+ UNION SELECT i1 FROM (t1 AS dt1 natural JOIN t2)
+ window w1 as (partition by t1.i1));
+
+drop table t1,t2;
+
+--echo # Another testcase
+CREATE TABLE t1 (i3 int NOT NULL, i1 int , i2 int , i4 int , PRIMARY key(i2));
+INSERT INTO t1 VALUES (6,72,98,98),(46,1,6952,0);
+
+SELECT i1 FROM t1
+WHERE t1.i3 =
+ (SELECT ref_4.i2 FROM t1 AS ref_4
+ WHERE t1.i2 > (SELECT i3 FROM t1 ORDER BY i3 LIMIT 1 OFFSET 4)
+ UNION
+ SELECT ref_6.i2
+ FROM (t1 AS ref_5 JOIN t1 AS ref_6 ON ((ref_6.i1 > ref_6.i2) OR (ref_5.i4 < ref_5.i4)))
+ WHERE (t1.i2 >= t1.i2));
+
+drop table t1;
+
+--echo #
+--echo # MDEV-25761: Assertion `aggr != __null' failed in sub_select_postjoin_aggr
+--echo #
+
+CREATE TABLE t1 ( a int NOT NULL PRIMARY KEY) engine=innodb;
+INSERT INTO t1 VALUES (0),(4),(31);
+
+CREATE TABLE t2 (i int) engine=innodb;
+
+DELETE FROM t1 WHERE t1.a =
+ (SELECT t1.a FROM t2 UNION SELECT DISTINCT 52 FROM t2 r WHERE t1.a = t1.a);
+
+DROP TABLE t1,t2;
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 125bbfe1bfd..47ff2836aba 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -3898,7 +3898,21 @@ bool st_select_lex::optimize_unflattened_subqueries(bool const_only)
}
if (empty_union_result)
subquery_predicate->no_rows_in_result();
- if (!is_correlated_unit)
+
+ if (is_correlated_unit)
+ {
+ /*
+ Some parts of the UNION are correlated. This means we will need to
+ re-execute the whole UNION every time. Mark all parts of the UNION
+ as correlated so that they are prepared to be executed multiple
+ times (if we don't do that, some part of the UNION may free its
+ execution data at the end of first execution and crash on the second
+ execution)
+ */
+ for (SELECT_LEX *sl= un->first_select(); sl; sl= sl->next_select())
+ sl->uncacheable |= UNCACHEABLE_DEPENDENT;
+ }
+ else
un->uncacheable&= ~UNCACHEABLE_DEPENDENT;
subquery_predicate->is_correlated= is_correlated_unit;
}
1
0