[Maria-developers] Rev 2760: Subquery backport: in file:///home/psergey/dev/maria-5.3-subqueries-r7/
by Sergey Petrunya 17 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r7/
------------------------------------------------------------
revno: 2760
revision-id: psergey(a)askmonty.org-20100217104755-3psvc5fmo3pqsnpy
parent: psergey(a)askmonty.org-20100217100527-k81b7torhmj99moy
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r7
timestamp: Wed 2010-02-17 13:47:55 +0300
message:
Subquery backport:
- More test results updates (checked)
=== modified file 'mysql-test/r/subselect3_jcl6.result'
--- a/mysql-test/r/subselect3_jcl6.result 2010-02-11 21:56:02 +0000
+++ b/mysql-test/r/subselect3_jcl6.result 2010-02-17 10:47:55 +0000
@@ -877,7 +877,7 @@
Note 1276 Field or reference 'test.t1.a' of SELECT #3 was resolved in SELECT #2
Note 1276 Field or reference 'test.t1.c' of SELECT #3 was resolved in SELECT #2
Error 1054 Unknown column 'c' in 'field list'
-Note 1003 select `c` AS `c` from (select (select count(`test`.`t1`.`a`) AS `COUNT(a)` from dual group by `c`) AS `(SELECT COUNT(a) FROM
+Note 1003 select `c` AS `c` from (select (select count(`test`.`t1`.`a`) AS `COUNT(a)` from (select count(`test`.`t1`.`b`) AS `COUNT(b)` from `test`.`t1`) `x` group by `t1`.`c`) AS `(SELECT COUNT(a) FROM
(SELECT COUNT(b) FROM t1) AS x GROUP BY c
)` from `test`.`t1` group by `test`.`t1`.`b`) `y`
DROP TABLE t1;
@@ -1122,7 +1122,7 @@
explain select * from t3 where a in (select t1.kp1 from t1,t4 where kp1<20
and t4.pk=t1.c);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 range kp1 kp1 5 NULL 48 Using index condition; Using where; Using MRR; LooseScan
+1 PRIMARY t1 range kp1 kp1 5 NULL 48 Using index condition; Using MRR; LooseScan
1 PRIMARY t4 eq_ref PRIMARY PRIMARY 4 test.t1.c 1 Using index; FirstMatch(t1)
1 PRIMARY t3 ALL NULL NULL NULL NULL 100 Using where; Using join buffer
drop table t1, t3, t4;
=== modified file 'mysql-test/r/subselect_no_semijoin.result'
--- a/mysql-test/r/subselect_no_semijoin.result 2010-01-17 20:52:20 +0000
+++ b/mysql-test/r/subselect_no_semijoin.result 2010-02-17 10:47:55 +0000
@@ -54,7 +54,7 @@
Warnings:
Note 1276 Field or reference 'b.a' of SELECT #3 was resolved in SELECT #1
Note 1276 Field or reference 'b.a' of SELECT #3 was resolved in SELECT #1
-Note 1003 select 1 AS `1` from dual having ((select '1' AS `a`) = 1)
+Note 1003 select 1 AS `1` from (select 1 AS `a`) `b` having ((select '1' AS `a`) = 1)
SELECT 1 FROM (SELECT 1 as a) as b HAVING (SELECT a)=1;
1
1
@@ -207,7 +207,7 @@
3 DERIVED t2 ALL NULL NULL NULL NULL 2 100.00 Using where
2 SUBQUERY t3 ALL NULL NULL NULL NULL 3 100.00 Using where; Using filesort
Warnings:
-Note 1003 select (select `test`.`t3`.`a` AS `a` from `test`.`t3` where (`test`.`t3`.`a` < 8) order by 1 desc limit 1) AS `(select t3.a from t3 where a<8 order by 1 desc limit 1)`,'2' AS `a` from dual
+Note 1003 select (select `test`.`t3`.`a` AS `a` from `test`.`t3` where (`test`.`t3`.`a` < 8) order by 1 desc limit 1) AS `(select t3.a from t3 where a<8 order by 1 desc limit 1)`,'2' AS `a` from (select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b` from `test`.`t2` where (`test`.`t2`.`a` > 1)) `tt`
select * from t1 where t1.a=(select t2.a from t2 where t2.b=(select max(a) from t3) order by 1 desc limit 1);
a
2
@@ -318,7 +318,7 @@
Warnings:
Note 1276 Field or reference 'test.t2.a' of SELECT #2 was resolved in SELECT #1
Note 1276 Field or reference 'test.t2.a' of SELECT #3 was resolved in SELECT #1
-Note 1003 select (select '2' AS `a` from dual where ('2' = `test`.`t2`.`a`) union select `test`.`t5`.`a` AS `a` from `test`.`t5` where (`test`.`t5`.`a` = `test`.`t2`.`a`)) AS `(select a from t1 where t1.a=t2.a union select a from t5 where t5.a=t2.a)`,`test`.`t2`.`a` AS `a` from `test`.`t2`
+Note 1003 select (select '2' AS `a` from `test`.`t1` where ('2' = `test`.`t2`.`a`) union select `test`.`t5`.`a` AS `a` from `test`.`t5` where (`test`.`t5`.`a` = `test`.`t2`.`a`)) AS `(select a from t1 where t1.a=t2.a union select a from t5 where t5.a=t2.a)`,`test`.`t2`.`a` AS `a` from `test`.`t2`
select (select a from t1 where t1.a=t2.a union all select a from t5 where t5.a=t2.a), a from t2;
ERROR 21000: Subquery returns more than 1 row
create table t6 (patient_uq int, clinic_uq int, index i1 (clinic_uq));
@@ -739,7 +739,7 @@
Warnings:
Note 1249 Select 3 was reduced during optimization
Note 1249 Select 2 was reduced during optimization
-Note 1003 select `test`.`t2`.`id` AS `id` from `test`.`t2` where (`test`.`t2`.`id` = <cache>((1 + 1)))
+Note 1003 select `test`.`t2`.`id` AS `id` from `test`.`t2` where (`test`.`t2`.`id` = (1 + 1))
EXPLAIN EXTENDED SELECT * FROM t2 WHERE id IN (SELECT 1 UNION SELECT 3);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 index NULL id 5 NULL 2 100.00 Using where; Using index
@@ -1437,7 +1437,7 @@
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 system NULL NULL NULL NULL 1 100.00
Warnings:
-Note 1003 (select 'tttt' AS `s1` from dual)
+Note 1003 (select 'tttt' AS `s1` from `test`.`t1`)
(select * from t1);
s1
tttt
@@ -1625,7 +1625,7 @@
3 UNION t1 system NULL NULL NULL NULL 1 100.00
NULL UNION RESULT <union2,3> ALL NULL NULL NULL NULL NULL NULL
Warnings:
-Note 1003 select 'e' AS `s1` from dual where 1
+Note 1003 select 'e' AS `s1` from `test`.`t1` where 1
drop table t1;
CREATE TABLE t1 (number char(11) NOT NULL default '') ENGINE=MyISAM CHARSET=latin1;
INSERT INTO t1 VALUES ('69294728265'),('18621828126'),('89356874041'),('95895001874');
@@ -4686,7 +4686,7 @@
explain
SELECT t1.a, (SELECT 1 FROM t2 WHERE t2.b=t3.c AND t2.c=t1.a ORDER BY t2.d LIMIT 1) AS incorrect FROM t1, t3 WHERE t3.b=t1.a;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t3 index b,b_2 b 10 NULL 2 Using where; Using index
+1 PRIMARY t3 index b,b_2 b 10 NULL 2 Using index
1 PRIMARY t1 eq_ref PRIMARY PRIMARY 4 test.t3.b 1 Using index
2 DEPENDENT SUBQUERY t2 index b,b_2,c d 5 NULL 1 Using where
SELECT t1.a, (SELECT 1 FROM t2 WHERE t2.b=t3.c AND t2.c=t1.a ORDER BY t2.d LIMIT 1) AS incorrect FROM t1, t3 WHERE t3.b=t1.a;
=== modified file 'mysql-test/r/subselect_sj.result'
--- a/mysql-test/r/subselect_sj.result 2010-02-11 23:59:58 +0000
+++ b/mysql-test/r/subselect_sj.result 2010-02-17 10:47:55 +0000
@@ -12,7 +12,7 @@
Flattened because of dependency, t10=func(t1)
explain select * from t1 where a in (select pk from t10);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3
1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using index
select * from t1 where a in (select pk from t10);
a b
@@ -39,7 +39,7 @@
a b
explain select * from t1 where a in (select pk from t10) and b in (select pk from t10);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3
1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using index
1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.b 1 Using index
select * from t1 where a in (select pk from t10) and b in (select pk from t10);
@@ -50,8 +50,8 @@
flattening a nested subquery
explain select * from t1 where a in (select pk from t10 where t10.a in (select pk from t12));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
-1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using where
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3
+1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1
1 PRIMARY t12 eq_ref PRIMARY PRIMARY 4 test.t10.a 1 Using index
select * from t1 where a in (select pk from t10 where t10.a in (select pk from t12));
a b
@@ -61,8 +61,8 @@
flattening subquery w/ several tables
explain extended select * from t1 where a in (select t10.pk from t10, t12 where t12.pk=t10.a);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 100.00 Using where
-1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 100.00 Using where
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3 100.00
+1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 100.00
1 PRIMARY t12 eq_ref PRIMARY PRIMARY 4 test.t10.a 1 100.00 Using index
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t10` join `test`.`t12` join `test`.`t1` where ((`test`.`t10`.`pk` = `test`.`t1`.`a`) and (`test`.`t12`.`pk` = `test`.`t10`.`a`))
@@ -545,7 +545,7 @@
(SELECT t1.pk FROM t0 t1 JOIN t0 t2 ON t2.vkey = t1.vnokey);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t0 ALL PRIMARY NULL NULL NULL 5 100.00
-1 PRIMARY t1 eq_ref PRIMARY PRIMARY 4 test.t0.pk 1 100.00 Using where
+1 PRIMARY t1 eq_ref PRIMARY PRIMARY 4 test.t0.pk 1 100.00
1 PRIMARY t2 ref vkey vkey 4 test.t1.vnokey 2 100.00 Using index; FirstMatch(t1)
Warnings:
Note 1003 select `test`.`t0`.`vkey` AS `vkey` from `test`.`t0` `t1` semi join (`test`.`t0` `t2`) join `test`.`t0` where ((`test`.`t2`.`vkey` = `test`.`t1`.`vnokey`) and (`test`.`t1`.`pk` = `test`.`t0`.`pk`))
=== modified file 'mysql-test/r/subselect_sj2.result'
--- a/mysql-test/r/subselect_sj2.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect_sj2.result 2010-02-17 10:47:55 +0000
@@ -32,7 +32,7 @@
9 5
explain select * from t2 where b in (select a from t1);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where; Materialize; Scan
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Materialize; Scan
1 PRIMARY t2 ref b b 5 test.t1.a 2
select * from t2 where b in (select a from t1);
a b
@@ -73,7 +73,7 @@
from t0 A, t0 B where B.a <5;
explain select * from t3 where b in (select a from t0);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t0 ALL NULL NULL NULL NULL 10 Using where; Materialize; Scan
+1 PRIMARY t0 ALL NULL NULL NULL NULL 10 Materialize; Scan
1 PRIMARY t3 ref b b 5 test.t0.a 1
set @save_ecp= @@engine_condition_pushdown;
set engine_condition_pushdown=0;
@@ -417,7 +417,7 @@
where t0.a in ( select t1.a from t1,t2 where t2.a=t0.a and
t1.b=t2.b);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 PRIMARY t0 ALL NULL NULL NULL NULL 5 100.00 Using where
+1 PRIMARY t0 ALL NULL NULL NULL NULL 5 100.00
1 PRIMARY t1 ref a a 5 test.t0.a 1 100.00 Start temporary
1 PRIMARY t2 eq_ref PRIMARY PRIMARY 4 test.t0.a 1 100.00 Using where; End temporary
Warnings:
=== modified file 'mysql-test/r/subselect_sj2_jcl6.result'
--- a/mysql-test/r/subselect_sj2_jcl6.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect_sj2_jcl6.result 2010-02-17 10:47:55 +0000
@@ -36,7 +36,7 @@
9 5
explain select * from t2 where b in (select a from t1);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where; Materialize; Scan
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Materialize; Scan
1 PRIMARY t2 ref b b 5 test.t1.a 2 Using join buffer
select * from t2 where b in (select a from t1);
a b
@@ -77,8 +77,8 @@
from t0 A, t0 B where B.a <5;
explain select * from t3 where b in (select a from t0);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t0 ALL NULL NULL NULL NULL 10 Using where; Materialize; Scan
-1 PRIMARY t3 ref b b 5 test.t0.a 1
+1 PRIMARY t0 ALL NULL NULL NULL NULL 10 Materialize; Scan
+1 PRIMARY t3 ref b b 5 test.t0.a 1 Using join buffer
set @save_ecp= @@engine_condition_pushdown;
set engine_condition_pushdown=0;
select * from t3 where b in (select A.a+B.a from t0 A, t0 B where B.a<5);
@@ -421,7 +421,7 @@
where t0.a in ( select t1.a from t1,t2 where t2.a=t0.a and
t1.b=t2.b);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 PRIMARY t0 ALL NULL NULL NULL NULL 5 100.00 Using where
+1 PRIMARY t0 ALL NULL NULL NULL NULL 5 100.00
1 PRIMARY t1 ref a a 5 test.t0.a 1 100.00 Start temporary; Using join buffer
1 PRIMARY t2 eq_ref PRIMARY PRIMARY 4 test.t0.a 1 100.00 Using where; End temporary; Using join buffer
Warnings:
=== modified file 'mysql-test/r/subselect_sj_jcl6.result'
--- a/mysql-test/r/subselect_sj_jcl6.result 2010-02-11 23:59:58 +0000
+++ b/mysql-test/r/subselect_sj_jcl6.result 2010-02-17 10:47:55 +0000
@@ -16,7 +16,7 @@
Flattened because of dependency, t10=func(t1)
explain select * from t1 where a in (select pk from t10);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3
1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using index
select * from t1 where a in (select pk from t10);
a b
@@ -43,7 +43,7 @@
a b
explain select * from t1 where a in (select pk from t10) and b in (select pk from t10);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3
1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using index
1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.b 1 Using index
select * from t1 where a in (select pk from t10) and b in (select pk from t10);
@@ -54,8 +54,8 @@
flattening a nested subquery
explain select * from t1 where a in (select pk from t10 where t10.a in (select pk from t12));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
-1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using where; Using join buffer
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3
+1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 Using join buffer
1 PRIMARY t12 eq_ref PRIMARY PRIMARY 4 test.t10.a 1 Using index
select * from t1 where a in (select pk from t10 where t10.a in (select pk from t12));
a b
@@ -65,8 +65,8 @@
flattening subquery w/ several tables
explain extended select * from t1 where a in (select t10.pk from t10, t12 where t12.pk=t10.a);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 PRIMARY t1 ALL NULL NULL NULL NULL 3 100.00 Using where
-1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 100.00 Using where; Using join buffer
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3 100.00
+1 PRIMARY t10 eq_ref PRIMARY PRIMARY 4 test.t1.a 1 100.00 Using join buffer
1 PRIMARY t12 eq_ref PRIMARY PRIMARY 4 test.t10.a 1 100.00 Using index
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t10` join `test`.`t12` join `test`.`t1` where ((`test`.`t10`.`pk` = `test`.`t1`.`a`) and (`test`.`t12`.`pk` = `test`.`t10`.`a`))
@@ -549,7 +549,7 @@
(SELECT t1.pk FROM t0 t1 JOIN t0 t2 ON t2.vkey = t1.vnokey);
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t0 ALL PRIMARY NULL NULL NULL 5 100.00
-1 PRIMARY t1 eq_ref PRIMARY PRIMARY 4 test.t0.pk 1 100.00 Using where; Using join buffer
+1 PRIMARY t1 eq_ref PRIMARY PRIMARY 4 test.t0.pk 1 100.00 Using join buffer
1 PRIMARY t2 ref vkey vkey 4 test.t1.vnokey 2 100.00 Using index; FirstMatch(t1)
Warnings:
Note 1003 select `test`.`t0`.`vkey` AS `vkey` from `test`.`t0` `t1` semi join (`test`.`t0` `t2`) join `test`.`t0` where ((`test`.`t2`.`vkey` = `test`.`t1`.`vnokey`) and (`test`.`t1`.`pk` = `test`.`t0`.`pk`))
=== modified file 'mysql-test/r/view.result'
--- a/mysql-test/r/view.result 2009-12-15 07:16:46 +0000
+++ b/mysql-test/r/view.result 2010-02-17 10:47:55 +0000
@@ -2342,11 +2342,11 @@
EXPLAIN SELECT t1.* FROM t1 JOIN t2 WHERE t1.a=t2.a AND t1.b=t2.b AND t1.a=1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref a a 5 const 1 Using index
-1 SIMPLE t2 ref a a 10 const,test.t1.b 2 Using index
+1 SIMPLE t2 ref a a 10 const,test.t1.b 1 Using index
EXPLAIN SELECT * FROM v1 WHERE a=1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref a a 5 const 1 Using index
-1 SIMPLE t2 ref a a 10 const,test.t1.b 2 Using index
+1 SIMPLE t2 ref a a 10 const,test.t1.b 1 Using index
EXPLAIN SELECT * FROM v2 WHERE a=1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref a a 5 const 1 Using index

[Maria-developers] Rev 2759: Subquery optimizations backport: in file:///home/psergey/dev/maria-5.3-subqueries-r7/
by Sergey Petrunya 17 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r7/
------------------------------------------------------------
revno: 2759
revision-id: psergey(a)askmonty.org-20100217100527-k81b7torhmj99moy
parent: psergey(a)askmonty.org-20100215215306-hc0levm9ag1lv1b1
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r7
timestamp: Wed 2010-02-17 13:05:27 +0300
message:
Subquery optimizations backport:
- Update test results
- More comments
- Add Item_in_optimizer::transform() which was lost in backport
=== modified file 'mysql-test/r/subselect.result'
--- a/mysql-test/r/subselect.result 2010-01-17 20:52:20 +0000
+++ b/mysql-test/r/subselect.result 2010-02-17 10:05:27 +0000
@@ -1377,7 +1377,7 @@
2 DEPENDENT SUBQUERY t1 index_subquery a a 5 func 1001 100.00 Using index; Using where
Warnings:
Note 1003 select `test`.`t2`.`a` AS `a` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`a`,<exists>(<index_lookup>(<cache>(`test`.`t2`.`a`) in t1 on a where ((`test`.`t1`.`b` <> 30) and (<cache>(`test`.`t2`.`a`) = `test`.`t1`.`a`)))))
-drop table t1, t2, t3;
+drop table t0, t1, t2, t3;
create table t1 (a int, b int);
create table t2 (a int, b int);
create table t3 (a int, b int);
=== modified file 'mysql-test/r/subselect3.result'
--- a/mysql-test/r/subselect3.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect3.result 2010-02-17 10:05:27 +0000
@@ -873,7 +873,7 @@
Note 1276 Field or reference 'test.t1.a' of SELECT #3 was resolved in SELECT #2
Note 1276 Field or reference 'test.t1.c' of SELECT #3 was resolved in SELECT #2
Error 1054 Unknown column 'c' in 'field list'
-Note 1003 select `c` AS `c` from (select (select count(`test`.`t1`.`a`) AS `COUNT(a)` from dual group by `c`) AS `(SELECT COUNT(a) FROM
+Note 1003 select `c` AS `c` from (select (select count(`test`.`t1`.`a`) AS `COUNT(a)` from (select count(`test`.`t1`.`b`) AS `COUNT(b)` from `test`.`t1`) `x` group by `t1`.`c`) AS `(SELECT COUNT(a) FROM
(SELECT COUNT(b) FROM t1) AS x GROUP BY c
)` from `test`.`t1` group by `test`.`t1`.`b`) `y`
DROP TABLE t1;
@@ -1117,7 +1117,7 @@
explain select * from t3 where a in (select t1.kp1 from t1,t4 where kp1<20
and t4.pk=t1.c);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 range kp1 kp1 5 NULL 48 Using index condition; Using where; Using MRR; LooseScan
+1 PRIMARY t1 range kp1 kp1 5 NULL 48 Using index condition; Using MRR; LooseScan
1 PRIMARY t4 eq_ref PRIMARY PRIMARY 4 test.t1.c 1 Using index; FirstMatch(t1)
1 PRIMARY t3 ALL NULL NULL NULL NULL 100 Using where; Using join buffer
drop table t1, t3, t4;
=== modified file 'mysql-test/r/subselect4.result'
--- a/mysql-test/r/subselect4.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect4.result 2010-02-17 10:05:27 +0000
@@ -13,9 +13,9 @@
WHERE NOT EXISTS (SELECT 1 FROM t2 WHERE 1 = (SELECT MIN(t2.b) FROM t3))
ORDER BY count(*);
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 index NULL a 5 NULL 2 Using index; Using temporary
+1 PRIMARY t1 index NULL a 5 NULL 2 Using where; Using index; Using temporary
2 DEPENDENT SUBQUERY t2 ALL NULL NULL NULL NULL 2 Using where
-3 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL no matching row in const table
+3 DEPENDENT SUBQUERY t3 system NULL NULL NULL NULL 0 const row not found
# should not crash the next statement
SELECT 1 FROM t1
WHERE NOT EXISTS (SELECT 1 FROM t2 WHERE 1 = (SELECT MIN(t2.b) FROM t3))
@@ -77,10 +77,10 @@
EXPLAIN EXTENDED SELECT (SELECT 1 FROM t2 WHERE d = c) AS RESULT FROM t1 ;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t1 system NULL NULL NULL NULL 1 100.00
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
+2 DEPENDENT SUBQUERY t2 ref d d 5 const 1 100.00 Using index
Warnings:
Note 1276 Field or reference 'test.t1.c' of SELECT #2 was resolved in SELECT #1
-Note 1003 select (select 1 AS `1` from `test`.`t2` where (`test`.`t2`.`d` = '0')) AS `RESULT` from dual
+Note 1003 select (select 1 AS `1` from `test`.`t2` where (`test`.`t2`.`d` = '0')) AS `RESULT` from `test`.`t1`
first equivalent variant
SELECT (SELECT 1 FROM t2 WHERE d = IFNULL(c,NULL)) AS RESULT FROM t1 GROUP BY c ;
RESULT
@@ -88,10 +88,10 @@
EXPLAIN EXTENDED SELECT (SELECT 1 FROM t2 WHERE d = IFNULL(c,NULL)) AS RESULT FROM t1 GROUP BY c;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t1 system NULL NULL NULL NULL 1 100.00
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
+2 DEPENDENT SUBQUERY t2 ref d d 5 const 1 100.00 Using where; Using index
Warnings:
Note 1276 Field or reference 'test.t1.c' of SELECT #2 was resolved in SELECT #1
-Note 1003 select (select 1 AS `1` from `test`.`t2` where (`test`.`t2`.`d` = ifnull('0',NULL))) AS `RESULT` from dual group by '0'
+Note 1003 select (select 1 AS `1` from `test`.`t2` where (`test`.`t2`.`d` = ifnull('0',NULL))) AS `RESULT` from `test`.`t1` group by '0'
second equivalent variant
SELECT (SELECT 1 FROM t2 WHERE d = c) AS RESULT FROM t1 GROUP BY c ;
RESULT
@@ -99,10 +99,10 @@
EXPLAIN EXTENDED SELECT (SELECT 1 FROM t2 WHERE d = c) AS RESULT FROM t1 GROUP BY c ;
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t1 system NULL NULL NULL NULL 1 100.00
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
+2 DEPENDENT SUBQUERY t2 ref d d 5 const 1 100.00 Using index
Warnings:
Note 1276 Field or reference 'test.t1.c' of SELECT #2 was resolved in SELECT #1
-Note 1003 select (select 1 AS `1` from `test`.`t2` where (`test`.`t2`.`d` = '0')) AS `RESULT` from dual group by '0'
+Note 1003 select (select 1 AS `1` from `test`.`t2` where (`test`.`t2`.`d` = '0')) AS `RESULT` from `test`.`t1` group by '0'
DROP TABLE t1,t2;
#
# BUG#45928 "Differing query results depending on MRR and
=== modified file 'sql/item.h'
--- a/sql/item.h 2010-02-11 23:59:58 +0000
+++ b/sql/item.h 2010-02-17 10:05:27 +0000
@@ -2817,6 +2817,17 @@
};
+/*
+ Cached_item_XXX objects are not exactly caches. They do the following:
+
+ Each Cached_item_XXX object has
+ - its source item
+ - a saved value of the source item
+ - a cmp() method that compares the saved value with the current value of
+ the source item and, if they are not equal, saves the item's current
+ value into the saved value.
+*/
+
class Cached_item :public Sql_alloc
{
public:
=== modified file 'sql/item_cmpfunc.cc'
--- a/sql/item_cmpfunc.cc 2010-01-17 14:55:08 +0000
+++ b/sql/item_cmpfunc.cc 2010-02-17 10:05:27 +0000
@@ -1649,6 +1649,70 @@
}
+/**
+ Transform an Item_in_optimizer and its arguments with a callback function.
+
+ @param transformer the transformer callback function to be applied to the
+ nodes of the tree of the object
+ @param argument parameter to be passed to the transformer
+
+ @detail
+ Recursively transform the left and the right operand of this Item. The
+ right operand is an Item_in_subselect or its subclass. To avoid the
+ creation of new Items, we use the fact that the left operand of the
+ Item_in_subselect is the same as the one of 'this', so instead of
+ transforming its operand, we just assign the left operand of the
+ Item_in_subselect to be equal to the left operand of 'this'.
+ The transformation is not applied further to the subquery operand
+ of the IN predicate.
+
+ @returns
+ @retval pointer to the transformed item
+ @retval NULL if an error occurred
+*/
+
+Item *Item_in_optimizer::transform(Item_transformer transformer, uchar *argument)
+{
+ Item *new_item;
+
+ DBUG_ASSERT(!current_thd->is_stmt_prepare());
+ DBUG_ASSERT(arg_count == 2);
+
+ /* Transform the left IN operand. */
+ new_item= (*args)->transform(transformer, argument);
+ if (!new_item)
+ return 0;
+ /*
+ THD::change_item_tree() should be called only if the tree was
+ really transformed, i.e. when a new item has been created.
+ Otherwise we'll be allocating a lot of unnecessary memory for
+ change records at each execution.
+ */
+ if ((*args) != new_item)
+ current_thd->change_item_tree(args, new_item);
+
+ /*
+ Transform the right IN operand which should be an Item_in_subselect or a
+ subclass of it. The left operand of the IN must be the same as the left
+ operand of this Item_in_optimizer, so in this case there is no further
+ transformation, we only make both operands the same.
+ TODO: is it the way it should be?
+ */
+ DBUG_ASSERT((args[1])->type() == Item::SUBSELECT_ITEM &&
+ (((Item_subselect*)(args[1]))->substype() ==
+ Item_subselect::IN_SUBS ||
+ ((Item_subselect*)(args[1]))->substype() ==
+ Item_subselect::ALL_SUBS ||
+ ((Item_subselect*)(args[1]))->substype() ==
+ Item_subselect::ANY_SUBS));
+
+ Item_in_subselect *in_arg= (Item_in_subselect*)args[1];
+ in_arg->left_expr= args[0];
+
+ return (this->*transformer)(argument);
+}
+
+
longlong Item_func_eq::val_int()
{
DBUG_ASSERT(fixed == 1);
=== modified file 'sql/item_cmpfunc.h'
--- a/sql/item_cmpfunc.h 2010-02-11 21:58:23 +0000
+++ b/sql/item_cmpfunc.h 2010-02-17 10:05:27 +0000
@@ -241,6 +241,7 @@
const char *func_name() const { return "<in_optimizer>"; }
Item_cache **get_cache() { return &cache; }
void keep_top_level_cache();
+ Item *transform(Item_transformer transformer, uchar *arg);
};
class Comp_creator
=== modified file 'sql/item_subselect.cc'
--- a/sql/item_subselect.cc 2010-02-15 21:53:06 +0000
+++ b/sql/item_subselect.cc 2010-02-17 10:05:27 +0000
@@ -1312,7 +1312,7 @@
(char *)in_left_expr_name);
master_unit->uncacheable|= UNCACHEABLE_DEPENDENT;
- select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
+ //psergey: placed then removed: select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
}
if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
=== modified file 'sql/opt_subselect.cc'
--- a/sql/opt_subselect.cc 2010-02-15 21:53:06 +0000
+++ b/sql/opt_subselect.cc 2010-02-17 10:05:27 +0000
@@ -120,7 +120,7 @@
SELECT_LEX *current= thd->lex->current_select;
thd->lex->current_select= current->return_after_parsing();
char const *save_where= thd->where;
- thd->where= " IN/ALL/ANY subquery";
+ thd->where= "IN/ALL/ANY subquery";
bool failure= !in_subs->left_expr->fixed &&
in_subs->left_expr->fix_fields(thd, &in_subs->left_expr);
Hi!
For those who use the bazaar email plugin:
most 'feature' diffs are too large, so IMHO it is good to put the following
into ~/.bazaar/bazaar.conf ([DEFAULT] section):
post_commit_difflimit = 10000
(or whatever limit you want)
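For reference, the relevant part of ~/.bazaar/bazaar.conf would then look
like this (10000 is just the limit suggested above):

[DEFAULT]
post_commit_difflimit = 10000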

[Maria-developers] bzr commit into Mariadb 5.2, with Maria 2.0:maria/5.2 branch (igor:2742) WL#86
by Igor Babaev 16 Feb '10
#At lp:maria/5.2 based on revid:sanja@askmonty.org-20100212131228-bgxli0wfybhjkvg9
2742 Igor Babaev 2010-02-16
WL#86: Partitioned key cache for MyISAM.
This is the base patch for the task.
added:
mysql-test/r/select_pkeycache.result
mysql-test/t/select_pkeycache-master.opt
mysql-test/t/select_pkeycache.test
modified:
include/keycache.h
mysql-test/r/information_schema.result
mysql-test/r/information_schema_all_engines.result
mysql-test/r/key_cache.result
mysql-test/t/key_cache.test
mysys/mf_keycache.c
sql/handler.cc
sql/handler.h
sql/mysqld.cc
sql/set_var.cc
sql/set_var.h
sql/sql_show.cc
sql/sql_test.cc
sql/table.h
storage/myisam/mi_check.c
storage/myisam/mi_close.c
storage/myisam/mi_delete_all.c
storage/myisam/mi_extra.c
storage/myisam/mi_keycache.c
storage/myisam/mi_locking.c
storage/myisam/mi_page.c
storage/myisam/mi_panic.c
storage/myisam/mi_preload.c
storage/myisam/mi_test1.c
storage/myisam/mi_test2.c
storage/myisam/mi_test3.c
storage/myisam/myisam_ftdump.c
storage/myisam/myisamchk.c
storage/myisam/myisamdef.h
storage/myisam/myisamlog.c
=== modified file 'include/keycache.h'
--- a/include/keycache.h 2009-12-03 11:19:05 +0000
+++ b/include/keycache.h 2010-02-16 16:41:11 +0000
@@ -19,96 +19,121 @@
#define _keycache_h
C_MODE_START
-/* declare structures that is used by st_key_cache */
-struct st_block_link;
-typedef struct st_block_link BLOCK_LINK;
-struct st_keycache_page;
-typedef struct st_keycache_page KEYCACHE_PAGE;
-struct st_hash_link;
-typedef struct st_hash_link HASH_LINK;
-/* info about requests in a waiting queue */
-typedef struct st_keycache_wqueue
+/*
+ Currently the default key cache is created as non-partitioned at
+ the start of the server unless the server is started with the parameter
+ --key-cache-partitions set to a value greater than 0
+*/
+
+#define DEFAULT_KEY_CACHE_PARTITIONS 0
+
+/*
+ MAX_KEY_CACHE_PARTITIONS cannot be greater than
+ 8*sizeof(MYISAM_SHARE::dirty_part_map) (one bit per partition)
+ Currently sizeof(MYISAM_SHARE::dirty_part_map)=sizeof(ulonglong)
+*/
+
+#define MAX_KEY_CACHE_PARTITIONS 64
+
+
+/* The structure to get statistical data about a key cache */
+
+typedef struct st_key_cache_statistics
+{
+ ulonglong mem_size; /* memory for cache buffers/auxiliary structures */
+ ulonglong block_size; /* size of each buffer in the key cache */
+ ulonglong blocks_used; /* maximum number of used blocks/buffers */
+ ulonglong blocks_unused; /* number of currently unused blocks */
+ ulonglong blocks_changed; /* number of currently dirty blocks */
+ ulonglong read_requests; /* number of read requests (read hits) */
+ ulonglong reads; /* number of actual reads from files into buffers */
+ ulonglong write_requests; /* number of write requests (write hits) */
+ ulonglong writes; /* number of actual writes from buffers into files */
+} KEY_CACHE_STATISTICS;
+
+/* The type of a key cache object */
+typedef enum key_cache_type
{
- struct st_my_thread_var *last_thread; /* circular list of waiting threads */
-} KEYCACHE_WQUEUE;
+ SIMPLE_KEY_CACHE,
+ PARTITIONED_KEY_CACHE
+} KEY_CACHE_TYPE;
-#define CHANGED_BLOCKS_HASH 128 /* must be power of 2 */
/*
- The key cache structure
- It also contains read-only statistics parameters.
+ An object of the type KEY_CACHE_FUNCS contains pointers to all functions
+ from the key cache interface.
+ Currently a key cache can be of two types: simple and partitioned.
+ For each of them its own static structure of the type KEY_CACHE_FUNCS is
+ defined. The structures contain the pointers to the implementations of
+ the interface functions used by simple key caches and partitioned key
+ caches respectively. Pointers to these structures are assigned to key cache
+ objects at the time of their creation.
*/
+typedef struct st_key_cache_funcs
+{
+ int (*init) (void *, uint key_cache_block_size,
+ size_t use_mem, uint division_limit, uint age_threshold);
+ int (*resize) (void *, uint key_cache_block_size,
+ size_t use_mem, uint division_limit, uint age_threshold);
+ void (*change_param) (void *keycache_cb,
+ uint division_limit, uint age_threshold);
+ uchar* (*read) (void *keycache_cb,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length, int return_buffer);
+ int (*insert) (void *keycache_cb,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length);
+ int (*write) (void *keycache_cb,
+ File file, void *file_extra,
+ my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length, int force_write);
+ int (*flush) (void *keycache_cb,
+ int file, void *file_extra,
+ enum flush_type type);
+ int (*reset_counters) (const char *name, void *keycache_cb);
+ void (*end) (void *keycache_cb, my_bool cleanup);
+ void (*get_stats) (void *keycache_cb, uint partition_no,
+ KEY_CACHE_STATISTICS *key_cache_stats);
+ ulonglong (*get_stat_val) (void *keycache_cb, uint var_no);
+} KEY_CACHE_FUNCS;
+
+
typedef struct st_key_cache
{
- my_bool key_cache_inited;
- my_bool in_resize; /* true during resize operation */
- my_bool resize_in_flush; /* true during flush of resize operation */
+ KEY_CACHE_TYPE key_cache_type; /* type of the key cache used for debugging */
+ void *keycache_cb; /* control block of the used key cache */
+ KEY_CACHE_FUNCS *interface_funcs; /* interface functions of the key cache */
+ ulonglong param_buff_size; /* size of the memory allocated for the cache */
+ ulong param_block_size; /* size of the blocks in the key cache */
+ ulong param_division_limit; /* min. percentage of warm blocks */
+ ulong param_age_threshold; /* determines when hot block is downgraded */
+ ulong param_partitions; /* number of the key cache partitions */
+ my_bool key_cache_inited; /* <=> key cache has been created */
my_bool can_be_used; /* usage of cache for read/write is allowed */
- size_t key_cache_mem_size; /* specified size of the cache memory */
- uint key_cache_block_size; /* size of the page buffer of a cache block */
- ulong min_warm_blocks; /* min number of warm blocks; */
- ulong age_threshold; /* age threshold for hot blocks */
- ulonglong keycache_time; /* total number of block link operations */
- uint hash_entries; /* max number of entries in the hash table */
- int hash_links; /* max number of hash links */
- int hash_links_used; /* number of hash links currently used */
- int disk_blocks; /* max number of blocks in the cache */
- ulong blocks_used; /* maximum number of concurrently used blocks */
- ulong blocks_unused; /* number of currently unused blocks */
- ulong blocks_changed; /* number of currently dirty blocks */
- ulong warm_blocks; /* number of blocks in warm sub-chain */
- ulong cnt_for_resize_op; /* counter to block resize operation */
- long blocks_available; /* number of blocks available in the LRU chain */
- HASH_LINK **hash_root; /* arr. of entries into hash table buckets */
- HASH_LINK *hash_link_root; /* memory for hash table links */
- HASH_LINK *free_hash_list; /* list of free hash links */
- BLOCK_LINK *free_block_list; /* list of free blocks */
- BLOCK_LINK *block_root; /* memory for block links */
- uchar HUGE_PTR *block_mem; /* memory for block buffers */
- BLOCK_LINK *used_last; /* ptr to the last block of the LRU chain */
- BLOCK_LINK *used_ins; /* ptr to the insertion block in LRU chain */
- pthread_mutex_t cache_lock; /* to lock access to the cache structure */
- KEYCACHE_WQUEUE resize_queue; /* threads waiting during resize operation */
- /*
- Waiting for a zero resize count. Using a queue for symmetry though
- only one thread can wait here.
- */
- KEYCACHE_WQUEUE waiting_for_resize_cnt;
- KEYCACHE_WQUEUE waiting_for_hash_link; /* waiting for a free hash link */
- KEYCACHE_WQUEUE waiting_for_block; /* requests waiting for a free block */
- BLOCK_LINK *changed_blocks[CHANGED_BLOCKS_HASH]; /* hash for dirty file bl.*/
- BLOCK_LINK *file_blocks[CHANGED_BLOCKS_HASH]; /* hash for other file bl.*/
-
- /*
- The following variables are and variables used to hold parameters for
- initializing the key cache.
- */
-
- ulonglong param_buff_size; /* size the memory allocated for the cache */
- ulong param_block_size; /* size of the blocks in the key cache */
- ulong param_division_limit; /* min. percentage of warm blocks */
- ulong param_age_threshold; /* determines when hot block is downgraded */
-
- /* Statistics variables. These are reset in reset_key_cache_counters(). */
- ulong global_blocks_changed; /* number of currently dirty blocks */
+ my_bool in_init; /* Set to 1 in MySQL during init/resize */
+ uint partitions; /* actual number of partitions */
+ size_t key_cache_mem_size; /* specified size of the cache memory */
+ ulong blocks_used; /* maximum number of concurrently used blocks */
+ ulong blocks_unused; /* number of currently unused blocks */
+ ulong global_blocks_changed; /* number of currently dirty blocks */
ulonglong global_cache_w_requests;/* number of write requests (write hits) */
ulonglong global_cache_write; /* number of writes from cache to files */
ulonglong global_cache_r_requests;/* number of read requests (read hits) */
ulonglong global_cache_read; /* number of reads from files to cache */
-
- int blocks; /* max number of blocks in the cache */
- my_bool in_init; /* Set to 1 in MySQL during init/resize */
} KEY_CACHE;
+
/* The default key cache */
extern KEY_CACHE dflt_key_cache_var, *dflt_key_cache;
extern int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
size_t use_mem, uint division_limit,
- uint age_threshold);
+ uint age_threshold, uint partitions);
extern int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
size_t use_mem, uint division_limit,
uint age_threshold);
@@ -122,12 +147,18 @@ extern int key_cache_insert(KEY_CACHE *k
File file, my_off_t filepos, int level,
uchar *buff, uint length);
extern int key_cache_write(KEY_CACHE *keycache,
- File file, my_off_t filepos, int level,
+ File file, void *file_extra,
+ my_off_t filepos, int level,
uchar *buff, uint length,
- uint block_length,int force_write);
+ uint block_length, int force_write);
extern int flush_key_blocks(KEY_CACHE *keycache,
- int file, enum flush_type type);
+ int file, void *file_extra,
+ enum flush_type type);
extern void end_key_cache(KEY_CACHE *keycache, my_bool cleanup);
+extern void get_key_cache_statistics(KEY_CACHE *keycache,
+ uint partition_no,
+ KEY_CACHE_STATISTICS *key_cache_stats);
+extern ulonglong get_key_cache_stat_value(KEY_CACHE *keycache, uint var_no);
/* Functions to handle multiple key caches */
extern my_bool multi_keycache_init(void);
@@ -140,5 +171,11 @@ extern void multi_key_cache_change(KEY_C
KEY_CACHE *new_data);
extern int reset_key_cache_counters(const char *name,
KEY_CACHE *key_cache);
+extern int repartition_key_cache(KEY_CACHE *keycache,
+ uint key_cache_block_size,
+ size_t use_mem,
+ uint division_limit,
+ uint age_threshold,
+ uint partitions);
C_MODE_END
#endif /* _keycache_h */
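
A minimal sketch (not part of the patch) of the dispatch scheme described in
the KEY_CACHE_FUNCS comment above: a generic cache object holds an opaque
control block plus a function table, and each interface call just forwards
through that table. All names below are hypothetical stand-ins for the real
keycache types and functions:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical function table, mirroring KEY_CACHE_FUNCS: one pointer per
   interface operation, taking the control block as an opaque 'void *'. */
typedef struct st_demo_funcs
{
  int (*init)(void *keycache_cb, unsigned block_size, size_t use_mem);
} DEMO_FUNCS;

/* Hypothetical generic cache object, mirroring the new KEY_CACHE: a
   control block plus a pointer to the implementation's function table. */
typedef struct st_demo_cache
{
  void *keycache_cb;           /* control block of the concrete cache */
  DEMO_FUNCS *interface_funcs; /* simple or partitioned implementation */
} DEMO_CACHE;

static int simple_init(void *keycache_cb, unsigned block_size, size_t use_mem)
{
  (void) keycache_cb;
  printf("simple key cache: block_size=%u, mem=%zu\n", block_size, use_mem);
  return 0;
}

static DEMO_FUNCS simple_key_cache_funcs= { simple_init };

/* The generic entry point forwards through the table; this is how one
   init_key_cache()-style function can serve both key cache types. */
static int demo_init_key_cache(DEMO_CACHE *kc, unsigned block_size,
                               size_t use_mem)
{
  return kc->interface_funcs->init(kc->keycache_cb, block_size, use_mem);
}

int main(void)
{
  DEMO_CACHE kc= { NULL, &simple_key_cache_funcs };
  return demo_init_key_cache(&kc, 1024, 32768);
}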
=== modified file 'mysql-test/r/information_schema.result'
--- a/mysql-test/r/information_schema.result 2010-02-01 06:14:12 +0000
+++ b/mysql-test/r/information_schema.result 2010-02-16 16:41:11 +0000
@@ -69,6 +69,7 @@ INNODB_LOCK_WAITS
INNODB_RSEG
INNODB_TABLE_STATS
INNODB_TRX
+KEY_CACHES
KEY_COLUMN_USAGE
PARTITIONS
PLUGINS
=== modified file 'mysql-test/r/information_schema_all_engines.result'
--- a/mysql-test/r/information_schema_all_engines.result 2010-02-01 06:14:12 +0000
+++ b/mysql-test/r/information_schema_all_engines.result 2010-02-16 16:41:11 +0000
@@ -13,6 +13,7 @@ FILES
GLOBAL_STATUS
GLOBAL_VARIABLES
INDEX_STATISTICS
+KEY_CACHES
KEY_COLUMN_USAGE
PARTITIONS
PLUGINS
@@ -76,6 +77,7 @@ FILES TABLE_SCHEMA
GLOBAL_STATUS VARIABLE_NAME
GLOBAL_VARIABLES VARIABLE_NAME
INDEX_STATISTICS TABLE_SCHEMA
+KEY_CACHES KEY_CACHE_NAME
KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
PARTITIONS TABLE_SCHEMA
PLUGINS PLUGIN_NAME
@@ -139,6 +141,7 @@ FILES TABLE_SCHEMA
GLOBAL_STATUS VARIABLE_NAME
GLOBAL_VARIABLES VARIABLE_NAME
INDEX_STATISTICS TABLE_SCHEMA
+KEY_CACHES KEY_CACHE_NAME
KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
PARTITIONS TABLE_SCHEMA
PLUGINS PLUGIN_NAME
@@ -221,6 +224,7 @@ INNODB_LOCK_WAITS information_schema.INN
INNODB_RSEG information_schema.INNODB_RSEG 1
INNODB_TABLE_STATS information_schema.INNODB_TABLE_STATS 1
INNODB_TRX information_schema.INNODB_TRX 1
+KEY_CACHES information_schema.KEY_CACHES 1
KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
PARTITIONS information_schema.PARTITIONS 1
PBXT_STATISTICS information_schema.PBXT_STATISTICS 1
@@ -259,6 +263,7 @@ Database: information_schema
| GLOBAL_STATUS |
| GLOBAL_VARIABLES |
| INDEX_STATISTICS |
+| KEY_CACHES |
| KEY_COLUMN_USAGE |
| PARTITIONS |
| PLUGINS |
@@ -312,6 +317,7 @@ Database: INFORMATION_SCHEMA
| GLOBAL_STATUS |
| GLOBAL_VARIABLES |
| INDEX_STATISTICS |
+| KEY_CACHES |
| KEY_COLUMN_USAGE |
| PARTITIONS |
| PLUGINS |
@@ -357,5 +363,5 @@ Wildcard: inf_rmation_schema
+--------------------+
SELECT table_schema, count(*) FROM information_schema.TABLES WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test', 'mysqltest') AND table_name<>'ndb_binlog_index' AND table_name<>'ndb_apply_status' GROUP BY TABLE_SCHEMA;
table_schema count(*)
-information_schema 48
+information_schema 49
mysql 22
=== modified file 'mysql-test/r/key_cache.result'
--- a/mysql-test/r/key_cache.result 2009-03-16 19:54:50 +0000
+++ b/mysql-test/r/key_cache.result 2010-02-16 16:41:11 +0000
@@ -1,5 +1,7 @@
drop table if exists t1, t2, t3;
-SET @save_key_buffer=@@key_buffer_size;
+SET @save_key_buffer_size=@@key_buffer_size;
+SET @save_key_cache_block_size=@@key_cache_block_size;
+SET @save_key_cache_partitions=@@key_cache_partitions;
SELECT @@key_buffer_size, @@small.key_buffer_size;
@@key_buffer_size @@small.key_buffer_size
2097152 131072
@@ -37,7 +39,7 @@ SELECT @@small.key_buffer_size;
SELECT @@medium.key_buffer_size;
@@medium.key_buffer_size
0
-SET @@global.key_buffer_size=@save_key_buffer;
+SET @@global.key_buffer_size=@save_key_buffer_size;
SELECT @@default.key_buffer_size;
ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'default.key_buffer_size' at line 1
SELECT @@skr.storage_engine="test";
@@ -366,3 +368,537 @@ Variable_name Value
key_cache_block_size 1536
SET GLOBAL key_cache_block_size= @bug28478_key_cache_block_size;
DROP TABLE t1;
+set global key_buffer_size=@save_key_buffer_size;
+set global key_cache_block_size=@save_key_cache_block_size;
+select @@key_buffer_size;
+@@key_buffer_size
+2097152
+select @@key_cache_block_size;
+@@key_cache_block_size
+1024
+select @@key_cache_partitions;
+@@key_cache_partitions
+0
+create table t1 (
+p int not null auto_increment primary key,
+a char(10));
+create table t2 (
+p int not null auto_increment primary key,
+i int, a char(10), key k1(i), key k2(a));
+select @@key_cache_partitions;
+@@key_cache_partitions
+0
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default NULL NULL 2097152 1024 0 # 0 0 0 0 0
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+(3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+p a
+1 qqqq
+2 yyyy
+select * from t2;
+p i a
+1 1 qqqq
+2 1 pppp
+3 1 yyyy
+4 3 zzzz
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+show status like 'key_%';
+Variable_name Value
+Key_blocks_not_flushed 0
+Key_blocks_unused KEY_BLOCKS_UNUSED
+Key_blocks_used 4
+Key_read_requests 22
+Key_reads 0
+Key_write_requests 26
+Key_writes 6
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default NULL NULL 2097152 1024 4 # 0 22 0 26 6
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+delete from t2 where a='zzzz';
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default NULL NULL 2097152 1024 4 # 0 29 0 32 9
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+delete from t1;
+delete from t2;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default NULL NULL 2097152 1024 4 # 0 29 0 32 9
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+set global key_cache_partitions=2;
+select @@key_cache_partitions;
+@@key_cache_partitions
+2
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 1048576 1024 0 # 0 0 0 0 0
+default 2 2 1048576 1024 0 # 0 0 0 0 0
+default 2 NULL 2097152 1024 0 # 0 0 0 0 0
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+(3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+p a
+1 qqqq
+2 yyyy
+select * from t2;
+p i a
+1 1 qqqq
+2 1 pppp
+3 1 yyyy
+4 3 zzzz
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+show status like 'key_%';
+Variable_name Value
+Key_blocks_not_flushed 0
+Key_blocks_unused KEY_BLOCKS_UNUSED
+Key_blocks_used 4
+Key_read_requests 22
+Key_reads 0
+Key_write_requests 26
+Key_writes 6
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 1048576 1024 3 # 0 10 0 13 4
+default 2 2 1048576 1024 1 # 0 12 0 13 2
+default 2 NULL 2097152 1024 4 # 0 22 0 26 6
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+delete from t1;
+delete from t2;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 1048576 1024 3 # 0 10 0 13 4
+default 2 2 1048576 1024 1 # 0 12 0 13 2
+default 2 NULL 2097152 1024 4 # 0 22 0 26 6
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+set global key_cache_partitions=1;
+select @@key_cache_partitions;
+@@key_cache_partitions
+1
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 1 1 2097152 1024 0 # 0 0 0 0 0
+default 1 NULL 2097152 1024 0 # 0 0 0 0 0
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+(3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+p a
+1 qqqq
+2 yyyy
+select * from t2;
+p i a
+1 1 qqqq
+2 1 pppp
+3 1 yyyy
+4 3 zzzz
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+show status like 'key_%';
+Variable_name Value
+Key_blocks_not_flushed 0
+Key_blocks_unused KEY_BLOCKS_UNUSED
+Key_blocks_used 4
+Key_read_requests 22
+Key_reads 0
+Key_write_requests 26
+Key_writes 6
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 1 1 2097152 1024 4 # 0 22 0 26 6
+default 1 NULL 2097152 1024 4 # 0 22 0 26 6
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+delete from t1;
+delete from t2;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 1 1 2097152 1024 4 # 0 22 0 26 6
+default 1 NULL 2097152 1024 4 # 0 22 0 26 6
+small NULL NULL 1048576 1024 1 # 0 1 0 2 1
+flush tables;
+flush status;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 1 1 2097152 1024 4 # 0 0 0 0 0
+default 1 NULL 2097152 1024 4 # 0 0 0 0 0
+small NULL NULL 1048576 1024 1 # 0 0 0 0 0
+set global key_buffer_size=32*1024;
+select @@key_buffer_size;
+@@key_buffer_size
+32768
+set global key_cache_partitions=2;
+select @@key_cache_partitions;
+@@key_cache_partitions
+2
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 0 # 0 0 0 0 0
+default 2 2 16384 1024 0 # 0 0 0 0 0
+default 2 NULL 32768 1024 0 # 0 0 0 0 0
+small NULL NULL 1048576 1024 1 # 0 0 0 0 0
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+(3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+p a
+1 qqqq
+2 yyyy
+select * from t2;
+p i a
+1 1 qqqq
+2 1 pppp
+3 1 yyyy
+4 3 zzzz
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 1 # 0 12 0 13 2
+default 2 2 16384 1024 3 # 0 10 0 13 4
+default 2 NULL 32768 1024 4 # 0 22 0 26 6
+small NULL NULL 1048576 1024 1 # 0 0 0 0 0
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 1951 # 1976 43
+default 2 2 16384 1024 # # 0 4782 # 1708 60
+default 2 NULL 32768 1024 # # 0 6733 # 3684 103
+small NULL NULL 1048576 1024 # # 0 0 # 0 0
+select * from t1 where p between 1010 and 1020 ;
+p a
+select * from t2 where p between 1010 and 1020 ;
+p i a
+1010 2 pppp
+1011 2 yyyy
+1012 3 zzzz
+1013 2 qqqq
+1014 2 pppp
+1015 2 yyyy
+1016 3 zzzz
+1017 2 qqqq
+1018 2 pppp
+1019 2 yyyy
+1020 3 zzzz
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 1954 # 1976 43
+default 2 2 16384 1024 # # 0 4796 # 1708 60
+default 2 NULL 32768 1024 # # 0 6750 # 3684 103
+small NULL NULL 1048576 1024 # # 0 0 # 0 0
+flush tables;
+flush status;
+update t1 set a='zzzz' where a='qqqq';
+update t2 set i=1 where i=2;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 940 10 939 10
+default 2 2 16384 1024 # # 0 2136 8 613 8
+default 2 NULL 32768 1024 # # 0 3076 18 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_buffer_size=256*1024;
+select @@keycache1.key_buffer_size;
+@@keycache1.key_buffer_size
+262144
+set global keycache1.key_cache_partitions=7;
+select @@keycache1.key_cache_partitions;
+@@keycache1.key_cache_partitions
+7
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 940 10 939 10
+default 2 2 16384 1024 # # 0 2136 8 613 8
+default 2 NULL 32768 1024 # # 0 3076 18 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 7 1 37449 2048 # # 0 0 0 0 0
+keycache1 7 2 37449 2048 # # 0 0 0 0 0
+keycache1 7 3 37449 2048 # # 0 0 0 0 0
+keycache1 7 4 37449 2048 # # 0 0 0 0 0
+keycache1 7 5 37449 2048 # # 0 0 0 0 0
+keycache1 7 6 37449 2048 # # 0 0 0 0 0
+keycache1 7 7 37449 2048 # # 0 0 0 0 0
+keycache1 7 NULL 262143 2048 # # 0 0 0 0 0
+select * from information_schema.key_caches where key_cache_name like "key%";
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+keycache1 7 1 37449 2048 0 # 0 0 0 0 0
+keycache1 7 2 37449 2048 0 # 0 0 0 0 0
+keycache1 7 3 37449 2048 0 # 0 0 0 0 0
+keycache1 7 4 37449 2048 0 # 0 0 0 0 0
+keycache1 7 5 37449 2048 0 # 0 0 0 0 0
+keycache1 7 6 37449 2048 0 # 0 0 0 0 0
+keycache1 7 7 37449 2048 0 # 0 0 0 0 0
+keycache1 7 NULL 262143 2048 0 # 0 0 0 0 0
+cache index t1 key (`primary`) in keycache1;
+Table Op Msg_type Msg_text
+test.t1 assign_to_keycache status OK
+explain select p from t1 where p between 1010 and 1020;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 range PRIMARY PRIMARY 4 NULL 1 Using where; Using index
+select p from t1 where p between 1010 and 1020;
+p
+explain select i from t2 where p between 1010 and 1020;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range PRIMARY PRIMARY 4 NULL 28 Using where
+select i from t2 where p between 1010 and 1020;
+i
+1
+1
+3
+1
+1
+1
+3
+1
+1
+1
+3
+explain select count(*) from t1, t2 where t1.p = t2.i;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 index k1 k1 5 NULL 1024 Using index
+1 SIMPLE t1 eq_ref PRIMARY PRIMARY 4 test.t2.i 1 Using index
+select count(*) from t1, t2 where t1.p = t2.i;
+count(*)
+256
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 7 1 37449 2048 # # 0 2 1 0 0
+keycache1 7 2 37449 2048 # # 0 7 1 0 0
+keycache1 7 3 37449 2048 # # 0 0 0 0 0
+keycache1 7 4 37449 2048 # # 0 5 1 0 0
+keycache1 7 5 37449 2048 # # 0 0 0 0 0
+keycache1 7 6 37449 2048 # # 0 0 0 0 0
+keycache1 7 7 37449 2048 # # 0 0 0 0 0
+keycache1 7 NULL 262143 2048 # # 0 14 3 0 0
+select * from information_schema.key_caches where key_cache_name like "key%";
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+keycache1 7 1 37449 2048 1 # 0 2 1 0 0
+keycache1 7 2 37449 2048 1 # 0 7 1 0 0
+keycache1 7 3 37449 2048 0 # 0 0 0 0 0
+keycache1 7 4 37449 2048 1 # 0 5 1 0 0
+keycache1 7 5 37449 2048 0 # 0 0 0 0 0
+keycache1 7 6 37449 2048 0 # 0 0 0 0 0
+keycache1 7 7 37449 2048 0 # 0 0 0 0 0
+keycache1 7 NULL 262143 2048 3 # 0 14 3 0 0
+cache index t2 in keycache1;
+Table Op Msg_type Msg_text
+test.t2 assign_to_keycache status OK
+update t2 set p=p+3000, i=2 where a='qqqq';
+select * from information_schema.key_caches where key_cache_name like "key%";
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+keycache1 7 1 37449 2048 3 # 0 44 3 43 2
+keycache1 7 2 37449 2048 4 # 0 61 4 51 1
+keycache1 7 3 37449 2048 4 # 0 177 4 176 3
+keycache1 7 4 37449 2048 4 # 0 122 4 119 3
+keycache1 7 5 37449 2048 4 # 0 840 4 335 4
+keycache1 7 6 37449 2048 3 # 0 627 3 133 3
+keycache1 7 7 37449 2048 3 # 0 211 3 214 3
+keycache1 7 NULL 262143 2048 25 # 0 2082 25 1071 19
+set global keycache2.key_buffer_size=1024*1024;
+cache index t2 in keycache2;
+Table Op Msg_type Msg_text
+test.t2 assign_to_keycache status OK
+insert into t2 values (2000, 3, 'yyyy');
+select * from information_schema.key_caches where key_cache_name like "keycache2";
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+keycache2 NULL NULL 1048576 1024 0 # 0 0 0 0 0
+select * from information_schema.key_caches where key_cache_name like "key%";
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+keycache1 7 1 37449 2048 3 # 0 44 3 43 2
+keycache1 7 2 37449 2048 4 # 0 61 4 51 1
+keycache1 7 3 37449 2048 4 # 0 177 4 176 3
+keycache1 7 4 37449 2048 4 # 0 122 4 119 3
+keycache1 7 5 37449 2048 4 # 0 840 4 335 4
+keycache1 7 6 37449 2048 3 # 0 627 3 133 3
+keycache1 7 7 37449 2048 3 # 0 211 3 214 3
+keycache1 7 NULL 262143 2048 25 # 0 2082 25 1071 19
+keycache2 NULL NULL 1048576 1024 0 # 0 0 0 0 0
+cache index t2 in keycache1;
+Table Op Msg_type Msg_text
+test.t2 assign_to_keycache status OK
+update t2 set p=p+5000 where a='zzzz';
+select * from t2 where p between 1010 and 1020;
+p i a
+1010 1 pppp
+1011 1 yyyy
+1014 1 pppp
+1015 1 yyyy
+1018 1 pppp
+1019 1 yyyy
+explain select p from t2 where p between 1010 and 1020;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range PRIMARY PRIMARY 4 NULL 7 Using where; Using index
+select p from t2 where p between 1010 and 1020;
+p
+1010
+1011
+1014
+1015
+1018
+1019
+explain select i from t2 where a='yyyy' and i=3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref k1,k2 k1 5 const 188 Using where
+select i from t2 where a='yyyy' and i=3;
+i
+3
+explain select a from t2 where a='yyyy' and i=3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref k1,k2 k1 5 const 188 Using where
+select a from t2 where a='yyyy' and i=3 ;
+a
+yyyy
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 7 1 37449 2048 # # 0 85 6 68 3
+keycache1 7 2 37449 2048 # # 0 122 6 102 2
+keycache1 7 3 37449 2048 # # 0 271 8 254 6
+keycache1 7 4 37449 2048 # # 0 179 6 170 4
+keycache1 7 5 37449 2048 # # 0 1445 7 416 6
+keycache1 7 6 37449 2048 # # 0 863 6 345 5
+keycache1 7 7 37449 2048 # # 0 236 4 239 4
+keycache1 7 NULL 262143 2048 # # 0 3201 43 1594 30
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_cache_block_size=2*1024;
+insert into t2 values (7000, 3, 'yyyy');
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 7 1 37449 2048 # # 0 1 1 1 1
+keycache1 7 2 37449 2048 # # 0 1 1 0 0
+keycache1 7 3 37449 2048 # # 0 0 0 0 0
+keycache1 7 4 37449 2048 # # 0 1 1 1 1
+keycache1 7 5 37449 2048 # # 0 1 1 0 0
+keycache1 7 6 37449 2048 # # 0 2 2 1 1
+keycache1 7 7 37449 2048 # # 0 0 0 0 0
+keycache1 7 NULL 262143 2048 # # 0 6 6 3 3
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_cache_block_size=8*1024;
+insert into t2 values (8000, 3, 'yyyy');
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 3 1 87381 8192 # # 0 1 1 1 1
+keycache1 3 2 87381 8192 # # 0 3 2 1 1
+keycache1 3 3 87381 8192 # # 0 2 2 1 1
+keycache1 3 NULL 262143 8192 # # 0 6 5 3 3
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_buffer_size=64*1024;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_cache_block_size=2*1024;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 3 1 21845 2048 # # 0 0 0 0 0
+keycache1 3 2 21845 2048 # # 0 0 0 0 0
+keycache1 3 3 21845 2048 # # 0 0 0 0 0
+keycache1 3 NULL 65535 2048 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_cache_block_size=8*1024;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_buffer_size=0;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_cache_block_size=8*1024;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_buffer_size=0;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_buffer_size=128*1024;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 1 1 131072 8192 # # 0 0 0 0 0
+keycache1 1 NULL 131072 8192 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+set global keycache1.key_cache_block_size=1024;
+select * from information_schema.key_caches;
+KEY_CACHE_NAME PARTITIONS PARTITION_NUMBER FULL_SIZE BLOCK_SIZE USED_BLOCKS UNUSED_BLOCKS DIRTY_BLOCKS READ_REQUESTS READS WRITE_REQUESTS WRITES
+default 2 1 16384 1024 # # 0 966 12 939 10
+default 2 2 16384 1024 # # 0 2206 12 613 8
+default 2 NULL 32768 1024 # # 0 3172 24 1552 18
+small NULL NULL 1048576 1024 # # 0 0 0 0 0
+keycache1 7 1 18724 1024 # # 0 0 0 0 0
+keycache1 7 2 18724 1024 # # 0 0 0 0 0
+keycache1 7 3 18724 1024 # # 0 0 0 0 0
+keycache1 7 4 18724 1024 # # 0 0 0 0 0
+keycache1 7 5 18724 1024 # # 0 0 0 0 0
+keycache1 7 6 18724 1024 # # 0 0 0 0 0
+keycache1 7 7 18724 1024 # # 0 0 0 0 0
+keycache1 7 NULL 131068 1024 # # 0 0 0 0 0
+keycache2 NULL NULL 1048576 1024 # # 0 0 0 0 0
+drop table t1,t2;
+set global keycache1.key_buffer_size=0;
+set global keycache2.key_buffer_size=0;
+set global key_buffer_size=@save_key_buffer_size;
+set global key_cache_partitions=@save_key_cache_partitions;
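
(For readers skimming the hunk above: the partitioned-key-cache test boils down to a small, repeatable workflow — pick a partition count, size the named cache, assign indexes to it, then read the per-partition counters back from information_schema.key_caches. A minimal sketch using only statements that appear in the test follows; the sizes are illustrative, and note that the server may create fewer partitions than requested when the buffer is small, as the 7 -> 3 -> 1 transitions in the results above show.)

  -- global cap on partitions for key caches created/resized afterwards
  set global key_cache_partitions = 7;
  -- sizing the named cache creates it; block size is per-cache
  set global keycache1.key_buffer_size = 256*1024;
  set global keycache1.key_cache_block_size = 2*1024;
  -- bind an index (or all of a table's indexes) to the cache
  cache index t1 key (`primary`) in keycache1;
  -- one row per partition, plus a NULL-numbered aggregate row per cache
  select * from information_schema.key_caches
  where key_cache_name like "key%";
  -- setting the buffer size to 0 destroys the cache: it disappears
  -- from information_schema.key_caches, as in the results above
  set global keycache1.key_buffer_size = 0;
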
=== added file 'mysql-test/r/select_pkeycache.result'
--- a/mysql-test/r/select_pkeycache.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/r/select_pkeycache.result 2010-02-16 16:41:11 +0000
@@ -0,0 +1,4636 @@
+drop table if exists t1,t2,t3,t4,t11;
+drop table if exists t1_1,t1_2,t9_1,t9_2,t1aa,t2aa;
+drop view if exists v1;
+CREATE TABLE t1 (
+Period smallint(4) unsigned zerofill DEFAULT '0000' NOT NULL,
+Varor_period smallint(4) unsigned DEFAULT '0' NOT NULL
+);
+INSERT INTO t1 VALUES (9410,9412);
+select period from t1;
+period
+9410
+select * from t1;
+Period Varor_period
+9410 9412
+select t1.* from t1;
+Period Varor_period
+9410 9412
+CREATE TABLE t2 (
+auto int not null auto_increment,
+fld1 int(6) unsigned zerofill DEFAULT '000000' NOT NULL,
+companynr tinyint(2) unsigned zerofill DEFAULT '00' NOT NULL,
+fld3 char(30) DEFAULT '' NOT NULL,
+fld4 char(35) DEFAULT '' NOT NULL,
+fld5 char(35) DEFAULT '' NOT NULL,
+fld6 char(4) DEFAULT '' NOT NULL,
+UNIQUE fld1 (fld1),
+KEY fld3 (fld3),
+PRIMARY KEY (auto)
+);
+select t2.fld3 from t2 where companynr = 58 and fld3 like "%imaginable%";
+fld3
+imaginable
+select fld3 from t2 where fld3 like "%cultivation" ;
+fld3
+cultivation
+select t2.fld3,companynr from t2 where companynr = 57+1 order by fld3;
+fld3 companynr
+concoct 58
+druggists 58
+engrossing 58
+Eurydice 58
+exclaimers 58
+ferociousness 58
+hopelessness 58
+Huey 58
+imaginable 58
+judges 58
+merging 58
+ostrich 58
+peering 58
+Phelps 58
+presumes 58
+Ruth 58
+sentences 58
+Shylock 58
+straggled 58
+synergy 58
+thanking 58
+tying 58
+unlocks 58
+select fld3,companynr from t2 where companynr = 58 order by fld3;
+fld3 companynr
+concoct 58
+druggists 58
+engrossing 58
+Eurydice 58
+exclaimers 58
+ferociousness 58
+hopelessness 58
+Huey 58
+imaginable 58
+judges 58
+merging 58
+ostrich 58
+peering 58
+Phelps 58
+presumes 58
+Ruth 58
+sentences 58
+Shylock 58
+straggled 58
+synergy 58
+thanking 58
+tying 58
+unlocks 58
+select fld3 from t2 order by fld3 desc limit 10;
+fld3
+youthfulness
+yelped
+Wotan
+workers
+Witt
+witchcraft
+Winsett
+Willy
+willed
+wildcats
+select fld3 from t2 order by fld3 desc limit 5;
+fld3
+youthfulness
+yelped
+Wotan
+workers
+Witt
+select fld3 from t2 order by fld3 desc limit 5,5;
+fld3
+witchcraft
+Winsett
+Willy
+willed
+wildcats
+select t2.fld3 from t2 where fld3 = 'honeysuckle';
+fld3
+honeysuckle
+select t2.fld3 from t2 where fld3 LIKE 'honeysuckl_';
+fld3
+honeysuckle
+select t2.fld3 from t2 where fld3 LIKE 'hon_ysuckl_';
+fld3
+honeysuckle
+select t2.fld3 from t2 where fld3 LIKE 'honeysuckle%';
+fld3
+honeysuckle
+select t2.fld3 from t2 where fld3 LIKE 'h%le';
+fld3
+honeysuckle
+select t2.fld3 from t2 where fld3 LIKE 'honeysuckle_';
+fld3
+select t2.fld3 from t2 where fld3 LIKE 'don_t_find_me_please%';
+fld3
+explain select t2.fld3 from t2 where fld3 = 'honeysuckle';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref fld3 fld3 30 const 1 Using where; Using index
+explain select fld3 from t2 ignore index (fld3) where fld3 = 'honeysuckle';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+explain select fld3 from t2 use index (fld1) where fld3 = 'honeysuckle';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+explain select fld3 from t2 use index (fld3) where fld3 = 'honeysuckle';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref fld3 fld3 30 const 1 Using where; Using index
+explain select fld3 from t2 use index (fld1,fld3) where fld3 = 'honeysuckle';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref fld3 fld3 30 const 1 Using where; Using index
+explain select fld3 from t2 ignore index (fld3,not_used);
+ERROR 42000: Key 'not_used' doesn't exist in table 't2'
+explain select fld3 from t2 use index (not_used);
+ERROR 42000: Key 'not_used' doesn't exist in table 't2'
+select t2.fld3 from t2 where fld3 >= 'honeysuckle' and fld3 <= 'honoring' order by fld3;
+fld3
+honeysuckle
+honoring
+explain select t2.fld3 from t2 where fld3 >= 'honeysuckle' and fld3 <= 'honoring' order by fld3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range fld3 fld3 30 NULL 2 Using where; Using index
+select fld1,fld3 from t2 where fld3="Colombo" or fld3 = "nondecreasing" order by fld3;
+fld1 fld3
+148504 Colombo
+068305 Colombo
+000000 nondecreasing
+select fld1,fld3 from t2 where companynr = 37 and fld3 = 'appendixes';
+fld1 fld3
+232605 appendixes
+1232605 appendixes
+1232606 appendixes
+1232607 appendixes
+1232608 appendixes
+1232609 appendixes
+select fld1 from t2 where fld1=250501 or fld1="250502";
+fld1
+250501
+250502
+explain select fld1 from t2 where fld1=250501 or fld1="250502";
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range fld1 fld1 4 NULL 2 Using where; Using index
+select fld1 from t2 where fld1=250501 or fld1=250502 or fld1 >= 250505 and fld1 <= 250601 or fld1 between 250501 and 250502;
+fld1
+250501
+250502
+250505
+250601
+explain select fld1 from t2 where fld1=250501 or fld1=250502 or fld1 >= 250505 and fld1 <= 250601 or fld1 between 250501 and 250502;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range fld1 fld1 4 NULL 4 Using where; Using index
+select fld1,fld3 from t2 where companynr = 37 and fld3 like 'f%';
+fld1 fld3
+218401 faithful
+018007 fanatic
+228311 fated
+018017 featherweight
+218022 feed
+088303 feminine
+058004 Fenton
+038017 fetched
+018054 fetters
+208101 fiftieth
+238007 filial
+013606 fingerings
+218008 finishers
+038205 firearm
+188505 fitting
+202301 Fitzpatrick
+238008 fixedly
+012001 flanking
+018103 flint
+018104 flopping
+188007 flurried
+013602 foldout
+226205 foothill
+232102 forgivably
+228306 forthcoming
+186002 freakish
+208113 freest
+231315 freezes
+036002 funereal
+226209 furnishings
+198006 furthermore
+select fld3 from t2 where fld3 like "L%" and fld3 = "ok";
+fld3
+select fld3 from t2 where (fld3 like "C%" and fld3 = "Chantilly");
+fld3
+Chantilly
+select fld1,fld3 from t2 where fld1 like "25050%";
+fld1 fld3
+250501 poisoning
+250502 Iraqis
+250503 heaving
+250504 population
+250505 bomb
+select fld1,fld3 from t2 where fld1 like "25050_";
+fld1 fld3
+250501 poisoning
+250502 Iraqis
+250503 heaving
+250504 population
+250505 bomb
+select distinct companynr from t2;
+companynr
+00
+37
+36
+50
+58
+29
+40
+53
+65
+41
+34
+68
+select distinct companynr from t2 order by companynr;
+companynr
+00
+29
+34
+36
+37
+40
+41
+50
+53
+58
+65
+68
+select distinct companynr from t2 order by companynr desc;
+companynr
+68
+65
+58
+53
+50
+41
+40
+37
+36
+34
+29
+00
+select distinct t2.fld3,period from t2,t1 where companynr=37 and fld3 like "O%";
+fld3 period
+obliterates 9410
+offload 9410
+opaquely 9410
+organizer 9410
+overestimating 9410
+overlay 9410
+select distinct fld3 from t2 where companynr = 34 order by fld3;
+fld3
+absentee
+accessed
+ahead
+alphabetic
+Asiaticizations
+attitude
+aye
+bankruptcies
+belays
+Blythe
+bomb
+boulevard
+bulldozes
+cannot
+caressing
+charcoal
+checksumming
+chess
+clubroom
+colorful
+cosy
+creator
+crying
+Darius
+diffusing
+duality
+Eiffel
+Epiphany
+Ernestine
+explorers
+exterminated
+famine
+forked
+Gershwins
+heaving
+Hodges
+Iraqis
+Italianization
+Lagos
+landslide
+libretto
+Majorca
+mastering
+narrowed
+occurred
+offerers
+Palestine
+Peruvianizes
+pharmaceutic
+poisoning
+population
+Pygmalion
+rats
+realest
+recording
+regimented
+retransmitting
+reviver
+rouses
+scars
+sicker
+sleepwalk
+stopped
+sugars
+translatable
+uncles
+unexpected
+uprisings
+versatility
+vest
+select distinct fld3 from t2 limit 10;
+fld3
+abates
+abiding
+Abraham
+abrogating
+absentee
+abut
+accessed
+accruing
+accumulating
+accuracies
+select distinct fld3 from t2 having fld3 like "A%" limit 10;
+fld3
+abates
+abiding
+Abraham
+abrogating
+absentee
+abut
+accessed
+accruing
+accumulating
+accuracies
+select distinct substring(fld3,1,3) from t2 where fld3 like "A%";
+substring(fld3,1,3)
+aba
+abi
+Abr
+abs
+abu
+acc
+acq
+acu
+Ade
+adj
+Adl
+adm
+Ado
+ads
+adv
+aer
+aff
+afi
+afl
+afo
+agi
+ahe
+aim
+air
+Ald
+alg
+ali
+all
+alp
+alr
+ama
+ame
+amm
+ana
+and
+ane
+Ang
+ani
+Ann
+Ant
+api
+app
+aqu
+Ara
+arc
+Arm
+arr
+Art
+Asi
+ask
+asp
+ass
+ast
+att
+aud
+Aug
+aut
+ave
+avo
+awe
+aye
+Azt
+select distinct substring(fld3,1,3) as a from t2 having a like "A%" order by a limit 10;
+a
+aba
+abi
+Abr
+abs
+abu
+acc
+acq
+acu
+Ade
+adj
+select distinct substring(fld3,1,3) from t2 where fld3 like "A%" limit 10;
+substring(fld3,1,3)
+aba
+abi
+Abr
+abs
+abu
+acc
+acq
+acu
+Ade
+adj
+select distinct substring(fld3,1,3) as a from t2 having a like "A%" limit 10;
+a
+aba
+abi
+Abr
+abs
+abu
+acc
+acq
+acu
+Ade
+adj
+create table t3 (
+period int not null,
+name char(32) not null,
+companynr int not null,
+price double(11,0),
+price2 double(11,0),
+key (period),
+key (name)
+);
+create temporary table tmp engine = myisam select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+insert into tmp select * from t3;
+insert into t3 select * from tmp;
+alter table t3 add t2nr int not null auto_increment primary key first;
+drop table tmp;
+SET SQL_BIG_TABLES=1;
+select distinct concat(fld3," ",fld3) as namn from t2,t3 where t2.fld1=t3.t2nr order by namn limit 10;
+namn
+Abraham Abraham
+abrogating abrogating
+admonishing admonishing
+Adolph Adolph
+afield afield
+aging aging
+ammonium ammonium
+analyzable analyzable
+animals animals
+animized animized
+SET SQL_BIG_TABLES=0;
+select distinct concat(fld3," ",fld3) from t2,t3 where t2.fld1=t3.t2nr order by fld3 limit 10;
+concat(fld3," ",fld3)
+Abraham Abraham
+abrogating abrogating
+admonishing admonishing
+Adolph Adolph
+afield afield
+aging aging
+ammonium ammonium
+analyzable analyzable
+animals animals
+animized animized
+select distinct fld5 from t2 limit 10;
+fld5
+neat
+Steinberg
+jarring
+tinily
+balled
+persist
+attainments
+fanatic
+measures
+rightfulness
+select distinct fld3,count(*) from t2 group by companynr,fld3 limit 10;
+fld3 count(*)
+affixed 1
+and 1
+annoyers 1
+Anthony 1
+assayed 1
+assurers 1
+attendants 1
+bedlam 1
+bedpost 1
+boasted 1
+SET SQL_BIG_TABLES=1;
+select distinct fld3,count(*) from t2 group by companynr,fld3 limit 10;
+fld3 count(*)
+affixed 1
+and 1
+annoyers 1
+Anthony 1
+assayed 1
+assurers 1
+attendants 1
+bedlam 1
+bedpost 1
+boasted 1
+SET SQL_BIG_TABLES=0;
+select distinct fld3,repeat("a",length(fld3)),count(*) from t2 group by companynr,fld3 limit 100,10;
+fld3 repeat("a",length(fld3)) count(*)
+circus aaaaaa 1
+cited aaaaa 1
+Colombo aaaaaaa 1
+congresswoman aaaaaaaaaaaaa 1
+contrition aaaaaaaaaa 1
+corny aaaaa 1
+cultivation aaaaaaaaaaa 1
+definiteness aaaaaaaaaaaa 1
+demultiplex aaaaaaaaaaa 1
+disappointing aaaaaaaaaaaaa 1
+select distinct companynr,rtrim(space(512+companynr)) from t3 order by 1,2;
+companynr rtrim(space(512+companynr))
+37
+78
+101
+154
+311
+447
+512
+select distinct fld3 from t2,t3 where t2.companynr = 34 and t2.fld1=t3.t2nr order by fld3;
+fld3
+explain select t3.t2nr,fld3 from t2,t3 where t2.companynr = 34 and t2.fld1=t3.t2nr order by t3.t2nr,fld3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL fld1 NULL NULL NULL 1199 Using where; Using temporary; Using filesort
+1 SIMPLE t3 eq_ref PRIMARY PRIMARY 4 test.t2.fld1 1 Using where; Using index
+explain select * from t3 as t1,t3 where t1.period=t3.period order by t3.period;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL period NULL NULL NULL 41810 Using temporary; Using filesort
+1 SIMPLE t3 ref period period 4 test.t1.period 4181
+explain select * from t3 as t1,t3 where t1.period=t3.period order by t3.period limit 10;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t3 index period period 4 NULL 1
+1 SIMPLE t1 ref period period 4 test.t3.period 4181
+explain select * from t3 as t1,t3 where t1.period=t3.period order by t1.period limit 10;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index period period 4 NULL 1
+1 SIMPLE t3 ref period period 4 test.t1.period 4181
+select period from t1;
+period
+9410
+select period from t1 where period=1900;
+period
+select fld3,period from t1,t2 where fld1 = 011401 order by period;
+fld3 period
+breaking 9410
+select fld3,period from t2,t3 where t2.fld1 = 011401 and t2.fld1=t3.t2nr and t3.period=1001;
+fld3 period
+breaking 1001
+explain select fld3,period from t2,t3 where t2.fld1 = 011401 and t3.t2nr=t2.fld1 and 1001 = t3.period;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 const fld1 fld1 4 const 1
+1 SIMPLE t3 const PRIMARY,period PRIMARY 4 const 1
+select fld3,period from t2,t1 where companynr*10 = 37*10;
+fld3 period
+breaking 9410
+Romans 9410
+intercepted 9410
+bewilderingly 9410
+astound 9410
+admonishing 9410
+sumac 9410
+flanking 9410
+combed 9410
+subjective 9410
+scatterbrain 9410
+Eulerian 9410
+Kane 9410
+overlay 9410
+perturb 9410
+goblins 9410
+annihilates 9410
+Wotan 9410
+snatching 9410
+concludes 9410
+laterally 9410
+yelped 9410
+grazing 9410
+Baird 9410
+celery 9410
+misunderstander 9410
+handgun 9410
+foldout 9410
+mystic 9410
+succumbed 9410
+Nabisco 9410
+fingerings 9410
+aging 9410
+afield 9410
+ammonium 9410
+boat 9410
+intelligibility 9410
+Augustine 9410
+teethe 9410
+dreaded 9410
+scholastics 9410
+audiology 9410
+wallet 9410
+parters 9410
+eschew 9410
+quitter 9410
+neat 9410
+Steinberg 9410
+jarring 9410
+tinily 9410
+balled 9410
+persist 9410
+attainments 9410
+fanatic 9410
+measures 9410
+rightfulness 9410
+capably 9410
+impulsive 9410
+starlet 9410
+terminators 9410
+untying 9410
+announces 9410
+featherweight 9410
+pessimist 9410
+daughter 9410
+decliner 9410
+lawgiver 9410
+stated 9410
+readable 9410
+attrition 9410
+cascade 9410
+motors 9410
+interrogate 9410
+pests 9410
+stairway 9410
+dopers 9410
+testicle 9410
+Parsifal 9410
+leavings 9410
+postulation 9410
+squeaking 9410
+contrasted 9410
+leftover 9410
+whiteners 9410
+erases 9410
+Punjab 9410
+Merritt 9410
+Quixotism 9410
+sweetish 9410
+dogging 9410
+scornfully 9410
+bellow 9410
+bills 9410
+cupboard 9410
+sureties 9410
+puddings 9410
+fetters 9410
+bivalves 9410
+incurring 9410
+Adolph 9410
+pithed 9410
+Miles 9410
+trimmings 9410
+tragedies 9410
+skulking 9410
+flint 9410
+flopping 9410
+relaxing 9410
+offload 9410
+suites 9410
+lists 9410
+animized 9410
+multilayer 9410
+standardizes 9410
+Judas 9410
+vacuuming 9410
+dentally 9410
+humanness 9410
+inch 9410
+Weissmuller 9410
+irresponsibly 9410
+luckily 9410
+culled 9410
+medical 9410
+bloodbath 9410
+subschema 9410
+animals 9410
+Micronesia 9410
+repetitions 9410
+Antares 9410
+ventilate 9410
+pityingly 9410
+interdependent 9410
+Graves 9410
+neonatal 9410
+chafe 9410
+honoring 9410
+realtor 9410
+elite 9410
+funereal 9410
+abrogating 9410
+sorters 9410
+Conley 9410
+lectured 9410
+Abraham 9410
+Hawaii 9410
+cage 9410
+hushes 9410
+Simla 9410
+reporters 9410
+Dutchman 9410
+descendants 9410
+groupings 9410
+dissociate 9410
+coexist 9410
+Beebe 9410
+Taoism 9410
+Connally 9410
+fetched 9410
+checkpoints 9410
+rusting 9410
+galling 9410
+obliterates 9410
+traitor 9410
+resumes 9410
+analyzable 9410
+terminator 9410
+gritty 9410
+firearm 9410
+minima 9410
+Selfridge 9410
+disable 9410
+witchcraft 9410
+betroth 9410
+Manhattanize 9410
+imprint 9410
+peeked 9410
+swelling 9410
+interrelationships 9410
+riser 9410
+Gandhian 9410
+peacock 9410
+bee 9410
+kanji 9410
+dental 9410
+scarf 9410
+chasm 9410
+insolence 9410
+syndicate 9410
+alike 9410
+imperial 9410
+convulsion 9410
+railway 9410
+validate 9410
+normalizes 9410
+comprehensive 9410
+chewing 9410
+denizen 9410
+schemer 9410
+chronicle 9410
+Kline 9410
+Anatole 9410
+partridges 9410
+brunch 9410
+recruited 9410
+dimensions 9410
+Chicana 9410
+announced 9410
+praised 9410
+employing 9410
+linear 9410
+quagmire 9410
+western 9410
+relishing 9410
+serving 9410
+scheduling 9410
+lore 9410
+eventful 9410
+arteriole 9410
+disentangle 9410
+cured 9410
+Fenton 9410
+avoidable 9410
+drains 9410
+detectably 9410
+husky 9410
+impelling 9410
+undoes 9410
+evened 9410
+squeezes 9410
+destroyer 9410
+rudeness 9410
+beaner 9410
+boorish 9410
+Everhart 9410
+encompass 9410
+mushrooms 9410
+Alison 9410
+externally 9410
+pellagra 9410
+cult 9410
+creek 9410
+Huffman 9410
+Majorca 9410
+governing 9410
+gadfly 9410
+reassigned 9410
+intentness 9410
+craziness 9410
+psychic 9410
+squabbled 9410
+burlesque 9410
+capped 9410
+extracted 9410
+DiMaggio 9410
+exclamation 9410
+subdirectory 9410
+Gothicism 9410
+feminine 9410
+metaphysically 9410
+sanding 9410
+Miltonism 9410
+freakish 9410
+index 9410
+straight 9410
+flurried 9410
+denotative 9410
+coming 9410
+commencements 9410
+gentleman 9410
+gifted 9410
+Shanghais 9410
+sportswriting 9410
+sloping 9410
+navies 9410
+leaflet 9410
+shooter 9410
+Joplin 9410
+babies 9410
+assails 9410
+admiring 9410
+swaying 9410
+Goldstine 9410
+fitting 9410
+Norwalk 9410
+analogy 9410
+deludes 9410
+cokes 9410
+Clayton 9410
+exhausts 9410
+causality 9410
+sating 9410
+icon 9410
+throttles 9410
+communicants 9410
+dehydrate 9410
+priceless 9410
+publicly 9410
+incidentals 9410
+commonplace 9410
+mumbles 9410
+furthermore 9410
+cautioned 9410
+parametrized 9410
+registration 9410
+sadly 9410
+positioning 9410
+babysitting 9410
+eternal 9410
+hoarder 9410
+congregates 9410
+rains 9410
+workers 9410
+sags 9410
+unplug 9410
+garage 9410
+boulder 9410
+specifics 9410
+Teresa 9410
+Winsett 9410
+convenient 9410
+buckboards 9410
+amenities 9410
+resplendent 9410
+sews 9410
+participated 9410
+Simon 9410
+certificates 9410
+Fitzpatrick 9410
+Evanston 9410
+misted 9410
+textures 9410
+save 9410
+count 9410
+rightful 9410
+chaperone 9410
+Lizzy 9410
+clenched 9410
+effortlessly 9410
+accessed 9410
+beaters 9410
+Hornblower 9410
+vests 9410
+indulgences 9410
+infallibly 9410
+unwilling 9410
+excrete 9410
+spools 9410
+crunches 9410
+overestimating 9410
+ineffective 9410
+humiliation 9410
+sophomore 9410
+star 9410
+rifles 9410
+dialysis 9410
+arriving 9410
+indulge 9410
+clockers 9410
+languages 9410
+Antarctica 9410
+percentage 9410
+ceiling 9410
+specification 9410
+regimented 9410
+ciphers 9410
+pictures 9410
+serpents 9410
+allot 9410
+realized 9410
+mayoral 9410
+opaquely 9410
+hostess 9410
+fiftieth 9410
+incorrectly 9410
+decomposition 9410
+stranglings 9410
+mixture 9410
+electroencephalography 9410
+similarities 9410
+charges 9410
+freest 9410
+Greenberg 9410
+tinting 9410
+expelled 9410
+warm 9410
+smoothed 9410
+deductions 9410
+Romano 9410
+bitterroot 9410
+corset 9410
+securing 9410
+environing 9410
+cute 9410
+Crays 9410
+heiress 9410
+inform 9410
+avenge 9410
+universals 9410
+Kinsey 9410
+ravines 9410
+bestseller 9410
+equilibrium 9410
+extents 9410
+relatively 9410
+pressure 9410
+critiques 9410
+befouled 9410
+rightfully 9410
+mechanizing 9410
+Latinizes 9410
+timesharing 9410
+Aden 9410
+embassies 9410
+males 9410
+shapelessly 9410
+mastering 9410
+Newtonian 9410
+finishers 9410
+abates 9410
+teem 9410
+kiting 9410
+stodgy 9410
+feed 9410
+guitars 9410
+airships 9410
+store 9410
+denounces 9410
+Pyle 9410
+Saxony 9410
+serializations 9410
+Peruvian 9410
+taxonomically 9410
+kingdom 9410
+stint 9410
+Sault 9410
+faithful 9410
+Ganymede 9410
+tidiness 9410
+gainful 9410
+contrary 9410
+Tipperary 9410
+tropics 9410
+theorizers 9410
+renew 9410
+already 9410
+terminal 9410
+Hegelian 9410
+hypothesizer 9410
+warningly 9410
+journalizing 9410
+nested 9410
+Lars 9410
+saplings 9410
+foothill 9410
+labeled 9410
+imperiously 9410
+reporters 9410
+furnishings 9410
+precipitable 9410
+discounts 9410
+excises 9410
+Stalin 9410
+despot 9410
+ripeness 9410
+Arabia 9410
+unruly 9410
+mournfulness 9410
+boom 9410
+slaughter 9410
+Sabine 9410
+handy 9410
+rural 9410
+organizer 9410
+shipyard 9410
+civics 9410
+inaccuracy 9410
+rules 9410
+juveniles 9410
+comprised 9410
+investigations 9410
+stabilizes 9410
+seminaries 9410
+Hunter 9410
+sporty 9410
+test 9410
+weasels 9410
+CERN 9410
+tempering 9410
+afore 9410
+Galatean 9410
+techniques 9410
+error 9410
+veranda 9410
+severely 9410
+Cassites 9410
+forthcoming 9410
+guides 9410
+vanish 9410
+lied 9410
+sawtooth 9410
+fated 9410
+gradually 9410
+widens 9410
+preclude 9410
+evenhandedly 9410
+percentage 9410
+disobedience 9410
+humility 9410
+gleaning 9410
+petted 9410
+bloater 9410
+minion 9410
+marginal 9410
+apiary 9410
+measures 9410
+precaution 9410
+repelled 9410
+primary 9410
+coverings 9410
+Artemia 9410
+navigate 9410
+spatial 9410
+Gurkha 9410
+meanwhile 9410
+Melinda 9410
+Butterfield 9410
+Aldrich 9410
+previewing 9410
+glut 9410
+unaffected 9410
+inmate 9410
+mineral 9410
+impending 9410
+meditation 9410
+ideas 9410
+miniaturizes 9410
+lewdly 9410
+title 9410
+youthfulness 9410
+creak 9410
+Chippewa 9410
+clamored 9410
+freezes 9410
+forgivably 9410
+reduce 9410
+McGovern 9410
+Nazis 9410
+epistle 9410
+socializes 9410
+conceptions 9410
+Kevin 9410
+uncovering 9410
+chews 9410
+appendixes 9410
+appendixes 9410
+appendixes 9410
+appendixes 9410
+appendixes 9410
+appendixes 9410
+raining 9410
+infest 9410
+compartment 9410
+minting 9410
+ducks 9410
+roped 9410
+waltz 9410
+Lillian 9410
+repressions 9410
+chillingly 9410
+noncritical 9410
+lithograph 9410
+spongers 9410
+parenthood 9410
+posed 9410
+instruments 9410
+filial 9410
+fixedly 9410
+relives 9410
+Pandora 9410
+watering 9410
+ungrateful 9410
+secures 9410
+poison 9410
+dusted 9410
+encompasses 9410
+presentation 9410
+Kantian 9410
+select fld3,period,price,price2 from t2,t3 where t2.fld1=t3.t2nr and period >= 1001 and period <= 1002 and t2.companynr = 37 order by fld3,period, price;
+fld3 period price price2
+admonishing 1002 28357832 8723648
+analyzable 1002 28357832 8723648
+annihilates 1001 5987435 234724
+Antares 1002 28357832 8723648
+astound 1001 5987435 234724
+audiology 1001 5987435 234724
+Augustine 1002 28357832 8723648
+Baird 1002 28357832 8723648
+bewilderingly 1001 5987435 234724
+breaking 1001 5987435 234724
+Conley 1001 5987435 234724
+dentally 1002 28357832 8723648
+dissociate 1002 28357832 8723648
+elite 1001 5987435 234724
+eschew 1001 5987435 234724
+Eulerian 1001 5987435 234724
+flanking 1001 5987435 234724
+foldout 1002 28357832 8723648
+funereal 1002 28357832 8723648
+galling 1002 28357832 8723648
+Graves 1001 5987435 234724
+grazing 1001 5987435 234724
+groupings 1001 5987435 234724
+handgun 1001 5987435 234724
+humility 1002 28357832 8723648
+impulsive 1002 28357832 8723648
+inch 1001 5987435 234724
+intelligibility 1001 5987435 234724
+jarring 1001 5987435 234724
+lawgiver 1001 5987435 234724
+lectured 1002 28357832 8723648
+Merritt 1002 28357832 8723648
+neonatal 1001 5987435 234724
+offload 1002 28357832 8723648
+parters 1002 28357832 8723648
+pityingly 1002 28357832 8723648
+puddings 1002 28357832 8723648
+Punjab 1001 5987435 234724
+quitter 1002 28357832 8723648
+realtor 1001 5987435 234724
+relaxing 1001 5987435 234724
+repetitions 1001 5987435 234724
+resumes 1001 5987435 234724
+Romans 1002 28357832 8723648
+rusting 1001 5987435 234724
+scholastics 1001 5987435 234724
+skulking 1002 28357832 8723648
+stated 1002 28357832 8723648
+suites 1002 28357832 8723648
+sureties 1001 5987435 234724
+testicle 1002 28357832 8723648
+tinily 1002 28357832 8723648
+tragedies 1001 5987435 234724
+trimmings 1001 5987435 234724
+vacuuming 1001 5987435 234724
+ventilate 1001 5987435 234724
+wallet 1001 5987435 234724
+Weissmuller 1002 28357832 8723648
+Wotan 1002 28357832 8723648
+select t2.fld1,fld3,period,price,price2 from t2,t3 where t2.fld1>= 18201 and t2.fld1 <= 18811 and t2.fld1=t3.t2nr and period = 1001 and t2.companynr = 37;
+fld1 fld3 period price price2
+018201 relaxing 1001 5987435 234724
+018601 vacuuming 1001 5987435 234724
+018801 inch 1001 5987435 234724
+018811 repetitions 1001 5987435 234724
+create table t4 (
+companynr tinyint(2) unsigned zerofill NOT NULL default '00',
+companyname char(30) NOT NULL default '',
+PRIMARY KEY (companynr),
+UNIQUE KEY companyname(companyname)
+) ENGINE=MyISAM MAX_ROWS=50 PACK_KEYS=1 COMMENT='companynames';
+select STRAIGHT_JOIN t2.companynr,companyname from t4,t2 where t2.companynr=t4.companynr group by t2.companynr;
+companynr companyname
+00 Unknown
+29 company 1
+34 company 2
+36 company 3
+37 company 4
+40 company 5
+41 company 6
+50 company 11
+53 company 7
+58 company 8
+65 company 9
+68 company 10
+select SQL_SMALL_RESULT t2.companynr,companyname from t4,t2 where t2.companynr=t4.companynr group by t2.companynr;
+companynr companyname
+00 Unknown
+29 company 1
+34 company 2
+36 company 3
+37 company 4
+40 company 5
+41 company 6
+50 company 11
+53 company 7
+58 company 8
+65 company 9
+68 company 10
+select * from t1,t1 t12;
+Period Varor_period Period Varor_period
+9410 9412 9410 9412
+select t2.fld1,t22.fld1 from t2,t2 t22 where t2.fld1 >= 250501 and t2.fld1 <= 250505 and t22.fld1 >= 250501 and t22.fld1 <= 250505;
+fld1 fld1
+250501 250501
+250502 250501
+250503 250501
+250504 250501
+250505 250501
+250501 250502
+250502 250502
+250503 250502
+250504 250502
+250505 250502
+250501 250503
+250502 250503
+250503 250503
+250504 250503
+250505 250503
+250501 250504
+250502 250504
+250503 250504
+250504 250504
+250505 250504
+250501 250505
+250502 250505
+250503 250505
+250504 250505
+250505 250505
+insert into t2 (fld1, companynr) values (999999,99);
+select t2.companynr,companyname from t2 left join t4 using (companynr) where t4.companynr is null;
+companynr companyname
+99 NULL
+select count(*) from t2 left join t4 using (companynr) where t4.companynr is not null;
+count(*)
+1199
+explain select t2.companynr,companyname from t2 left join t4 using (companynr) where t4.companynr is null;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1200
+1 SIMPLE t4 eq_ref PRIMARY PRIMARY 1 test.t2.companynr 1 Using where; Not exists
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where t2.companynr is null;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL NULL NULL NULL NULL 12
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1200 Using where; Not exists
+select companynr,companyname from t2 left join t4 using (companynr) where companynr is null;
+companynr companyname
+select count(*) from t2 left join t4 using (companynr) where companynr is not null;
+count(*)
+1200
+explain select companynr,companyname from t2 left join t4 using (companynr) where companynr is null;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
+explain select companynr,companyname from t4 left join t2 using (companynr) where companynr is null;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
+delete from t2 where fld1=999999;
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where t2.companynr > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+1 SIMPLE t4 eq_ref PRIMARY PRIMARY 1 test.t2.companynr 1
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where t2.companynr > 0 or t2.companynr < 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+1 SIMPLE t4 eq_ref PRIMARY PRIMARY 1 test.t2.companynr 1
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where t2.companynr > 0 and t4.companynr > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+1 SIMPLE t4 eq_ref PRIMARY PRIMARY 1 test.t2.companynr 1
+explain select companynr,companyname from t4 left join t2 using (companynr) where companynr > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL PRIMARY NULL NULL NULL 12 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+explain select companynr,companyname from t4 left join t2 using (companynr) where companynr > 0 or companynr < 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL PRIMARY NULL NULL NULL 12 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+explain select companynr,companyname from t4 left join t2 using (companynr) where companynr > 0 and companynr > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL PRIMARY NULL NULL NULL 12 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where t2.companynr > 0 or t2.companynr is null;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL NULL NULL NULL NULL 12
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where t2.companynr > 0 or t2.companynr < 0 or t4.companynr > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL PRIMARY NULL NULL NULL 12
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+explain select t2.companynr,companyname from t4 left join t2 using (companynr) where ifnull(t2.companynr,1)>0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL NULL NULL NULL NULL 12
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where
+explain select companynr,companyname from t4 left join t2 using (companynr) where companynr > 0 or companynr is null;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL PRIMARY NULL NULL NULL 12 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+explain select companynr,companyname from t4 left join t2 using (companynr) where companynr > 0 or companynr < 0 or companynr > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL PRIMARY NULL NULL NULL 12 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+explain select companynr,companyname from t4 left join t2 using (companynr) where ifnull(companynr,1)>0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 ALL NULL NULL NULL NULL 12 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+select distinct t2.companynr,t4.companynr from t2,t4 where t2.companynr=t4.companynr+1;
+companynr companynr
+37 36
+41 40
+explain select distinct t2.companynr,t4.companynr from t2,t4 where t2.companynr=t4.companynr+1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t4 index NULL PRIMARY 1 NULL 12 Using index; Using temporary
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 Using where; Using join buffer
+select t2.fld1,t2.companynr,fld3,period from t3,t2 where t2.fld1 = 38208 and t2.fld1=t3.t2nr and period = 1008 or t2.fld1 = 38008 and t2.fld1 =t3.t2nr and period = 1008;
+fld1 companynr fld3 period
+038008 37 reporters 1008
+038208 37 Selfridge 1008
+select t2.fld1,t2.companynr,fld3,period from t3,t2 where (t2.fld1 = 38208 or t2.fld1 = 38008) and t2.fld1=t3.t2nr and period>=1008 and period<=1009;
+fld1 companynr fld3 period
+038008 37 reporters 1008
+038208 37 Selfridge 1008
+select t2.fld1,t2.companynr,fld3,period from t3,t2 where (t3.t2nr = 38208 or t3.t2nr = 38008) and t2.fld1=t3.t2nr and period>=1008 and period<=1009;
+fld1 companynr fld3 period
+038008 37 reporters 1008
+038208 37 Selfridge 1008
+select period from t1 where (((period > 0) or period < 10000 or (period = 1900)) and (period=1900 and period <= 1901) or (period=1903 and (period=1903)) and period>=1902) or ((period=1904 or period=1905) or (period=1906 or period>1907)) or (period=1908 and period = 1909);
+period
+9410
+select period from t1 where ((period > 0 and period < 1) or (((period > 0 and period < 100) and (period > 10)) or (period > 10)) or (period > 0 and (period > 5 or period > 6)));
+period
+9410
+select a.fld1 from t2 as a,t2 b where ((a.fld1 = 250501 and a.fld1=b.fld1) or a.fld1=250502 or a.fld1=250503 or (a.fld1=250505 and a.fld1<=b.fld1 and b.fld1>=a.fld1)) and a.fld1=b.fld1;
+fld1
+250501
+250502
+250503
+250505
+select fld1 from t2 where fld1 in (250502,98005,98006,250503,250605,250606) and fld1 >=250502 and fld1 not in (250605,250606);
+fld1
+250502
+250503
+select fld1 from t2 where fld1 between 250502 and 250504;
+fld1
+250502
+250503
+250504
+select fld3 from t2 where (((fld3 like "_%L%" ) or (fld3 like "%ok%")) and ( fld3 like "L%" or fld3 like "G%")) and fld3 like "L%" ;
+fld3
+label
+labeled
+labeled
+landslide
+laterally
+leaflet
+lewdly
+Lillian
+luckily
+select count(*) from t1;
+count(*)
+1
+select companynr,count(*),sum(fld1) from t2 group by companynr;
+companynr count(*) sum(fld1)
+00 82 10355753
+29 95 14473298
+34 70 17788966
+36 215 22786296
+37 588 83602098
+40 37 6618386
+41 52 12816335
+50 11 1595438
+53 4 793210
+58 23 2254293
+65 10 2284055
+68 12 3097288
+select companynr,count(*) from t2 group by companynr order by companynr desc limit 5;
+companynr count(*)
+68 12
+65 10
+58 23
+53 4
+50 11
+select count(*),min(fld4),max(fld4),sum(fld1),avg(fld1),std(fld1),variance(fld1) from t2 where companynr = 34 and fld4<>"";
+count(*) min(fld4) max(fld4) sum(fld1) avg(fld1) std(fld1) variance(fld1)
+70 absentee vest 17788966 254128.0857 3272.5940 10709871.3069
+explain extended select count(*),min(fld4),max(fld4),sum(fld1),avg(fld1),std(fld1),variance(fld1) from t2 where companynr = 34 and fld4<>"";
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199 100.00 Using where
+Warnings:
+Note 1003 select count(0) AS `count(*)`,min(`test`.`t2`.`fld4`) AS `min(fld4)`,max(`test`.`t2`.`fld4`) AS `max(fld4)`,sum(`test`.`t2`.`fld1`) AS `sum(fld1)`,avg(`test`.`t2`.`fld1`) AS `avg(fld1)`,std(`test`.`t2`.`fld1`) AS `std(fld1)`,variance(`test`.`t2`.`fld1`) AS `variance(fld1)` from `test`.`t2` where ((`test`.`t2`.`companynr` = 34) and (`test`.`t2`.`fld4` <> ''))
+select companynr,count(*),min(fld4),max(fld4),sum(fld1),avg(fld1),std(fld1),variance(fld1) from t2 group by companynr limit 3;
+companynr count(*) min(fld4) max(fld4) sum(fld1) avg(fld1) std(fld1) variance(fld1)
+00 82 Anthony windmills 10355753 126289.6707 115550.9757 13352027981.7087
+29 95 abut wetness 14473298 152350.5053 8368.5480 70032594.9026
+34 70 absentee vest 17788966 254128.0857 3272.5940 10709871.3069
+select companynr,t2nr,count(price),sum(price),min(price),max(price),avg(price) from t3 where companynr = 37 group by companynr,t2nr limit 10;
+companynr t2nr count(price) sum(price) min(price) max(price) avg(price)
+37 1 1 5987435 5987435 5987435 5987435.0000
+37 2 1 28357832 28357832 28357832 28357832.0000
+37 3 1 39654943 39654943 39654943 39654943.0000
+37 11 1 5987435 5987435 5987435 5987435.0000
+37 12 1 28357832 28357832 28357832 28357832.0000
+37 13 1 39654943 39654943 39654943 39654943.0000
+37 21 1 5987435 5987435 5987435 5987435.0000
+37 22 1 28357832 28357832 28357832 28357832.0000
+37 23 1 39654943 39654943 39654943 39654943.0000
+37 31 1 5987435 5987435 5987435 5987435.0000
+select /*! SQL_SMALL_RESULT */ companynr,t2nr,count(price),sum(price),min(price),max(price),avg(price) from t3 where companynr = 37 group by companynr,t2nr limit 10;
+companynr t2nr count(price) sum(price) min(price) max(price) avg(price)
+37 1 1 5987435 5987435 5987435 5987435.0000
+37 2 1 28357832 28357832 28357832 28357832.0000
+37 3 1 39654943 39654943 39654943 39654943.0000
+37 11 1 5987435 5987435 5987435 5987435.0000
+37 12 1 28357832 28357832 28357832 28357832.0000
+37 13 1 39654943 39654943 39654943 39654943.0000
+37 21 1 5987435 5987435 5987435 5987435.0000
+37 22 1 28357832 28357832 28357832 28357832.0000
+37 23 1 39654943 39654943 39654943 39654943.0000
+37 31 1 5987435 5987435 5987435 5987435.0000
+select companynr,count(price),sum(price),min(price),max(price),avg(price) from t3 group by companynr ;
+companynr count(price) sum(price) min(price) max(price) avg(price)
+37 12543 309394878010 5987435 39654943 24666736.6667
+78 8362 414611089292 726498 98439034 49582766.0000
+101 4181 3489454238 834598 834598 834598.0000
+154 4181 4112197254950 983543950 983543950 983543950.0000
+311 4181 979599938 234298 234298 234298.0000
+447 4181 9929180954 2374834 2374834 2374834.0000
+512 4181 3288532102 786542 786542 786542.0000
+select distinct mod(companynr,10) from t4 group by companynr;
+mod(companynr,10)
+0
+9
+4
+6
+7
+1
+3
+8
+5
+select distinct 1 from t4 group by companynr;
+1
+1
+select count(distinct fld1) from t2;
+count(distinct fld1)
+1199
+select companynr,count(distinct fld1) from t2 group by companynr;
+companynr count(distinct fld1)
+00 82
+29 95
+34 70
+36 215
+37 588
+40 37
+41 52
+50 11
+53 4
+58 23
+65 10
+68 12
+select companynr,count(*) from t2 group by companynr;
+companynr count(*)
+00 82
+29 95
+34 70
+36 215
+37 588
+40 37
+41 52
+50 11
+53 4
+58 23
+65 10
+68 12
+select companynr,count(distinct concat(fld1,repeat(65,1000))) from t2 group by companynr;
+companynr count(distinct concat(fld1,repeat(65,1000)))
+00 82
+29 95
+34 70
+36 215
+37 588
+40 37
+41 52
+50 11
+53 4
+58 23
+65 10
+68 12
+select companynr,count(distinct concat(fld1,repeat(65,200))) from t2 group by companynr;
+companynr count(distinct concat(fld1,repeat(65,200)))
+00 82
+29 95
+34 70
+36 215
+37 588
+40 37
+41 52
+50 11
+53 4
+58 23
+65 10
+68 12
+select companynr,count(distinct floor(fld1/100)) from t2 group by companynr;
+companynr count(distinct floor(fld1/100))
+00 47
+29 35
+34 14
+36 69
+37 108
+40 16
+41 11
+50 9
+53 1
+58 1
+65 1
+68 1
+select companynr,count(distinct concat(repeat(65,1000),floor(fld1/100))) from t2 group by companynr;
+companynr count(distinct concat(repeat(65,1000),floor(fld1/100)))
+00 47
+29 35
+34 14
+36 69
+37 108
+40 16
+41 11
+50 9
+53 1
+58 1
+65 1
+68 1
+select sum(fld1),fld3 from t2 where fld3="Romans" group by fld1 limit 10;
+sum(fld1) fld3
+11402 Romans
+select name,count(*) from t3 where name='cloakroom' group by name;
+name count(*)
+cloakroom 4181
+select name,count(*) from t3 where name='cloakroom' and price>10 group by name;
+name count(*)
+cloakroom 4181
+select count(*) from t3 where name='cloakroom' and price2=823742;
+count(*)
+4181
+select name,count(*) from t3 where name='cloakroom' and price2=823742 group by name;
+name count(*)
+cloakroom 4181
+select name,count(*) from t3 where name >= "extramarital" and price <= 39654943 group by name;
+name count(*)
+extramarital 4181
+gazer 4181
+gems 4181
+Iranizes 4181
+spates 4181
+tucked 4181
+violinist 4181
+select t2.fld3,count(*) from t2,t3 where t2.fld1=158402 and t3.name=t2.fld3 group by t3.name;
+fld3 count(*)
+spates 4181
+select companynr|0,companyname from t4 group by 1;
+companynr|0 companyname
+0 Unknown
+29 company 1
+34 company 2
+36 company 3
+37 company 4
+40 company 5
+41 company 6
+50 company 11
+53 company 7
+58 company 8
+65 company 9
+68 company 10
+select t2.companynr,companyname,count(*) from t2,t4 where t2.companynr=t4.companynr group by t2.companynr order by companyname;
+companynr companyname count(*)
+29 company 1 95
+68 company 10 12
+50 company 11 11
+34 company 2 70
+36 company 3 215
+37 company 4 588
+40 company 5 37
+41 company 6 52
+53 company 7 4
+58 company 8 23
+65 company 9 10
+00 Unknown 82
+select t2.fld1,count(*) from t2,t3 where t2.fld1=158402 and t3.name=t2.fld3 group by t3.name;
+fld1 count(*)
+158402 4181
+select sum(Period)/count(*) from t1;
+sum(Period)/count(*)
+9410.0000
+select companynr,count(price) as "count",sum(price) as "sum" ,abs(sum(price)/count(price)-avg(price)) as "diff",(0+count(price))*companynr as func from t3 group by companynr;
+companynr count sum diff func
+37 12543 309394878010 0.0000 464091
+78 8362 414611089292 0.0000 652236
+101 4181 3489454238 0.0000 422281
+154 4181 4112197254950 0.0000 643874
+311 4181 979599938 0.0000 1300291
+447 4181 9929180954 0.0000 1868907
+512 4181 3288532102 0.0000 2140672
+select companynr,sum(price)/count(price) as avg from t3 group by companynr having avg > 70000000 order by avg;
+companynr avg
+154 983543950.0000
+select companynr,count(*) from t2 group by companynr order by 2 desc;
+companynr count(*)
+37 588
+36 215
+29 95
+00 82
+34 70
+41 52
+40 37
+58 23
+68 12
+50 11
+65 10
+53 4
+select companynr,count(*) from t2 where companynr > 40 group by companynr order by 2 desc;
+companynr count(*)
+41 52
+58 23
+68 12
+50 11
+65 10
+53 4
+select t2.fld4,t2.fld1,count(price),sum(price),min(price),max(price),avg(price) from t3,t2 where t3.companynr = 37 and t2.fld1 = t3.t2nr group by fld1,t2.fld4;
+fld4 fld1 count(price) sum(price) min(price) max(price) avg(price)
+teethe 000001 1 5987435 5987435 5987435 5987435.0000
+dreaded 011401 1 5987435 5987435 5987435 5987435.0000
+scholastics 011402 1 28357832 28357832 28357832 28357832.0000
+audiology 011403 1 39654943 39654943 39654943 39654943.0000
+wallet 011501 1 5987435 5987435 5987435 5987435.0000
+parters 011701 1 5987435 5987435 5987435 5987435.0000
+eschew 011702 1 28357832 28357832 28357832 28357832.0000
+quitter 011703 1 39654943 39654943 39654943 39654943.0000
+neat 012001 1 5987435 5987435 5987435 5987435.0000
+Steinberg 012003 1 39654943 39654943 39654943 39654943.0000
+balled 012301 1 5987435 5987435 5987435 5987435.0000
+persist 012302 1 28357832 28357832 28357832 28357832.0000
+attainments 012303 1 39654943 39654943 39654943 39654943.0000
+capably 012501 1 5987435 5987435 5987435 5987435.0000
+impulsive 012602 1 28357832 28357832 28357832 28357832.0000
+starlet 012603 1 39654943 39654943 39654943 39654943.0000
+featherweight 012701 1 5987435 5987435 5987435 5987435.0000
+pessimist 012702 1 28357832 28357832 28357832 28357832.0000
+daughter 012703 1 39654943 39654943 39654943 39654943.0000
+lawgiver 013601 1 5987435 5987435 5987435 5987435.0000
+stated 013602 1 28357832 28357832 28357832 28357832.0000
+readable 013603 1 39654943 39654943 39654943 39654943.0000
+testicle 013801 1 5987435 5987435 5987435 5987435.0000
+Parsifal 013802 1 28357832 28357832 28357832 28357832.0000
+leavings 013803 1 39654943 39654943 39654943 39654943.0000
+squeaking 013901 1 5987435 5987435 5987435 5987435.0000
+contrasted 016001 1 5987435 5987435 5987435 5987435.0000
+leftover 016201 1 5987435 5987435 5987435 5987435.0000
+whiteners 016202 1 28357832 28357832 28357832 28357832.0000
+erases 016301 1 5987435 5987435 5987435 5987435.0000
+Punjab 016302 1 28357832 28357832 28357832 28357832.0000
+Merritt 016303 1 39654943 39654943 39654943 39654943.0000
+sweetish 018001 1 5987435 5987435 5987435 5987435.0000
+dogging 018002 1 28357832 28357832 28357832 28357832.0000
+scornfully 018003 1 39654943 39654943 39654943 39654943.0000
+fetters 018012 1 28357832 28357832 28357832 28357832.0000
+bivalves 018013 1 39654943 39654943 39654943 39654943.0000
+skulking 018021 1 5987435 5987435 5987435 5987435.0000
+flint 018022 1 28357832 28357832 28357832 28357832.0000
+flopping 018023 1 39654943 39654943 39654943 39654943.0000
+Judas 018032 1 28357832 28357832 28357832 28357832.0000
+vacuuming 018033 1 39654943 39654943 39654943 39654943.0000
+medical 018041 1 5987435 5987435 5987435 5987435.0000
+bloodbath 018042 1 28357832 28357832 28357832 28357832.0000
+subschema 018043 1 39654943 39654943 39654943 39654943.0000
+interdependent 018051 1 5987435 5987435 5987435 5987435.0000
+Graves 018052 1 28357832 28357832 28357832 28357832.0000
+neonatal 018053 1 39654943 39654943 39654943 39654943.0000
+sorters 018061 1 5987435 5987435 5987435 5987435.0000
+epistle 018062 1 28357832 28357832 28357832 28357832.0000
+Conley 018101 1 5987435 5987435 5987435 5987435.0000
+lectured 018102 1 28357832 28357832 28357832 28357832.0000
+Abraham 018103 1 39654943 39654943 39654943 39654943.0000
+cage 018201 1 5987435 5987435 5987435 5987435.0000
+hushes 018202 1 28357832 28357832 28357832 28357832.0000
+Simla 018402 1 28357832 28357832 28357832 28357832.0000
+reporters 018403 1 39654943 39654943 39654943 39654943.0000
+coexist 018601 1 5987435 5987435 5987435 5987435.0000
+Beebe 018602 1 28357832 28357832 28357832 28357832.0000
+Taoism 018603 1 39654943 39654943 39654943 39654943.0000
+Connally 018801 1 5987435 5987435 5987435 5987435.0000
+fetched 018802 1 28357832 28357832 28357832 28357832.0000
+checkpoints 018803 1 39654943 39654943 39654943 39654943.0000
+gritty 018811 1 5987435 5987435 5987435 5987435.0000
+firearm 018812 1 28357832 28357832 28357832 28357832.0000
+minima 019101 1 5987435 5987435 5987435 5987435.0000
+Selfridge 019102 1 28357832 28357832 28357832 28357832.0000
+disable 019103 1 39654943 39654943 39654943 39654943.0000
+witchcraft 019201 1 5987435 5987435 5987435 5987435.0000
+betroth 030501 1 5987435 5987435 5987435 5987435.0000
+Manhattanize 030502 1 28357832 28357832 28357832 28357832.0000
+imprint 030503 1 39654943 39654943 39654943 39654943.0000
+swelling 031901 1 5987435 5987435 5987435 5987435.0000
+interrelationships 036001 1 5987435 5987435 5987435 5987435.0000
+riser 036002 1 28357832 28357832 28357832 28357832.0000
+bee 038001 1 5987435 5987435 5987435 5987435.0000
+kanji 038002 1 28357832 28357832 28357832 28357832.0000
+dental 038003 1 39654943 39654943 39654943 39654943.0000
+railway 038011 1 5987435 5987435 5987435 5987435.0000
+validate 038012 1 28357832 28357832 28357832 28357832.0000
+normalizes 038013 1 39654943 39654943 39654943 39654943.0000
+Kline 038101 1 5987435 5987435 5987435 5987435.0000
+Anatole 038102 1 28357832 28357832 28357832 28357832.0000
+partridges 038103 1 39654943 39654943 39654943 39654943.0000
+recruited 038201 1 5987435 5987435 5987435 5987435.0000
+dimensions 038202 1 28357832 28357832 28357832 28357832.0000
+Chicana 038203 1 39654943 39654943 39654943 39654943.0000
+select t3.companynr,fld3,sum(price) from t3,t2 where t2.fld1 = t3.t2nr and t3.companynr = 512 group by companynr,fld3;
+companynr fld3 sum(price)
+512 boat 786542
+512 capably 786542
+512 cupboard 786542
+512 decliner 786542
+512 descendants 786542
+512 dopers 786542
+512 erases 786542
+512 Micronesia 786542
+512 Miles 786542
+512 skies 786542
+select t2.companynr,count(*),min(fld3),max(fld3),sum(price),avg(price) from t2,t3 where t3.companynr >= 30 and t3.companynr <= 58 and t3.t2nr = t2.fld1 and 1+1=2 group by t2.companynr;
+companynr count(*) min(fld3) max(fld3) sum(price) avg(price)
+00 1 Omaha Omaha 5987435 5987435.0000
+36 1 dubbed dubbed 28357832 28357832.0000
+37 83 Abraham Wotan 1908978016 22999735.1325
+50 2 scribbled tapestry 68012775 34006387.5000
+select t3.companynr+0,t3.t2nr,fld3,sum(price) from t3,t2 where t2.fld1 = t3.t2nr and t3.companynr = 37 group by 1,t3.t2nr,fld3,fld3,fld3,fld3,fld3 order by fld1;
+t3.companynr+0 t2nr fld3 sum(price)
+37 1 Omaha 5987435
+37 11401 breaking 5987435
+37 11402 Romans 28357832
+37 11403 intercepted 39654943
+37 11501 bewilderingly 5987435
+37 11701 astound 5987435
+37 11702 admonishing 28357832
+37 11703 sumac 39654943
+37 12001 flanking 5987435
+37 12003 combed 39654943
+37 12301 Eulerian 5987435
+37 12302 dubbed 28357832
+37 12303 Kane 39654943
+37 12501 annihilates 5987435
+37 12602 Wotan 28357832
+37 12603 snatching 39654943
+37 12701 grazing 5987435
+37 12702 Baird 28357832
+37 12703 celery 39654943
+37 13601 handgun 5987435
+37 13602 foldout 28357832
+37 13603 mystic 39654943
+37 13801 intelligibility 5987435
+37 13802 Augustine 28357832
+37 13803 teethe 39654943
+37 13901 scholastics 5987435
+37 16001 audiology 5987435
+37 16201 wallet 5987435
+37 16202 parters 28357832
+37 16301 eschew 5987435
+37 16302 quitter 28357832
+37 16303 neat 39654943
+37 18001 jarring 5987435
+37 18002 tinily 28357832
+37 18003 balled 39654943
+37 18012 impulsive 28357832
+37 18013 starlet 39654943
+37 18021 lawgiver 5987435
+37 18022 stated 28357832
+37 18023 readable 39654943
+37 18032 testicle 28357832
+37 18033 Parsifal 39654943
+37 18041 Punjab 5987435
+37 18042 Merritt 28357832
+37 18043 Quixotism 39654943
+37 18051 sureties 5987435
+37 18052 puddings 28357832
+37 18053 tapestry 39654943
+37 18061 trimmings 5987435
+37 18062 humility 28357832
+37 18101 tragedies 5987435
+37 18102 skulking 28357832
+37 18103 flint 39654943
+37 18201 relaxing 5987435
+37 18202 offload 28357832
+37 18402 suites 28357832
+37 18403 lists 39654943
+37 18601 vacuuming 5987435
+37 18602 dentally 28357832
+37 18603 humanness 39654943
+37 18801 inch 5987435
+37 18802 Weissmuller 28357832
+37 18803 irresponsibly 39654943
+37 18811 repetitions 5987435
+37 18812 Antares 28357832
+37 19101 ventilate 5987435
+37 19102 pityingly 28357832
+37 19103 interdependent 39654943
+37 19201 Graves 5987435
+37 30501 neonatal 5987435
+37 30502 scribbled 28357832
+37 30503 chafe 39654943
+37 31901 realtor 5987435
+37 36001 elite 5987435
+37 36002 funereal 28357832
+37 38001 Conley 5987435
+37 38002 lectured 28357832
+37 38003 Abraham 39654943
+37 38011 groupings 5987435
+37 38012 dissociate 28357832
+37 38013 coexist 39654943
+37 38101 rusting 5987435
+37 38102 galling 28357832
+37 38103 obliterates 39654943
+37 38201 resumes 5987435
+37 38202 analyzable 28357832
+37 38203 terminator 39654943
+select sum(price) from t3,t2 where t2.fld1 = t3.t2nr and t3.companynr = 512 and t3.t2nr = 38008 and t2.fld1 = 38008 or t2.fld1= t3.t2nr and t3.t2nr = 38008 and t2.fld1 = 38008;
+sum(price)
+234298
+select t2.fld1,sum(price) from t3,t2 where t2.fld1 = t3.t2nr and t3.companynr = 512 and t3.t2nr = 38008 and t2.fld1 = 38008 or t2.fld1 = t3.t2nr and t3.t2nr = 38008 and t2.fld1 = 38008 or t3.t2nr = t2.fld1 and t2.fld1 = 38008 group by t2.fld1;
+fld1 sum(price)
+038008 234298
+explain select fld3 from t2 where 1>2 or 2>3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
+explain select fld3 from t2 where fld1=fld1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1199
+select companynr,fld1 from t2 HAVING fld1=250501 or fld1=250502;
+companynr fld1
+34 250501
+34 250502
+select companynr,fld1 from t2 WHERE fld1>=250501 HAVING fld1<=250502;
+companynr fld1
+34 250501
+34 250502
+select companynr,count(*) as count,sum(fld1) as sum from t2 group by companynr having count > 40 and sum/count >= 120000;
+companynr count sum
+00 82 10355753
+29 95 14473298
+34 70 17788966
+37 588 83602098
+41 52 12816335
+select companynr from t2 group by companynr having count(*) > 40 and sum(fld1)/count(*) >= 120000 ;
+companynr
+00
+29
+34
+37
+41
+select t2.companynr,companyname,count(*) from t2,t4 where t2.companynr=t4.companynr group by companyname having t2.companynr >= 40;
+companynr companyname count(*)
+68 company 10 12
+50 company 11 11
+40 company 5 37
+41 company 6 52
+53 company 7 4
+58 company 8 23
+65 company 9 10
+select count(*) from t2;
+count(*)
+1199
+select count(*) from t2 where fld1 < 098024;
+count(*)
+387
+select min(fld1) from t2 where fld1>= 098024;
+min(fld1)
+98024
+select max(fld1) from t2 where fld1>= 098024;
+max(fld1)
+1232609
+select count(*) from t3 where price2=76234234;
+count(*)
+4181
+select count(*) from t3 where companynr=512 and price2=76234234;
+count(*)
+4181
+explain select min(fld1),max(fld1),count(*) from t2;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+select min(fld1),max(fld1),count(*) from t2;
+min(fld1) max(fld1) count(*)
+0 1232609 1199
+select min(t2nr),max(t2nr) from t3 where t2nr=2115 and price2=823742;
+min(t2nr) max(t2nr)
+2115 2115
+select count(*),min(t2nr),max(t2nr) from t3 where name='spates' and companynr=78;
+count(*) min(t2nr) max(t2nr)
+4181 4 41804
+select t2nr,count(*) from t3 where name='gems' group by t2nr limit 20;
+t2nr count(*)
+9 1
+19 1
+29 1
+39 1
+49 1
+59 1
+69 1
+79 1
+89 1
+99 1
+109 1
+119 1
+129 1
+139 1
+149 1
+159 1
+169 1
+179 1
+189 1
+199 1
+select max(t2nr) from t3 where price=983543950;
+max(t2nr)
+41807
+select t1.period from t3 = t1 limit 1;
+period
+1001
+select t1.period from t1 as t1 limit 1;
+period
+9410
+select t1.period as "Nuvarande period" from t1 as t1 limit 1;
+Nuvarande period
+9410
+select period as ok_period from t1 limit 1;
+ok_period
+9410
+select period as ok_period from t1 group by ok_period limit 1;
+ok_period
+9410
+select 1+1 as summa from t1 group by summa limit 1;
+summa
+2
+select period as "Nuvarande period" from t1 group by "Nuvarande period" limit 1;
+Nuvarande period
+9410
+show tables;
+Tables_in_test
+t1
+t2
+t3
+t4
+show tables from test like "s%";
+Tables_in_test (s%)
+show tables from test like "t?";
+Tables_in_test (t?)
+show full columns from t2;
+Field Type Collation Null Key Default Extra Privileges Comment
+auto int(11) NULL NO PRI NULL auto_increment #
+fld1 int(6) unsigned zerofill NULL NO UNI 000000 #
+companynr tinyint(2) unsigned zerofill NULL NO 00 #
+fld3 char(30) latin1_swedish_ci NO MUL #
+fld4 char(35) latin1_swedish_ci NO #
+fld5 char(35) latin1_swedish_ci NO #
+fld6 char(4) latin1_swedish_ci NO #
+show full columns from t2 from test like 'f%';
+Field Type Collation Null Key Default Extra Privileges Comment
+fld1 int(6) unsigned zerofill NULL NO UNI 000000 #
+fld3 char(30) latin1_swedish_ci NO MUL #
+fld4 char(35) latin1_swedish_ci NO #
+fld5 char(35) latin1_swedish_ci NO #
+fld6 char(4) latin1_swedish_ci NO #
+show full columns from t2 from test like 's%';
+Field Type Collation Null Key Default Extra Privileges Comment
+show keys from t2;
+Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment
+t2 0 PRIMARY 1 auto A 1199 NULL NULL BTREE
+t2 0 fld1 1 fld1 A 1199 NULL NULL BTREE
+t2 1 fld3 1 fld3 A NULL NULL NULL BTREE
+drop table t4, t3, t2, t1;
+DO 1;
+DO benchmark(100,1+1),1,1;
+do default;
+ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '' at line 1
+do foobar;
+ERROR 42S22: Unknown column 'foobar' in 'field list'
+CREATE TABLE t1 (
+id mediumint(8) unsigned NOT NULL auto_increment,
+pseudo varchar(35) NOT NULL default '',
+PRIMARY KEY (id),
+UNIQUE KEY pseudo (pseudo)
+);
+INSERT INTO t1 (pseudo) VALUES ('test');
+INSERT INTO t1 (pseudo) VALUES ('test1');
+SELECT 1 as rnd1 from t1 where rand() > 2;
+rnd1
+DROP TABLE t1;
+CREATE TABLE t1 (gvid int(10) unsigned default NULL, hmid int(10) unsigned default NULL, volid int(10) unsigned default NULL, mmid int(10) unsigned default NULL, hdid int(10) unsigned default NULL, fsid int(10) unsigned default NULL, ctid int(10) unsigned default NULL, dtid int(10) unsigned default NULL, cost int(10) unsigned default NULL, performance int(10) unsigned default NULL, serialnumber bigint(20) unsigned default NULL, monitored tinyint(3) unsigned default '1', removed tinyint(3) unsigned default '0', target tinyint(3) unsigned default '0', dt_modified timestamp NOT NULL, name varchar(255) binary default NULL, description varchar(255) default NULL, UNIQUE KEY hmid (hmid,volid)) ENGINE=MyISAM;
+INSERT INTO t1 VALUES (200001,2,1,1,100,1,1,1,0,0,0,1,0,1,20020425060057,'\\\\ARKIVIO-TESTPDC\\E$',''),(200002,2,2,1,101,1,1,1,0,0,0,1,0,1,20020425060057,'\\\\ARKIVIO-TESTPDC\\C$',''),(200003,1,3,2,NULL,NULL,NULL,NULL,NULL,NULL,NULL,1,0,1,20020425060427,'c:',NULL);
+CREATE TABLE t2 ( hmid int(10) unsigned default NULL, volid int(10) unsigned default NULL, sampletid smallint(5) unsigned default NULL, sampletime datetime default NULL, samplevalue bigint(20) unsigned default NULL, KEY idx1 (hmid,volid,sampletid,sampletime)) ENGINE=MyISAM;
+INSERT INTO t2 VALUES (1,3,10,'2002-06-01 08:00:00',35),(1,3,1010,'2002-06-01 12:00:01',35);
+SELECT a.gvid, (SUM(CASE b.sampletid WHEN 140 THEN b.samplevalue ELSE 0 END)) as the_success,(SUM(CASE b.sampletid WHEN 141 THEN b.samplevalue ELSE 0 END)) as the_fail,(SUM(CASE b.sampletid WHEN 142 THEN b.samplevalue ELSE 0 END)) as the_size,(SUM(CASE b.sampletid WHEN 143 THEN b.samplevalue ELSE 0 END)) as the_time FROM t1 a, t2 b WHERE a.hmid = b.hmid AND a.volid = b.volid AND b.sampletime >= 'wrong-date-value' AND b.sampletime < 'wrong-date-value' AND b.sampletid IN (140, 141, 142, 143) GROUP BY a.gvid;
+gvid the_success the_fail the_size the_time
+Warnings:
+Warning 1292 Incorrect datetime value: 'wrong-date-value' for column 'sampletime' at row 1
+Warning 1292 Incorrect datetime value: 'wrong-date-value' for column 'sampletime' at row 1
+SELECT a.gvid, (SUM(CASE b.sampletid WHEN 140 THEN b.samplevalue ELSE 0 END)) as the_success,(SUM(CASE b.sampletid WHEN 141 THEN b.samplevalue ELSE 0 END)) as the_fail,(SUM(CASE b.sampletid WHEN 142 THEN b.samplevalue ELSE 0 END)) as the_size,(SUM(CASE b.sampletid WHEN 143 THEN b.samplevalue ELSE 0 END)) as the_time FROM t1 a, t2 b WHERE a.hmid = b.hmid AND a.volid = b.volid AND b.sampletime >= NULL AND b.sampletime < NULL AND b.sampletid IN (140, 141, 142, 143) GROUP BY a.gvid;
+gvid the_success the_fail the_size the_time
+DROP TABLE t1,t2;
+create table t1 ( A_Id bigint(20) NOT NULL default '0', A_UpdateBy char(10) NOT NULL default '', A_UpdateDate bigint(20) NOT NULL default '0', A_UpdateSerial int(11) NOT NULL default '0', other_types bigint(20) NOT NULL default '0', wss_type bigint(20) NOT NULL default '0');
+INSERT INTO t1 VALUES (102935998719055004,'brade',1029359987,2,102935229116544068,102935229216544093);
+select wss_type from t1 where wss_type ='102935229216544106';
+wss_type
+select wss_type from t1 where wss_type ='102935229216544105';
+wss_type
+select wss_type from t1 where wss_type ='102935229216544104';
+wss_type
+select wss_type from t1 where wss_type ='102935229216544093';
+wss_type
+102935229216544093
+select wss_type from t1 where wss_type =102935229216544093;
+wss_type
+102935229216544093
+drop table t1;
+select 1+2,"aaaa",3.13*2.0 into @a,@b,@c;
+select @a;
+@a
+3
+select @b;
+@b
+aaaa
+select @c;
+@c
+6.260
+create table t1 (a int not null auto_increment primary key);
+insert into t1 values ();
+insert into t1 values ();
+insert into t1 values ();
+select * from (t1 as t2 left join t1 as t3 using (a)), t1;
+a a
+1 1
+2 1
+3 1
+1 2
+2 2
+3 2
+1 3
+2 3
+3 3
+select * from t1, (t1 as t2 left join t1 as t3 using (a));
+a a
+1 1
+2 1
+3 1
+1 2
+2 2
+3 2
+1 3
+2 3
+3 3
+select * from (t1 as t2 left join t1 as t3 using (a)) straight_join t1;
+a a
+1 1
+2 1
+3 1
+1 2
+2 2
+3 2
+1 3
+2 3
+3 3
+select * from t1 straight_join (t1 as t2 left join t1 as t3 using (a));
+a a
+1 1
+2 1
+3 1
+1 2
+2 2
+3 2
+1 3
+2 3
+3 3
+select * from (t1 as t2 left join t1 as t3 using (a)) inner join t1 on t1.a>1;
+a a
+1 2
+2 2
+3 2
+1 3
+2 3
+3 3
+select * from t1 inner join (t1 as t2 left join t1 as t3 using (a)) on t1.a>1;
+a a
+2 1
+3 1
+2 2
+3 2
+2 3
+3 3
+select * from (t1 as t2 left join t1 as t3 using (a)) inner join t1 using ( a );
+a
+1
+2
+3
+select * from t1 inner join (t1 as t2 left join t1 as t3 using (a)) using ( a );
+a
+1
+2
+3
+select * from (t1 as t2 left join t1 as t3 using (a)) left outer join t1 on t1.a>1;
+a a
+1 2
+1 3
+2 2
+2 3
+3 2
+3 3
+select * from t1 left outer join (t1 as t2 left join t1 as t3 using (a)) on t1.a>1;
+a a
+1 NULL
+2 1
+2 2
+2 3
+3 1
+3 2
+3 3
+select * from (t1 as t2 left join t1 as t3 using (a)) left join t1 using ( a );
+a
+1
+2
+3
+select * from t1 left join (t1 as t2 left join t1 as t3 using (a)) using ( a );
+a
+1
+2
+3
+select * from (t1 as t2 left join t1 as t3 using (a)) natural left join t1;
+a
+1
+2
+3
+select * from t1 natural left join (t1 as t2 left join t1 as t3 using (a));
+a
+1
+2
+3
+select * from (t1 as t2 left join t1 as t3 using (a)) right join t1 on t1.a>1;
+a a
+NULL 1
+1 2
+2 2
+3 2
+1 3
+2 3
+3 3
+select * from t1 right join (t1 as t2 left join t1 as t3 using (a)) on t1.a>1;
+a a
+2 1
+3 1
+2 2
+3 2
+2 3
+3 3
+select * from (t1 as t2 left join t1 as t3 using (a)) right outer join t1 using ( a );
+a
+1
+2
+3
+select * from t1 right outer join (t1 as t2 left join t1 as t3 using (a)) using ( a );
+a
+1
+2
+3
+select * from (t1 as t2 left join t1 as t3 using (a)) natural right join t1;
+a
+1
+2
+3
+select * from t1 natural right join (t1 as t2 left join t1 as t3 using (a));
+a
+1
+2
+3
+select * from t1 natural join (t1 as t2 left join t1 as t3 using (a));
+a
+1
+2
+3
+select * from (t1 as t2 left join t1 as t3 using (a)) natural join t1;
+a
+1
+2
+3
+drop table t1;
+CREATE TABLE t1 ( aa char(2), id int(11) NOT NULL auto_increment, t2_id int(11) NOT NULL default '0', PRIMARY KEY (id), KEY replace_id (t2_id)) ENGINE=MyISAM;
+INSERT INTO t1 VALUES ("1",8264,2506),("2",8299,2517),("3",8301,2518),("4",8302,2519),("5",8303,2520),("6",8304,2521),("7",8305,2522);
+CREATE TABLE t2 ( id int(11) NOT NULL auto_increment, PRIMARY KEY (id)) ENGINE=MyISAM;
+INSERT INTO t2 VALUES (2517), (2518), (2519), (2520), (2521), (2522);
+select * from t1, t2 WHERE t1.t2_id = t2.id and t1.t2_id > 0 order by t1.id LIMIT 0, 5;
+aa id t2_id id
+2 8299 2517 2517
+3 8301 2518 2518
+4 8302 2519 2519
+5 8303 2520 2520
+6 8304 2521 2521
+drop table t1,t2;
+create table t1 (id1 int NOT NULL);
+create table t2 (id2 int NOT NULL);
+create table t3 (id3 int NOT NULL);
+create table t4 (id4 int NOT NULL, id44 int NOT NULL, KEY (id4));
+insert into t1 values (1);
+insert into t1 values (2);
+insert into t2 values (1);
+insert into t4 values (1,1);
+explain select * from t1 left join t2 on id1 = id2 left join t3 on id1 = id3
+left join t4 on id3 = id4 where id2 = 1 or id4 = 1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t3 system NULL NULL NULL NULL 0 const row not found
+1 SIMPLE t4 const id4 NULL NULL NULL 1
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1 Using where
+select * from t1 left join t2 on id1 = id2 left join t3 on id1 = id3
+left join t4 on id3 = id4 where id2 = 1 or id4 = 1;
+id1 id2 id3 id4 id44
+1 1 NULL NULL NULL
+drop table t1,t2,t3,t4;
+create table t1(s varchar(10) not null);
+create table t2(s varchar(10) not null primary key);
+create table t3(s varchar(10) not null primary key);
+insert into t1 values ('one\t'), ('two\t');
+insert into t2 values ('one\r'), ('two\t');
+insert into t3 values ('one '), ('two\t');
+select * from t1 where s = 'one';
+s
+select * from t2 where s = 'one';
+s
+select * from t3 where s = 'one';
+s
+one
+select * from t1,t2 where t1.s = t2.s;
+s s
+two two
+select * from t2,t3 where t2.s = t3.s;
+s s
+two two
+drop table t1, t2, t3;
+create table t1 (a integer, b integer, index(a), index(b));
+create table t2 (c integer, d integer, index(c), index(d));
+insert into t1 values (1,2), (2,2), (3,2), (4,2);
+insert into t2 values (1,3), (2,3), (3,4), (4,4);
+explain select * from t1 left join t2 on a=c where d in (4);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref c,d d 5 const 2 Using where
+1 SIMPLE t1 ALL a NULL NULL NULL 4 Using where; Using join buffer
+select * from t1 left join t2 on a=c where d in (4);
+a b c d
+3 2 3 4
+4 2 4 4
+explain select * from t1 left join t2 on a=c where d = 4;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref c,d d 5 const 2 Using where
+1 SIMPLE t1 ALL a NULL NULL NULL 4 Using where; Using join buffer
+select * from t1 left join t2 on a=c where d = 4;
+a b c d
+3 2 3 4
+4 2 4 4
+drop table t1, t2;
+CREATE TABLE t1 (
+i int(11) NOT NULL default '0',
+c char(10) NOT NULL default '',
+PRIMARY KEY (i),
+UNIQUE KEY c (c)
+) ENGINE=MyISAM;
+INSERT INTO t1 VALUES (1,'a');
+INSERT INTO t1 VALUES (2,'b');
+INSERT INTO t1 VALUES (3,'c');
+EXPLAIN SELECT i FROM t1 WHERE i=1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1 Using index
+DROP TABLE t1;
+CREATE TABLE t1 ( a BLOB, INDEX (a(20)) );
+CREATE TABLE t2 ( a BLOB, INDEX (a(20)) );
+INSERT INTO t1 VALUES ('one'),('two'),('three'),('four'),('five');
+INSERT INTO t2 VALUES ('one'),('two'),('three'),('four'),('five');
+EXPLAIN SELECT * FROM t1 LEFT JOIN t2 USE INDEX (a) ON t1.a=t2.a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5
+1 SIMPLE t2 ref a a 23 test.t1.a 2
+EXPLAIN SELECT * FROM t1 LEFT JOIN t2 FORCE INDEX (a) ON t1.a=t2.a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5
+1 SIMPLE t2 ref a a 23 test.t1.a 2
+DROP TABLE t1, t2;
+CREATE TABLE t1 ( city char(30) );
+INSERT INTO t1 VALUES ('London');
+INSERT INTO t1 VALUES ('Paris');
+SELECT * FROM t1 WHERE city='London';
+city
+London
+SELECT * FROM t1 WHERE city='london';
+city
+London
+EXPLAIN SELECT * FROM t1 WHERE city='London' AND city='london';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 Using where
+SELECT * FROM t1 WHERE city='London' AND city='london';
+city
+London
+EXPLAIN SELECT * FROM t1 WHERE city LIKE '%london%' AND city='London';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 Using where
+SELECT * FROM t1 WHERE city LIKE '%london%' AND city='London';
+city
+London
+DROP TABLE t1;
+create table t1 (a int(11) unsigned, b int(11) unsigned);
+insert into t1 values (1,0), (1,1), (1,2);
+select a-b from t1 order by 1;
+a-b
+0
+1
+18446744073709551615
+select a-b , (a-b < 0) from t1 order by 1;
+a-b (a-b < 0)
+0 0
+1 0
+18446744073709551615 0
+select a-b as d, (a-b >= 0), b from t1 group by b having d >= 0;
+d (a-b >= 0) b
+1 1 0
+0 1 1
+18446744073709551615 1 2
+select cast((a - b) as unsigned) from t1 order by 1;
+cast((a - b) as unsigned)
+0
+1
+18446744073709551615
+drop table t1;
+create table t1 (a int(11));
+select all all * from t1;
+a
+select distinct distinct * from t1;
+a
+select all distinct * from t1;
+ERROR HY000: Incorrect usage of ALL and DISTINCT
+select distinct all * from t1;
+ERROR HY000: Incorrect usage of ALL and DISTINCT
+drop table t1;
+CREATE TABLE t1 (
+kunde_intern_id int(10) unsigned NOT NULL default '0',
+kunde_id int(10) unsigned NOT NULL default '0',
+FK_firma_id int(10) unsigned NOT NULL default '0',
+aktuell enum('Ja','Nein') NOT NULL default 'Ja',
+vorname varchar(128) NOT NULL default '',
+nachname varchar(128) NOT NULL default '',
+geloescht enum('Ja','Nein') NOT NULL default 'Nein',
+firma varchar(128) NOT NULL default ''
+);
+INSERT INTO t1 VALUES
+(3964,3051,1,'Ja','Vorname1','1Nachname','Nein','Print Schau XXXX'),
+(3965,3051111,1,'Ja','Vorname1111','1111Nachname','Nein','Print Schau XXXX');
+SELECT kunde_id ,FK_firma_id ,aktuell, vorname, nachname, geloescht FROM t1
+WHERE
+(
+(
+( '' != '' AND firma LIKE CONCAT('%', '', '%'))
+OR
+(vorname LIKE CONCAT('%', 'Vorname1', '%') AND
+nachname LIKE CONCAT('%', '1Nachname', '%') AND
+'Vorname1' != '' AND 'xxxx' != '')
+)
+AND
+(
+aktuell = 'Ja' AND geloescht = 'Nein' AND FK_firma_id = 2
+)
+)
+;
+kunde_id FK_firma_id aktuell vorname nachname geloescht
+SELECT kunde_id ,FK_firma_id ,aktuell, vorname, nachname,
+geloescht FROM t1
+WHERE
+(
+(
+aktuell = 'Ja' AND geloescht = 'Nein' AND FK_firma_id = 2
+)
+AND
+(
+( '' != '' AND firma LIKE CONCAT('%', '', '%') )
+OR
+( vorname LIKE CONCAT('%', 'Vorname1', '%') AND
+nachname LIKE CONCAT('%', '1Nachname', '%') AND 'Vorname1' != '' AND
+'xxxx' != '')
+)
+)
+;
+kunde_id FK_firma_id aktuell vorname nachname geloescht
+SELECT COUNT(*) FROM t1 WHERE
+( 0 OR (vorname LIKE '%Vorname1%' AND nachname LIKE '%1Nachname%' AND 1))
+AND FK_firma_id = 2;
+COUNT(*)
+0
+drop table t1;
+CREATE TABLE t1 (b BIGINT(20) UNSIGNED NOT NULL, PRIMARY KEY (b));
+INSERT INTO t1 VALUES (0x8000000000000000);
+SELECT b FROM t1 WHERE b=0x8000000000000000;
+b
+9223372036854775808
+DROP TABLE t1;
+CREATE TABLE `t1` ( `gid` int(11) default NULL, `uid` int(11) default NULL);
+CREATE TABLE `t2` ( `ident` int(11) default NULL, `level` char(16) default NULL);
+INSERT INTO `t2` VALUES (0,'READ');
+CREATE TABLE `t3` ( `id` int(11) default NULL, `name` char(16) default NULL);
+INSERT INTO `t3` VALUES (1,'fs');
+select * from t3 left join t1 on t3.id = t1.uid, t2 where t2.ident in (0, t1.gid, t3.id, 0);
+id name gid uid ident level
+1 fs NULL NULL 0 READ
+drop table t1,t2,t3;
+CREATE TABLE t1 (
+acct_id int(11) NOT NULL default '0',
+profile_id smallint(6) default NULL,
+UNIQUE KEY t1$acct_id (acct_id),
+KEY t1$profile_id (profile_id)
+);
+INSERT INTO t1 VALUES (132,17),(133,18);
+CREATE TABLE t2 (
+profile_id smallint(6) default NULL,
+queue_id int(11) default NULL,
+seq int(11) default NULL,
+KEY t2$queue_id (queue_id)
+);
+INSERT INTO t2 VALUES (17,31,4),(17,30,3),(17,36,2),(17,37,1);
+CREATE TABLE t3 (
+id int(11) NOT NULL default '0',
+qtype int(11) default NULL,
+seq int(11) default NULL,
+warn_lvl int(11) default NULL,
+crit_lvl int(11) default NULL,
+rr1 tinyint(4) NOT NULL default '0',
+rr2 int(11) default NULL,
+default_queue tinyint(4) NOT NULL default '0',
+KEY t3$qtype (qtype),
+KEY t3$id (id)
+);
+INSERT INTO t3 VALUES (30,1,29,NULL,NULL,0,NULL,0),(31,1,28,NULL,NULL,0,NULL,0),
+(36,1,34,NULL,NULL,0,NULL,0),(37,1,35,NULL,NULL,0,121,0);
+SELECT COUNT(*) FROM t1 a STRAIGHT_JOIN t2 pq STRAIGHT_JOIN t3 q
+WHERE
+(pq.profile_id = a.profile_id) AND (a.acct_id = 132) AND
+(pq.queue_id = q.id) AND (q.rr1 <> 1);
+COUNT(*)
+4
+drop table t1,t2,t3;
+create table t1 (f1 int);
+insert into t1 values (1),(NULL);
+create table t2 (f2 int, f3 int, f4 int);
+create index idx1 on t2 (f4);
+insert into t2 values (1,2,3),(2,4,6);
+select A.f2 from t1 left join t2 A on A.f2 = f1 where A.f3=(select min(f3)
+from t2 C where A.f4 = C.f4) or A.f3 IS NULL;
+f2
+1
+NULL
+drop table t1,t2;
+create table t2 (a tinyint unsigned);
+create index t2i on t2(a);
+insert into t2 values (0), (254), (255);
+explain select * from t2 where a > -1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 index t2i t2i 2 NULL 3 Using where; Using index
+select * from t2 where a > -1;
+a
+0
+254
+255
+drop table t2;
+CREATE TABLE t1 (a int, b int, c int);
+INSERT INTO t1
+SELECT 50, 3, 3 FROM DUAL
+WHERE NOT EXISTS
+(SELECT * FROM t1 WHERE a = 50 AND b = 3);
+SELECT * FROM t1;
+a b c
+50 3 3
+INSERT INTO t1
+SELECT 50, 3, 3 FROM DUAL
+WHERE NOT EXISTS
+(SELECT * FROM t1 WHERE a = 50 AND b = 3);
+select found_rows();
+found_rows()
+0
+SELECT * FROM t1;
+a b c
+50 3 3
+select count(*) from t1;
+count(*)
+1
+select found_rows();
+found_rows()
+1
+select count(*) from t1 limit 2,3;
+count(*)
+select found_rows();
+found_rows()
+0
+select SQL_CALC_FOUND_ROWS count(*) from t1 limit 2,3;
+count(*)
+select found_rows();
+found_rows()
+1
+DROP TABLE t1;
+CREATE TABLE t1 (a INT, b INT);
+(SELECT a, b AS c FROM t1) ORDER BY c+1;
+a c
+(SELECT a, b AS c FROM t1) ORDER BY b+1;
+a c
+SELECT a, b AS c FROM t1 ORDER BY c+1;
+a c
+SELECT a, b AS c FROM t1 ORDER BY b+1;
+a c
+drop table t1;
+create table t1(f1 int, f2 int);
+create table t2(f3 int);
+select f1 from t1,t2 where f1=f2 and (f1,f2) = ((1,1));
+f1
+select f1 from t1,t2 where f1=f2 and (f1,NULL) = ((1,1));
+f1
+select f1 from t1,t2 where f1=f2 and (f1,f2) = ((1,NULL));
+f1
+insert into t1 values(1,1),(2,null);
+insert into t2 values(2);
+select * from t1,t2 where f1=f3 and (f1,f2) = (2,null);
+f1 f2 f3
+select * from t1,t2 where f1=f3 and (f1,f2) <=> (2,null);
+f1 f2 f3
+2 NULL 2
+drop table t1,t2;
+create table t1 (f1 int not null auto_increment primary key, f2 varchar(10));
+create table t11 like t1;
+insert into t1 values(1,""),(2,"");
+show table status like 't1%';
+Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
+t1 MyISAM 10 Dynamic 2 20 X X X X X X X X latin1_swedish_ci NULL
+t11 MyISAM 10 Dynamic 0 0 X X X X X X X X latin1_swedish_ci NULL
+select 123 as a from t1 where f1 is null;
+a
+drop table t1,t11;
+CREATE TABLE t1 ( a INT NOT NULL, b INT NOT NULL, UNIQUE idx (a,b) );
+INSERT INTO t1 VALUES (1,1),(1,2),(1,3),(1,4);
+CREATE TABLE t2 ( a INT NOT NULL, b INT NOT NULL, e INT );
+INSERT INTO t2 VALUES ( 1,10,1), (1,10,2), (1,11,1), (1,11,2), (1,2,1), (1,2,2),(1,2,3);
+SELECT t2.a, t2.b, IF(t1.b IS NULL,'',e) AS c, COUNT(*) AS d FROM t2 LEFT JOIN
+t1 ON t2.a = t1.a AND t2.b = t1.b GROUP BY a, b, c;
+a b c d
+1 2 1 1
+1 2 2 1
+1 2 3 1
+1 10 2
+1 11 2
+SELECT t2.a, t2.b, IF(t1.b IS NULL,'',e) AS c, COUNT(*) AS d FROM t2 LEFT JOIN
+t1 ON t2.a = t1.a AND t2.b = t1.b GROUP BY t1.a, t1.b, c;
+a b c d
+1 10 4
+1 2 1 1
+1 2 2 1
+1 2 3 1
+SELECT t2.a, t2.b, IF(t1.b IS NULL,'',e) AS c, COUNT(*) AS d FROM t2 LEFT JOIN
+t1 ON t2.a = t1.a AND t2.b = t1.b GROUP BY t2.a, t2.b, c;
+a b c d
+1 2 1 1
+1 2 2 1
+1 2 3 1
+1 10 2
+1 11 2
+SELECT t2.a, t2.b, IF(t1.b IS NULL,'',e) AS c, COUNT(*) AS d FROM t2,t1
+WHERE t2.a = t1.a AND t2.b = t1.b GROUP BY a, b, c;
+a b c d
+1 2 1 1
+1 2 2 1
+1 2 3 1
+DROP TABLE IF EXISTS t1, t2;
+create table t1 (f1 int primary key, f2 int);
+create table t2 (f3 int, f4 int, primary key(f3,f4));
+insert into t1 values (1,1);
+insert into t2 values (1,1),(1,2);
+select distinct count(f2) >0 from t1 left join t2 on f1=f3 group by f1;
+count(f2) >0
+1
+drop table t1,t2;
+create table t1 (f1 int,f2 int);
+insert into t1 values(1,1);
+create table t2 (f3 int, f4 int, primary key(f3,f4));
+insert into t2 values(1,1);
+select * from t1 where f1 in (select f3 from t2 where (f3,f4)= (select f3,f4 from t2));
+f1 f2
+1 1
+drop table t1,t2;
+CREATE TABLE t1(a int, b int, c int, KEY b(b), KEY c(c));
+insert into t1 values (1,0,0),(2,0,0);
+CREATE TABLE t2 (a int, b varchar(2), c varchar(2), PRIMARY KEY(a));
+insert into t2 values (1,'',''), (2,'','');
+CREATE TABLE t3 (a int, b int, PRIMARY KEY (a,b), KEY a (a), KEY b (b));
+insert into t3 values (1,1),(1,2);
+explain select straight_join DISTINCT t2.a,t2.b, t1.c from t1, t3, t2
+where (t1.c=t2.a or (t1.c=t3.a and t2.a=t3.b)) and t1.b=556476786 and
+t2.b like '%%' order by t2.b limit 0,1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref b,c b 5 const 1 Using where; Using temporary; Using filesort
+1 SIMPLE t3 index PRIMARY,a,b PRIMARY 8 NULL 2 Using index; Using join buffer
+1 SIMPLE t2 ALL PRIMARY NULL NULL NULL 2 Range checked for each record (index map: 0x1)
+DROP TABLE t1,t2,t3;
+CREATE TABLE t1 (a int, INDEX idx(a));
+INSERT INTO t1 VALUES (2), (3), (1);
+EXPLAIN SELECT * FROM t1 IGNORE INDEX (idx);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 3
+EXPLAIN SELECT * FROM t1 IGNORE INDEX (a);
+ERROR 42000: Key 'a' doesn't exist in table 't1'
+EXPLAIN SELECT * FROM t1 FORCE INDEX (a);
+ERROR 42000: Key 'a' doesn't exist in table 't1'
+DROP TABLE t1;
+CREATE TABLE t1 (a int, b int);
+INSERT INTO t1 VALUES (1,1), (2,1), (4,10);
+CREATE TABLE t2 (a int PRIMARY KEY, b int, KEY b (b));
+INSERT INTO t2 VALUES (1,NULL), (2,10);
+ALTER TABLE t1 ENABLE KEYS;
+EXPLAIN SELECT STRAIGHT_JOIN SQL_NO_CACHE COUNT(*) FROM t2, t1 WHERE t1.b = t2.b OR t2.b IS NULL;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 index b b 5 NULL 2 Using index
+1 SIMPLE t1 ALL NULL NULL NULL NULL 3 Using where; Using join buffer
+SELECT STRAIGHT_JOIN SQL_NO_CACHE * FROM t2, t1 WHERE t1.b = t2.b OR t2.b IS NULL;
+a b a b
+1 NULL 1 1
+1 NULL 2 1
+1 NULL 4 10
+2 10 4 10
+EXPLAIN SELECT STRAIGHT_JOIN SQL_NO_CACHE COUNT(*) FROM t2, t1 WHERE t1.b = t2.b OR t2.b IS NULL;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 index b b 5 NULL 2 Using index
+1 SIMPLE t1 ALL NULL NULL NULL NULL 3 Using where; Using join buffer
+SELECT STRAIGHT_JOIN SQL_NO_CACHE * FROM t2, t1 WHERE t1.b = t2.b OR t2.b IS NULL;
+a b a b
+1 NULL 1 1
+1 NULL 2 1
+1 NULL 4 10
+2 10 4 10
+DROP TABLE IF EXISTS t1,t2;
+CREATE TABLE t1 (key1 float default NULL, UNIQUE KEY key1 (key1));
+CREATE TABLE t2 (key2 float default NULL, UNIQUE KEY key2 (key2));
+INSERT INTO t1 VALUES (0.3762),(0.3845),(0.6158),(0.7941);
+INSERT INTO t2 VALUES (1.3762),(1.3845),(1.6158),(1.7941);
+explain select max(key1) from t1 where key1 <= 0.6158;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+explain select max(key2) from t2 where key2 <= 1.6158;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+explain select min(key1) from t1 where key1 >= 0.3762;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+explain select min(key2) from t2 where key2 >= 1.3762;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+explain select max(key1), min(key2) from t1, t2
+where key1 <= 0.6158 and key2 >= 1.3762;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+explain select max(key1) from t1 where key1 <= 0.6158 and rand() + 0.5 >= 0.5;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+explain select min(key1) from t1 where key1 >= 0.3762 and rand() + 0.5 >= 0.5;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Select tables optimized away
+select max(key1) from t1 where key1 <= 0.6158;
+max(key1)
+0.615800023078918
+select max(key2) from t2 where key2 <= 1.6158;
+max(key2)
+1.61580002307892
+select min(key1) from t1 where key1 >= 0.3762;
+min(key1)
+0.376199990510941
+select min(key2) from t2 where key2 >= 1.3762;
+min(key2)
+1.37619996070862
+select max(key1), min(key2) from t1, t2
+where key1 <= 0.6158 and key2 >= 1.3762;
+max(key1) min(key2)
+0.615800023078918 1.37619996070862
+select max(key1) from t1 where key1 <= 0.6158 and rand() + 0.5 >= 0.5;
+max(key1)
+0.615800023078918
+select min(key1) from t1 where key1 >= 0.3762 and rand() + 0.5 >= 0.5;
+min(key1)
+0.376199990510941
+DROP TABLE t1,t2;
+CREATE TABLE t1 (i BIGINT UNSIGNED NOT NULL);
+INSERT INTO t1 VALUES (10);
+SELECT i='1e+01',i=1e+01, i in (1e+01,1e+01), i in ('1e+01','1e+01') FROM t1;
+i='1e+01' i=1e+01 i in (1e+01,1e+01) i in ('1e+01','1e+01')
+1 1 1 1
+DROP TABLE t1;
+create table t1(a bigint unsigned, b bigint);
+insert into t1 values (0xfffffffffffffffff, 0xfffffffffffffffff),
+(0x10000000000000000, 0x10000000000000000),
+(0x8fffffffffffffff, 0x8fffffffffffffff);
+Warnings:
+Warning 1264 Out of range value for column 'a' at row 1
+Warning 1264 Out of range value for column 'b' at row 1
+Warning 1264 Out of range value for column 'a' at row 2
+Warning 1264 Out of range value for column 'b' at row 2
+Warning 1264 Out of range value for column 'b' at row 3
+select hex(a), hex(b) from t1;
+hex(a) hex(b)
+FFFFFFFFFFFFFFFF 7FFFFFFFFFFFFFFF
+FFFFFFFFFFFFFFFF 7FFFFFFFFFFFFFFF
+8FFFFFFFFFFFFFFF 7FFFFFFFFFFFFFFF
+drop table t1;
+CREATE TABLE t1 (c0 int);
+CREATE TABLE t2 (c0 int);
+INSERT INTO t1 VALUES(@@connect_timeout);
+INSERT INTO t2 VALUES(@@connect_timeout);
+SELECT * FROM t1 JOIN t2 ON t1.c0 = t2.c0 WHERE (t1.c0 <=> @@connect_timeout);
+c0 c0
+X X
+DROP TABLE t1, t2;
+End of 4.1 tests
+CREATE TABLE t1 (
+K2C4 varchar(4) character set latin1 collate latin1_bin NOT NULL default '',
+K4N4 varchar(4) character set latin1 collate latin1_bin NOT NULL default '0000',
+F2I4 int(11) NOT NULL default '0'
+) ENGINE=MyISAM DEFAULT CHARSET=latin1;
+INSERT INTO t1 VALUES
+('W%RT', '0100', 1),
+('W-RT', '0100', 1),
+('WART', '0100', 1),
+('WART', '0200', 1),
+('WERT', '0100', 2),
+('WORT','0200', 2),
+('WT', '0100', 2),
+('W_RT', '0100', 2),
+('WaRT', '0100', 3),
+('WART', '0300', 3),
+('WRT' , '0400', 3),
+('WURM', '0500', 3),
+('W%T', '0600', 4),
+('WA%T', '0700', 4),
+('WA_T', '0800', 4);
+SELECT K2C4, K4N4, F2I4 FROM t1
+WHERE K2C4 = 'WART' AND
+(F2I4 = 2 AND K2C4 = 'WART' OR (F2I4 = 2 OR K4N4 = '0200'));
+K2C4 K4N4 F2I4
+WART 0200 1
+SELECT K2C4, K4N4, F2I4 FROM t1
+WHERE K2C4 = 'WART' AND (K2C4 = 'WART' OR K4N4 = '0200');
+K2C4 K4N4 F2I4
+WART 0100 1
+WART 0200 1
+WART 0300 3
+DROP TABLE t1;
+create table t1 (a int, b int);
+create table t2 like t1;
+select t1.a from (t1 inner join t2 on t1.a=t2.a) where t2.a=1;
+a
+select t1.a from ((t1 inner join t2 on t1.a=t2.a)) where t2.a=1;
+a
+select x.a, y.a, z.a from ( (t1 x inner join t2 y on x.a=y.a) inner join t2 z on y.a=z.a) WHERE x.a=1;
+a a a
+drop table t1,t2;
+create table t1 (s1 varchar(5));
+insert into t1 values ('Wall');
+select min(s1) from t1 group by s1 with rollup;
+min(s1)
+Wall
+Wall
+drop table t1;
+create table t1 (s1 int) engine=myisam;
+insert into t1 values (0);
+select avg(distinct s1) from t1 group by s1 with rollup;
+avg(distinct s1)
+0.0000
+0.0000
+drop table t1;
+create table t1 (s1 int);
+insert into t1 values (null),(1);
+select distinct avg(s1) as x from t1 group by s1 with rollup;
+x
+NULL
+1.0000
+drop table t1;
+CREATE TABLE t1 (a int);
+CREATE TABLE t2 (a int);
+INSERT INTO t1 VALUES (1), (2), (3), (4), (5);
+INSERT INTO t2 VALUES (2), (4), (6);
+SELECT t1.a FROM t1 STRAIGHT_JOIN t2 ON t1.a=t2.a;
+a
+2
+4
+EXPLAIN SELECT t1.a FROM t1 STRAIGHT_JOIN t2 ON t1.a=t2.a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5
+1 SIMPLE t2 ALL NULL NULL NULL NULL 3 Using where; Using join buffer
+EXPLAIN SELECT t1.a FROM t1 INNER JOIN t2 ON t1.a=t2.a;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ALL NULL NULL NULL NULL 3
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5 Using where; Using join buffer
+DROP TABLE t1,t2;
+select x'10' + 0, X'10' + 0, b'10' + 0, B'10' + 0;
+x'10' + 0 X'10' + 0 b'10' + 0 B'10' + 0
+16 16 2 2
+create table t1 (f1 varchar(6) default NULL, f2 int(6) primary key not null);
+create table t2 (f3 varchar(5) not null, f4 varchar(5) not null, UNIQUE KEY UKEY (f3,f4));
+insert into t1 values (" 2", 2);
+insert into t2 values (" 2", " one "),(" 2", " two ");
+select * from t1 left join t2 on f1 = f3;
+f1 f2 f3 f4
+ 2 2 2 one
+ 2 2 2 two
+drop table t1,t2;
+create table t1 (empnum smallint, grp int);
+create table t2 (empnum int, name char(5));
+insert into t1 values(1,1);
+insert into t2 values(1,'bob');
+create view v1 as select * from t2 inner join t1 using (empnum);
+select * from v1;
+empnum name grp
+1 bob 1
+drop table t1,t2;
+drop view v1;
+create table t1 (pk int primary key, b int);
+create table t2 (pk int primary key, c int);
+select pk from t1 inner join t2 using (pk);
+pk
+drop table t1,t2;
+create table t1 (s1 int, s2 char(5), s3 decimal(10));
+create view v1 as select s1, s2, 'x' as s3 from t1;
+select * from t1 natural join v1;
+s1 s2 s3
+insert into t1 values (1,'x',5);
+select * from t1 natural join v1;
+s1 s2 s3
+Warnings:
+Warning 1292 Truncated incorrect DOUBLE value: 'x'
+drop table t1;
+drop view v1;
+create table t1(a1 int);
+create table t2(a2 int);
+insert into t1 values(1),(2);
+insert into t2 values(1),(2);
+create view v2 (c) as select a1 from t1;
+select * from t1 natural left join t2;
+a1 a2
+1 1
+1 2
+2 1
+2 2
+select * from t1 natural right join t2;
+a2 a1
+1 1
+1 2
+2 1
+2 2
+select * from v2 natural left join t2;
+c a2
+1 1
+1 2
+2 1
+2 2
+select * from v2 natural right join t2;
+a2 c
+1 1
+1 2
+2 1
+2 2
+drop table t1, t2;
+drop view v2;
+create table t1 (a int(10), t1_val int(10));
+create table t2 (b int(10), t2_val int(10));
+create table t3 (a int(10), b int(10));
+insert into t1 values (1,1),(2,2);
+insert into t2 values (1,1),(2,2),(3,3);
+insert into t3 values (1,1),(2,1),(3,1),(4,1);
+select * from t1 natural join t2 natural join t3;
+a b t1_val t2_val
+1 1 1 1
+2 1 2 1
+select * from t1 natural join t3 natural join t2;
+b a t1_val t2_val
+1 1 1 1
+1 2 2 1
+drop table t1, t2, t3;
+DO IFNULL(NULL, NULL);
+SELECT CAST(IFNULL(NULL, NULL) AS DECIMAL);
+CAST(IFNULL(NULL, NULL) AS DECIMAL)
+NULL
+SELECT ABS(IFNULL(NULL, NULL));
+ABS(IFNULL(NULL, NULL))
+NULL
+SELECT IFNULL(NULL, NULL);
+IFNULL(NULL, NULL)
+NULL
+SET @OLD_SQL_MODE12595=@@SQL_MODE, @@SQL_MODE='';
+SHOW LOCAL VARIABLES LIKE 'SQL_MODE';
+Variable_name Value
+sql_mode
+CREATE TABLE BUG_12595(a varchar(100));
+INSERT INTO BUG_12595 VALUES ('hakan%'), ('hakank'), ("ha%an");
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan\%';
+a
+hakan%
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan*%' ESCAPE '*';
+a
+hakan%
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan**%' ESCAPE '**';
+ERROR HY000: Incorrect arguments to ESCAPE
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan%' ESCAPE '';
+a
+hakan%
+hakank
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan\%' ESCAPE '';
+a
+SELECT * FROM BUG_12595 WHERE a LIKE 'ha\%an' ESCAPE 0x5c;
+a
+ha%an
+SELECT * FROM BUG_12595 WHERE a LIKE 'ha%%an' ESCAPE '%';
+a
+ha%an
+SELECT * FROM BUG_12595 WHERE a LIKE 'ha\%an' ESCAPE '\\';
+a
+ha%an
+SELECT * FROM BUG_12595 WHERE a LIKE 'ha|%an' ESCAPE '|';
+a
+ha%an
+SET @@SQL_MODE='NO_BACKSLASH_ESCAPES';
+SHOW LOCAL VARIABLES LIKE 'SQL_MODE';
+Variable_name Value
+sql_mode NO_BACKSLASH_ESCAPES
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan\%';
+a
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan*%' ESCAPE '*';
+a
+hakan%
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan**%' ESCAPE '**';
+ERROR HY000: Incorrect arguments to ESCAPE
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan\%' ESCAPE '\\';
+ERROR HY000: Incorrect arguments to ESCAPE
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan%' ESCAPE '';
+ERROR HY000: Incorrect arguments to ESCAPE
+SELECT * FROM BUG_12595 WHERE a LIKE 'ha\%an' ESCAPE 0x5c;
+a
+ha%an
+SELECT * FROM BUG_12595 WHERE a LIKE 'ha|%an' ESCAPE '|';
+a
+ha%an
+SELECT * FROM BUG_12595 WHERE a LIKE 'hakan\n%' ESCAPE '\n';
+ERROR HY000: Incorrect arguments to ESCAPE
+SET @@SQL_MODE=@OLD_SQL_MODE12595;
+DROP TABLE BUG_12595;
+create table t1 (a char(1));
+create table t2 (a char(1));
+insert into t1 values ('a'),('b'),('c');
+insert into t2 values ('b'),('c'),('d');
+select a from t1 natural join t2;
+a
+b
+c
+select * from t1 natural join t2 where a = 'b';
+a
+b
+drop table t1, t2;
+CREATE TABLE t1 (`id` TINYINT);
+CREATE TABLE t2 (`id` TINYINT);
+CREATE TABLE t3 (`id` TINYINT);
+INSERT INTO t1 VALUES (1),(2),(3);
+INSERT INTO t2 VALUES (2);
+INSERT INTO t3 VALUES (3);
+SELECT t1.id,t3.id FROM t1 JOIN t2 ON (t2.id=t1.id) LEFT JOIN t3 USING (id);
+ERROR 23000: Column 'id' in from clause is ambiguous
+SELECT t1.id,t3.id FROM t1 JOIN t2 ON (t2.notacolumn=t1.id) LEFT JOIN t3 USING (id);
+ERROR 23000: Column 'id' in from clause is ambiguous
+SELECT id,t3.id FROM t1 JOIN t2 ON (t2.id=t1.id) LEFT JOIN t3 USING (id);
+ERROR 23000: Column 'id' in from clause is ambiguous
+SELECT id,t3.id FROM (t1 JOIN t2 ON (t2.id=t1.id)) LEFT JOIN t3 USING (id);
+ERROR 23000: Column 'id' in from clause is ambiguous
+drop table t1, t2, t3;
+create table t1 (a int(10),b int(10));
+create table t2 (a int(10),b int(10));
+insert into t1 values (1,10),(2,20),(3,30);
+insert into t2 values (1,10);
+select * from t1 inner join t2 using (A);
+a b b
+1 10 10
+select * from t1 inner join t2 using (a);
+a b b
+1 10 10
+drop table t1, t2;
+create table t1 (a int, c int);
+create table t2 (b int);
+create table t3 (b int, a int);
+create table t4 (c int);
+insert into t1 values (1,1);
+insert into t2 values (1);
+insert into t3 values (1,1);
+insert into t4 values (1);
+select * from t1 join t2 join t3 on (t2.b = t3.b and t1.a = t3.a);
+a c b b a
+1 1 1 1 1
+select * from t1, t2 join t3 on (t2.b = t3.b and t1.a = t3.a);
+ERROR 42S22: Unknown column 't1.a' in 'on clause'
+select * from t1 join t2 join t3 join t4 on (t1.a = t4.c and t2.b = t4.c);
+a c b b a c
+1 1 1 1 1 1
+select * from t1 join t2 join t4 using (c);
+c a b
+1 1 1
+drop table t1, t2, t3, t4;
+create table t1(x int, y int);
+create table t2(x int, y int);
+create table t3(x int, primary key(x));
+insert into t1 values (1, 1), (2, 1), (3, 1), (4, 3), (5, 6), (6, 6);
+insert into t2 values (1, 1), (2, 1), (3, 3), (4, 6), (5, 6);
+insert into t3 values (1), (2), (3), (4), (5);
+select t1.x, t3.x from t1, t2, t3 where t1.x = t2.x and t3.x >= t1.y and t3.x <= t2.y;
+x x
+1 1
+2 1
+3 1
+3 2
+3 3
+4 3
+4 4
+4 5
+drop table t1,t2,t3;
+create table t1 (id char(16) not null default '', primary key (id));
+insert into t1 values ('100'),('101'),('102');
+create table t2 (id char(16) default null);
+insert into t2 values (1);
+create view v1 as select t1.id from t1;
+create view v2 as select t2.id from t2;
+create view v3 as select (t1.id+2) as id from t1 natural left join t2;
+select t1.id from t1 left join v2 using (id);
+id
+100
+101
+102
+select t1.id from v2 right join t1 using (id);
+id
+100
+101
+102
+select t1.id from t1 left join v3 using (id);
+id
+100
+101
+102
+select * from t1 left join v2 using (id);
+id
+100
+101
+102
+select * from v2 right join t1 using (id);
+id
+100
+101
+102
+select * from t1 left join v3 using (id);
+id
+100
+101
+102
+select v1.id from v1 left join v2 using (id);
+id
+100
+101
+102
+select v1.id from v2 right join v1 using (id);
+id
+100
+101
+102
+select v1.id from v1 left join v3 using (id);
+id
+100
+101
+102
+select * from v1 left join v2 using (id);
+id
+100
+101
+102
+select * from v2 right join v1 using (id);
+id
+100
+101
+102
+select * from v1 left join v3 using (id);
+id
+100
+101
+102
+drop table t1, t2;
+drop view v1, v2, v3;
+create table t1 (id int(11) not null default '0');
+insert into t1 values (123),(191),(192);
+create table t2 (id char(16) character set utf8 not null);
+insert into t2 values ('58013'),('58014'),('58015'),('58016');
+create table t3 (a_id int(11) not null, b_id char(16) character set utf8);
+insert into t3 values (123,null),(123,null),(123,null),(123,null),(123,null),(123,'58013');
+select count(*)
+from t1 inner join (t3 left join t2 on t2.id = t3.b_id) on t1.id = t3.a_id;
+count(*)
+6
+select count(*)
+from t1 inner join (t2 right join t3 on t2.id = t3.b_id) on t1.id = t3.a_id;
+count(*)
+6
+drop table t1,t2,t3;
+create table t1 (a int);
+create table t2 (b int);
+create table t3 (c int);
+select * from t1 join t2 join t3 on (t1.a=t3.c);
+a b c
+select * from t1 join t2 left join t3 on (t1.a=t3.c);
+a b c
+select * from t1 join t2 right join t3 on (t1.a=t3.c);
+a b c
+select * from t1 join t2 straight_join t3 on (t1.a=t3.c);
+a b c
+drop table t1, t2 ,t3;
+create table t1(f1 int, f2 date);
+insert into t1 values(1,'2005-01-01'),(2,'2005-09-01'),(3,'2005-09-30'),
+(4,'2005-10-01'),(5,'2005-12-30');
+select * from t1 where f2 >= 0 order by f2;
+f1 f2
+1 2005-01-01
+2 2005-09-01
+3 2005-09-30
+4 2005-10-01
+5 2005-12-30
+select * from t1 where f2 >= '0000-00-00' order by f2;
+f1 f2
+1 2005-01-01
+2 2005-09-01
+3 2005-09-30
+4 2005-10-01
+5 2005-12-30
+select * from t1 where f2 >= '2005-09-31' order by f2;
+f1 f2
+4 2005-10-01
+5 2005-12-30
+select * from t1 where f2 >= '2005-09-3a' order by f2;
+f1 f2
+3 2005-09-30
+4 2005-10-01
+5 2005-12-30
+Warnings:
+Warning 1292 Incorrect date value: '2005-09-3a' for column 'f2' at row 1
+select * from t1 where f2 <= '2005-09-31' order by f2;
+f1 f2
+1 2005-01-01
+2 2005-09-01
+3 2005-09-30
+select * from t1 where f2 <= '2005-09-3a' order by f2;
+f1 f2
+1 2005-01-01
+2 2005-09-01
+Warnings:
+Warning 1292 Incorrect date value: '2005-09-3a' for column 'f2' at row 1
+drop table t1;
+create table t1 (f1 int, f2 int);
+insert into t1 values (1, 30), (2, 20), (3, 10);
+create algorithm=merge view v1 as select f1, f2 from t1;
+create algorithm=merge view v2 (f2, f1) as select f1, f2 from t1;
+create algorithm=merge view v3 as select t1.f1 as f2, t1.f2 as f1 from t1;
+select t1.f1 as x1, f1 from t1 order by t1.f1;
+x1 f1
+1 1
+2 2
+3 3
+select v1.f1 as x1, f1 from v1 order by v1.f1;
+x1 f1
+1 1
+2 2
+3 3
+select v2.f1 as x1, f1 from v2 order by v2.f1;
+x1 f1
+10 10
+20 20
+30 30
+select v3.f1 as x1, f1 from v3 order by v3.f1;
+x1 f1
+10 10
+20 20
+30 30
+select f1, f2, v1.f1 as x1 from v1 order by v1.f1;
+f1 f2 x1
+1 30 1
+2 20 2
+3 10 3
+select f1, f2, v2.f1 as x1 from v2 order by v2.f1;
+f1 f2 x1
+10 3 10
+20 2 20
+30 1 30
+select f1, f2, v3.f1 as x1 from v3 order by v3.f1;
+f1 f2 x1
+10 3 10
+20 2 20
+30 1 30
+drop table t1;
+drop view v1, v2, v3;
+CREATE TABLE t1(key_a int4 NOT NULL, optimus varchar(32), PRIMARY KEY(key_a));
+CREATE TABLE t2(key_a int4 NOT NULL, prime varchar(32), PRIMARY KEY(key_a));
+CREATE table t3(key_a int4 NOT NULL, key_b int4 NOT NULL, foo varchar(32),
+PRIMARY KEY(key_a,key_b));
+INSERT INTO t1 VALUES (0,'');
+INSERT INTO t1 VALUES (1,'i');
+INSERT INTO t1 VALUES (2,'j');
+INSERT INTO t1 VALUES (3,'k');
+INSERT INTO t2 VALUES (1,'r');
+INSERT INTO t2 VALUES (2,'s');
+INSERT INTO t2 VALUES (3,'t');
+INSERT INTO t3 VALUES (1,5,'x');
+INSERT INTO t3 VALUES (1,6,'y');
+INSERT INTO t3 VALUES (2,5,'xx');
+INSERT INTO t3 VALUES (2,6,'yy');
+INSERT INTO t3 VALUES (2,7,'zz');
+INSERT INTO t3 VALUES (3,5,'xxx');
+SELECT t2.key_a,foo
+FROM t1 INNER JOIN t2 ON t1.key_a = t2.key_a
+INNER JOIN t3 ON t1.key_a = t3.key_a
+WHERE t2.key_a=2 and key_b=5;
+key_a foo
+2 xx
+EXPLAIN SELECT t2.key_a,foo
+FROM t1 INNER JOIN t2 ON t1.key_a = t2.key_a
+INNER JOIN t3 ON t1.key_a = t3.key_a
+WHERE t2.key_a=2 and key_b=5;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1 Using index
+1 SIMPLE t2 const PRIMARY PRIMARY 4 const 1 Using index
+1 SIMPLE t3 const PRIMARY PRIMARY 8 const,const 1
+SELECT t2.key_a,foo
+FROM t1 INNER JOIN t2 ON t2.key_a = t1.key_a
+INNER JOIN t3 ON t1.key_a = t3.key_a
+WHERE t2.key_a=2 and key_b=5;
+key_a foo
+2 xx
+EXPLAIN SELECT t2.key_a,foo
+FROM t1 INNER JOIN t2 ON t2.key_a = t1.key_a
+INNER JOIN t3 ON t1.key_a = t3.key_a
+WHERE t2.key_a=2 and key_b=5;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1 Using index
+1 SIMPLE t2 const PRIMARY PRIMARY 4 const 1 Using index
+1 SIMPLE t3 const PRIMARY PRIMARY 8 const,const 1
+DROP TABLE t1,t2,t3;
+create table t1 (f1 int);
+insert into t1 values(1),(2);
+create table t2 (f2 int, f3 int, key(f2));
+insert into t2 values(1,1),(2,2);
+create table t3 (f4 int not null);
+insert into t3 values (2),(2),(2);
+select f1,(select count(*) from t2,t3 where f2=f1 and f3=f4) as count from t1;
+f1 count
+1 0
+2 3
+drop table t1,t2,t3;
+create table t1 (f1 int unique);
+create table t2 (f2 int unique);
+create table t3 (f3 int unique);
+insert into t1 values(1),(2);
+insert into t2 values(1),(2);
+insert into t3 values(1),(NULL);
+select * from t3 where f3 is null;
+f3
+NULL
+select t2.f2 from t1 left join t2 on f1=f2 join t3 on f1=f3 where f1=1;
+f2
+1
+drop table t1,t2,t3;
+create table t1(f1 char, f2 char not null);
+insert into t1 values(null,'a');
+create table t2 (f2 char not null);
+insert into t2 values('b');
+select * from t1 left join t2 on f1=t2.f2 where t1.f2='a';
+f1 f2 f2
+NULL a NULL
+drop table t1,t2;
+select * from (select * left join t on f1=f2) tt;
+ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'on f1=f2) tt' at line 1
+CREATE TABLE t1 (sku int PRIMARY KEY, pr int);
+CREATE TABLE t2 (sku int PRIMARY KEY, sppr int, name varchar(255));
+INSERT INTO t1 VALUES
+(10, 10), (20, 10), (30, 20), (40, 30), (50, 10), (60, 10);
+INSERT INTO t2 VALUES
+(10, 10, 'aaa'), (20, 10, 'bbb'), (30, 10, 'ccc'), (40, 20, 'ddd'),
+(50, 10, 'eee'), (60, 20, 'fff'), (70, 20, 'ggg'), (80, 30, 'hhh');
+SELECT t2.sku, t2.sppr, t2.name, t1.sku, t1.pr
+FROM t2, t1 WHERE t2.sku=20 AND (t2.sku=t1.sku OR t2.sppr=t1.sku);
+sku sppr name sku pr
+20 10 bbb 10 10
+20 10 bbb 20 10
+EXPLAIN
+SELECT t2.sku, t2.sppr, t2.name, t1.sku, t1.pr
+FROM t2, t1 WHERE t2.sku=20 AND (t2.sku=t1.sku OR t2.sppr=t1.sku);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t1 range PRIMARY PRIMARY 4 NULL 2 Using where
+DROP TABLE t1,t2;
+CREATE TABLE t1 (i TINYINT UNSIGNED NOT NULL);
+INSERT t1 SET i = 0;
+UPDATE t1 SET i = -1;
+Warnings:
+Warning 1264 Out of range value for column 'i' at row 1
+SELECT * FROM t1;
+i
+0
+UPDATE t1 SET i = CAST(i - 1 AS SIGNED);
+Warnings:
+Warning 1264 Out of range value for column 'i' at row 1
+SELECT * FROM t1;
+i
+0
+UPDATE t1 SET i = i - 1;
+Warnings:
+Warning 1264 Out of range value for column 'i' at row 1
+SELECT * FROM t1;
+i
+255
+DROP TABLE t1;
+create table t1 (a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t2 (a int, b int, c int, e int, primary key(a,b,c));
+insert into t2 select A.a, B.a, C.a, C.a from t1 A, t1 B, t1 C;
+analyze table t2;
+Table Op Msg_type Msg_text
+test.t2 analyze status OK
+select 'In next EXPLAIN, B.rows must be exactly 10:' Z;
+Z
+In next EXPLAIN, B.rows must be exactly 10:
+explain select * from t2 A, t2 B where A.a=5 and A.b=5 and A.C<5
+and B.a=5 and B.b=A.e and (B.b =1 or B.b = 3 or B.b=5);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE A range PRIMARY PRIMARY 12 NULL 4 Using where
+1 SIMPLE B ref PRIMARY PRIMARY 8 const,test.A.e 10
+drop table t1, t2;
+CREATE TABLE t1 (a int PRIMARY KEY, b int, INDEX(b));
+INSERT INTO t1 VALUES (1, 3), (9,4), (7,5), (4,5), (6,2),
+(3,1), (5,1), (8,9), (2,2), (0,9);
+CREATE TABLE t2 (c int, d int, f int, INDEX(c,f));
+INSERT INTO t2 VALUES
+(1,0,0), (1,0,1), (2,0,0), (2,0,1), (3,0,0), (4,0,1),
+(5,0,0), (5,0,1), (6,0,0), (0,0,1), (7,0,0), (7,0,1),
+(0,0,0), (0,0,1), (8,0,0), (8,0,1), (9,0,0), (9,0,1);
+EXPLAIN
+SELECT a, c, d, f FROM t1,t2 WHERE a=c AND b BETWEEN 4 AND 6;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 range PRIMARY,b b 5 NULL 3 Using where
+1 SIMPLE t2 ref c c 5 test.t1.a 2 Using where
+EXPLAIN
+SELECT a, c, d, f FROM t1,t2 WHERE a=c AND b BETWEEN 4 AND 6 AND a > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 range PRIMARY,b b 5 NULL 3 Using where
+1 SIMPLE t2 ref c c 5 test.t1.a 2 Using where
+DROP TABLE t1, t2;
+create table t1 (
+a int unsigned not null auto_increment primary key,
+b bit not null,
+c bit not null
+);
+create table t2 (
+a int unsigned not null auto_increment primary key,
+b bit not null,
+c int unsigned not null,
+d varchar(50)
+);
+insert into t1 (b,c) values (0,1), (0,1);
+insert into t2 (b,c) values (0,1);
+select t1.a, t1.b + 0, t1.c + 0, t2.a, t2.b + 0, t2.c, t2.d
+from t1 left outer join t2 on t1.a = t2.c and t2.b <> 1
+where t1.b <> 1 order by t1.a;
+a t1.b + 0 t1.c + 0 a t2.b + 0 c d
+1 0 1 1 0 1 NULL
+2 0 1 NULL NULL NULL NULL
+drop table t1,t2;
+SELECT 0.9888889889 * 1.011111411911;
+0.9888889889 * 1.011111411911
+0.9998769417899202067879
+prepare stmt from 'select 1 as " a "';
+Warnings:
+Warning 1466 Leading spaces are removed from name ' a '
+execute stmt;
+a
+1
+CREATE TABLE t1 (a int NOT NULL PRIMARY KEY, b int NOT NULL);
+INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4);
+CREATE TABLE t2 (c int NOT NULL, INDEX idx(c));
+INSERT INTO t2 VALUES
+(1), (1), (1), (1), (1), (1), (1), (1),
+(2), (2), (2), (2),
+(3), (3),
+(4);
+EXPLAIN SELECT b FROM t1, t2 WHERE b=c AND a=1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 ref idx idx 4 const 7 Using index
+EXPLAIN SELECT b FROM t1, t2 WHERE b=c AND a=4;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 ref idx idx 4 const 1 Using index
+DROP TABLE t1, t2;
+CREATE TABLE t1 (id int NOT NULL PRIMARY KEY, a int);
+INSERT INTO t1 VALUES (1,2), (2,NULL), (3,2);
+CREATE TABLE t2 (b int, c INT, INDEX idx1(b));
+INSERT INTO t2 VALUES (2,1), (3,2);
+CREATE TABLE t3 (d int, e int, INDEX idx1(d));
+INSERT INTO t3 VALUES (2,10), (2,20), (1,30), (2,40), (2,50);
+EXPLAIN
+SELECT * FROM t1 LEFT JOIN t2 ON t2.b=t1.a INNER JOIN t3 ON t3.d=t1.id
+WHERE t1.id=2;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 const idx1 NULL NULL NULL 1
+1 SIMPLE t3 ref idx1 idx1 5 const 3 Using where
+SELECT * FROM t1 LEFT JOIN t2 ON t2.b=t1.a INNER JOIN t3 ON t3.d=t1.id
+WHERE t1.id=2;
+id a b c d e
+2 NULL NULL NULL 2 10
+2 NULL NULL NULL 2 20
+2 NULL NULL NULL 2 40
+2 NULL NULL NULL 2 50
+DROP TABLE t1,t2,t3;
+create table t1 (c1 varchar(1), c2 int, c3 int, c4 int, c5 int, c6 int,
+c7 int, c8 int, c9 int, fulltext key (`c1`));
+select distinct match (`c1`) against ('z') , c2, c3, c4,c5, c6,c7, c8
+from t1 where c9=1 order by c2, c2;
+match (`c1`) against ('z') c2 c3 c4 c5 c6 c7 c8
+drop table t1;
+CREATE TABLE t1 (pk varchar(10) PRIMARY KEY, fk varchar(16));
+CREATE TABLE t2 (pk varchar(16) PRIMARY KEY, fk varchar(10));
+INSERT INTO t1 VALUES
+('d','dddd'), ('i','iii'), ('a','aa'), ('b','bb'), ('g','gg'),
+('e','eee'), ('c','cccc'), ('h','hhh'), ('j','jjj'), ('f','fff');
+INSERT INTO t2 VALUES
+('jjj', 'j'), ('cc','c'), ('ccc','c'), ('aaa', 'a'), ('jjjj','j'),
+('hhh','h'), ('gg','g'), ('fff','f'), ('ee','e'), ('ffff','f'),
+('bbb','b'), ('ff','f'), ('cccc','c'), ('dddd','d'), ('jj','j'),
+('aaaa','a'), ('bb','b'), ('eeee','e'), ('aa','a'), ('hh','h');
+EXPLAIN SELECT t2.*
+FROM t1 JOIN t2 ON t2.fk=t1.pk
+WHERE t2.fk < 'c' AND t2.pk=t1.fk;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 range PRIMARY PRIMARY 12 NULL 3 Using where
+1 SIMPLE t2 eq_ref PRIMARY PRIMARY 18 test.t1.fk 1 Using where
+EXPLAIN SELECT t2.*
+FROM t1 JOIN t2 ON t2.fk=t1.pk
+WHERE t2.fk BETWEEN 'a' AND 'b' AND t2.pk=t1.fk;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 range PRIMARY PRIMARY 12 NULL 2 Using where
+1 SIMPLE t2 eq_ref PRIMARY PRIMARY 18 test.t1.fk 1 Using where
+EXPLAIN SELECT t2.*
+FROM t1 JOIN t2 ON t2.fk=t1.pk
+WHERE t2.fk IN ('a','b') AND t2.pk=t1.fk;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 range PRIMARY PRIMARY 12 NULL 2 Using where
+1 SIMPLE t2 eq_ref PRIMARY PRIMARY 18 test.t1.fk 1 Using where
+DROP TABLE t1,t2;
+CREATE TABLE t1 (a int, b varchar(20) NOT NULL, PRIMARY KEY(a));
+CREATE TABLE t2 (a int, b varchar(20) NOT NULL,
+PRIMARY KEY (a), UNIQUE KEY (b));
+INSERT INTO t1 VALUES (1,'a'),(2,'b'),(3,'c');
+INSERT INTO t2 VALUES (1,'a'),(2,'b'),(3,'c');
+EXPLAIN SELECT t1.a FROM t1 LEFT JOIN t2 ON t2.b=t1.b WHERE t1.a=3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+DROP TABLE t1,t2;
+CREATE TABLE t1(id int PRIMARY KEY, b int, e int);
+CREATE TABLE t2(i int, a int, INDEX si(i), INDEX ai(a));
+CREATE TABLE t3(a int PRIMARY KEY, c char(4), INDEX ci(c));
+INSERT INTO t1 VALUES
+(1,10,19), (2,20,22), (4,41,42), (9,93,95), (7, 77,79),
+(6,63,67), (5,55,58), (3,38,39), (8,81,89);
+INSERT INTO t2 VALUES
+(21,210), (41,410), (82,820), (83,830), (84,840),
+(65,650), (51,510), (37,370), (94,940), (76,760),
+(22,220), (33,330), (40,400), (95,950), (38,380),
+(67,670), (88,880), (57,570), (96,960), (97,970);
+INSERT INTO t3 VALUES
+(210,'bb'), (950,'ii'), (400,'ab'), (500,'ee'), (220,'gg'),
+(440,'gg'), (310,'eg'), (380,'ee'), (840,'bb'), (830,'ff'),
+(230,'aa'), (960,'ii'), (410,'aa'), (510,'ee'), (290,'bb'),
+(450,'gg'), (320,'dd'), (390,'hh'), (850,'jj'), (860,'ff');
+EXPLAIN
+SELECT t3.a FROM t1,t2 FORCE INDEX (si),t3
+WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
+t3.a=t2.a AND t3.c IN ('bb','ee');
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 range si si 5 NULL 4 Using where
+1 SIMPLE t3 eq_ref PRIMARY,ci PRIMARY 4 test.t2.a 1 Using where
+EXPLAIN
+SELECT t3.a FROM t1,t2,t3
+WHERE t1.id = 8 AND t2.i BETWEEN t1.b AND t1.e AND
+t3.a=t2.a AND t3.c IN ('bb','ee') ;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 range si,ai si 5 NULL 4 Using where
+1 SIMPLE t3 eq_ref PRIMARY,ci PRIMARY 4 test.t2.a 1 Using where
+EXPLAIN
+SELECT t3.a FROM t1,t2 FORCE INDEX (si),t3
+WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
+t3.c IN ('bb','ee');
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 range si si 5 NULL 2 Using where
+1 SIMPLE t3 eq_ref PRIMARY,ci PRIMARY 4 test.t2.a 1 Using where
+EXPLAIN
+SELECT t3.a FROM t1,t2,t3
+WHERE t1.id = 8 AND (t2.i=t1.b OR t2.i=t1.e) AND t3.a=t2.a AND
+t3.c IN ('bb','ee');
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t2 range si,ai si 5 NULL 2 Using where
+1 SIMPLE t3 eq_ref PRIMARY,ci PRIMARY 4 test.t2.a 1 Using where
+DROP TABLE t1,t2,t3;
+CREATE TABLE t1 ( f1 int primary key, f2 int, f3 int, f4 int, f5 int, f6 int, checked_out int);
+CREATE TABLE t2 ( f11 int PRIMARY KEY );
+INSERT INTO t1 VALUES (1,1,1,0,0,0,0),(2,1,1,3,8,1,0),(3,1,1,4,12,1,0);
+INSERT INTO t2 VALUES (62);
+SELECT * FROM t1 LEFT JOIN t2 ON f11 = t1.checked_out GROUP BY f1 ORDER BY f2, f3, f4, f5 LIMIT 0, 1;
+f1 f2 f3 f4 f5 f6 checked_out f11
+1 1 1 0 0 0 0 NULL
+DROP TABLE t1, t2;
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1(a int);
+INSERT into t1 values (1), (2), (3);
+SELECT * FROM t1 LIMIT 2, -1;
+ERROR 42000: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '-1' at line 1
+DROP TABLE t1;
+CREATE TABLE t1 (
+ID_with_null int NULL,
+ID_better int NOT NULL,
+INDEX idx1 (ID_with_null),
+INDEX idx2 (ID_better)
+);
+INSERT INTO t1 VALUES (1,1), (2,1), (null,3), (null,3), (null,3), (null,3);
+INSERT INTO t1 SELECT * FROM t1 WHERE ID_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID_with_null IS NULL;
+SELECT COUNT(*) FROM t1 WHERE ID_with_null IS NULL;
+COUNT(*)
+128
+SELECT COUNT(*) FROM t1 WHERE ID_better=1;
+COUNT(*)
+2
+EXPLAIN SELECT * FROM t1 WHERE ID_better=1 AND ID_with_null IS NULL;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+DROP INDEX idx1 ON t1;
+CREATE UNIQUE INDEX idx1 ON t1(ID_with_null);
+EXPLAIN SELECT * FROM t1 WHERE ID_better=1 AND ID_with_null IS NULL;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+DROP TABLE t1;
+CREATE TABLE t1 (
+ID1_with_null int NULL,
+ID2_with_null int NULL,
+ID_better int NOT NULL,
+INDEX idx1 (ID1_with_null, ID2_with_null),
+INDEX idx2 (ID_better)
+);
+INSERT INTO t1 VALUES (1,1,1), (2,2,1), (3,null,3), (null,3,3), (null,null,3),
+(3,null,3), (null,3,3), (null,null,3), (3,null,3), (null,3,3), (null,null,3);
+INSERT INTO t1 SELECT * FROM t1 WHERE ID1_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID2_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID1_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID2_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID1_with_null IS NULL;
+INSERT INTO t1 SELECT * FROM t1 WHERE ID2_with_null IS NULL;
+SELECT COUNT(*) FROM t1 WHERE ID1_with_null IS NULL AND ID2_with_null=3;
+COUNT(*)
+24
+SELECT COUNT(*) FROM t1 WHERE ID1_with_null=3 AND ID2_with_null IS NULL;
+COUNT(*)
+24
+SELECT COUNT(*) FROM t1 WHERE ID1_with_null IS NULL AND ID2_with_null IS NULL;
+COUNT(*)
+192
+SELECT COUNT(*) FROM t1 WHERE ID_better=1;
+COUNT(*)
+2
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null IS NULL AND ID2_with_null=3 ;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null=3 AND ID2_with_null=3 IS NULL ;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null IS NULL AND ID2_with_null IS NULL;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+DROP INDEX idx1 ON t1;
+CREATE UNIQUE INDEX idx1 ON t1(ID1_with_null,ID2_with_null);
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null IS NULL AND ID2_with_null=3 ;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null=3 AND ID2_with_null IS NULL ;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null IS NULL AND ID2_with_null IS NULL;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+EXPLAIN SELECT * FROM t1
+WHERE ID_better=1 AND ID1_with_null IS NULL AND
+(ID2_with_null=1 OR ID2_with_null=2);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ref idx1,idx2 idx2 4 const 1 Using where
+DROP TABLE t1;
+CREATE TABLE t1 (a INT, ts TIMESTAMP, KEY ts(ts));
+INSERT INTO t1 VALUES (30,"2006-01-03 23:00:00"), (31,"2006-01-03 23:00:00");
+ANALYZE TABLE t1;
+Table Op Msg_type Msg_text
+test.t1 analyze status OK
+CREATE TABLE t2 (a INT, dt1 DATETIME, dt2 DATETIME, PRIMARY KEY (a));
+INSERT INTO t2 VALUES (30, "2006-01-01 00:00:00", "2999-12-31 00:00:00");
+INSERT INTO t2 SELECT a+1,dt1,dt2 FROM t2;
+ANALYZE TABLE t2;
+Table Op Msg_type Msg_text
+test.t2 analyze status OK
+EXPLAIN
+SELECT * FROM t1 LEFT JOIN t2 ON (t1.a=t2.a) WHERE t1.a=30
+AND t1.ts BETWEEN t2.dt1 AND t2.dt2
+AND t1.ts BETWEEN "2006-01-01" AND "2006-12-31";
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 const PRIMARY PRIMARY 4 const 1
+1 SIMPLE t1 range ts ts 4 NULL 1 Using where
+Warnings:
+Warning 1292 Incorrect datetime value: '2999-12-31 00:00:00' for column 'ts' at row 1
+SELECT * FROM t1 LEFT JOIN t2 ON (t1.a=t2.a) WHERE t1.a=30
+AND t1.ts BETWEEN t2.dt1 AND t2.dt2
+AND t1.ts BETWEEN "2006-01-01" AND "2006-12-31";
+a ts a dt1 dt2
+30 2006-01-03 23:00:00 30 2006-01-01 00:00:00 2999-12-31 00:00:00
+Warnings:
+Warning 1292 Incorrect datetime value: '2999-12-31 00:00:00' for column 'ts' at row 1
+DROP TABLE t1,t2;
+create table t1 (a bigint unsigned);
+insert into t1 values
+(if(1, 9223372036854775808, 1)),
+(case when 1 then 9223372036854775808 else 1 end),
+(coalesce(9223372036854775808, 1));
+select * from t1;
+a
+9223372036854775808
+9223372036854775808
+9223372036854775808
+drop table t1;
+create table t1 select
+if(1, 9223372036854775808, 1) i,
+case when 1 then 9223372036854775808 else 1 end c,
+coalesce(9223372036854775808, 1) co;
+show create table t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `i` decimal(19,0) NOT NULL DEFAULT '0',
+ `c` decimal(19,0) NOT NULL DEFAULT '0',
+ `co` decimal(19,0) NOT NULL DEFAULT '0'
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+drop table t1;
+select
+if(1, cast(1111111111111111111 as unsigned), 1) i,
+case when 1 then cast(1111111111111111111 as unsigned) else 1 end c,
+coalesce(cast(1111111111111111111 as unsigned), 1) co;
+i c co
+1111111111111111111 1111111111111111111 1111111111111111111
+CREATE TABLE t1 (name varchar(255));
+CREATE TABLE t2 (name varchar(255), n int, KEY (name(3)));
+INSERT INTO t1 VALUES ('ccc'), ('bb'), ('cc '), ('aa '), ('aa');
+INSERT INTO t2 VALUES ('bb',1), ('aa',2), ('cc ',3);
+INSERT INTO t2 VALUES (concat('cc ', 0x06), 4);
+INSERT INTO t2 VALUES ('cc',5), ('bb ',6), ('cc ',7);
+SELECT * FROM t2;
+name n
+bb 1
+aa 2
+cc 3
+cc 4
+cc 5
+bb 6
+cc 7
+SELECT * FROM t2 ORDER BY name;
+name n
+aa 2
+bb 1
+bb 6
+cc 4
+cc 3
+cc 5
+cc 7
+SELECT name, LENGTH(name), n FROM t2 ORDER BY name;
+name LENGTH(name) n
+aa 2 2
+bb 2 1
+bb 3 6
+cc 4 4
+cc 5 3
+cc 2 5
+cc 3 7
+EXPLAIN SELECT name, LENGTH(name), n FROM t2 WHERE name='cc ';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref name name 6 const 3 Using where
+SELECT name, LENGTH(name), n FROM t2 WHERE name='cc ';
+name LENGTH(name) n
+cc 5 3
+cc 2 5
+cc 3 7
+EXPLAIN SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range name name 6 NULL 3 Using where
+SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%';
+name LENGTH(name) n
+cc 5 3
+cc 4 4
+cc 2 5
+cc 3 7
+EXPLAIN SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%' ORDER BY name;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range name name 6 NULL 3 Using where; Using filesort
+SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%' ORDER BY name;
+name LENGTH(name) n
+cc 4 4
+cc 5 3
+cc 2 5
+cc 3 7
+EXPLAIN SELECT * FROM t1 LEFT JOIN t2 ON t1.name=t2.name;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5
+1 SIMPLE t2 ref name name 6 test.t1.name 2
+SELECT * FROM t1 LEFT JOIN t2 ON t1.name=t2.name;
+name name n
+ccc NULL NULL
+bb bb 1
+bb bb 6
+cc cc 3
+cc cc 5
+cc cc 7
+aa aa 2
+aa aa 2
+DROP TABLE t1,t2;
+CREATE TABLE t1 (name text);
+CREATE TABLE t2 (name text, n int, KEY (name(3)));
+INSERT INTO t1 VALUES ('ccc'), ('bb'), ('cc '), ('aa '), ('aa');
+INSERT INTO t2 VALUES ('bb',1), ('aa',2), ('cc ',3);
+INSERT INTO t2 VALUES (concat('cc ', 0x06), 4);
+INSERT INTO t2 VALUES ('cc',5), ('bb ',6), ('cc ',7);
+SELECT * FROM t2;
+name n
+bb 1
+aa 2
+cc 3
+cc 4
+cc 5
+bb 6
+cc 7
+SELECT * FROM t2 ORDER BY name;
+name n
+aa 2
+bb 1
+bb 6
+cc 4
+cc 3
+cc 5
+cc 7
+SELECT name, LENGTH(name), n FROM t2 ORDER BY name;
+name LENGTH(name) n
+aa 2 2
+bb 2 1
+bb 3 6
+cc 4 4
+cc 5 3
+cc 2 5
+cc 3 7
+EXPLAIN SELECT name, LENGTH(name), n FROM t2 WHERE name='cc ';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 ref name name 6 const 3 Using where
+SELECT name, LENGTH(name), n FROM t2 WHERE name='cc ';
+name LENGTH(name) n
+cc 5 3
+cc 2 5
+cc 3 7
+EXPLAIN SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%';
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range name name 6 NULL 3 Using where
+SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%';
+name LENGTH(name) n
+cc 5 3
+cc 4 4
+cc 2 5
+cc 3 7
+EXPLAIN SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%' ORDER BY name;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t2 range name name 6 NULL 3 Using where; Using filesort
+SELECT name , LENGTH(name), n FROM t2 WHERE name LIKE 'cc%' ORDER BY name;
+name LENGTH(name) n
+cc 4 4
+cc 5 3
+cc 2 5
+cc 3 7
+EXPLAIN SELECT * FROM t1 LEFT JOIN t2 ON t1.name=t2.name;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 5
+1 SIMPLE t2 ref name name 6 test.t1.name 2
+SELECT * FROM t1 LEFT JOIN t2 ON t1.name=t2.name;
+name name n
+ccc NULL NULL
+bb bb 1
+bb bb 6
+cc cc 3
+cc cc 5
+cc cc 7
+aa aa 2
+aa aa 2
+DROP TABLE t1,t2;
+CREATE TABLE t1 (
+access_id int NOT NULL default '0',
+name varchar(20) default NULL,
+rank int NOT NULL default '0',
+KEY idx (access_id)
+);
+CREATE TABLE t2 (
+faq_group_id int NOT NULL default '0',
+faq_id int NOT NULL default '0',
+access_id int default NULL,
+UNIQUE KEY idx1 (faq_id),
+KEY idx2 (faq_group_id,faq_id)
+);
+INSERT INTO t1 VALUES
+(1,'Everyone',2),(2,'Help',3),(3,'Technical Support',1),(4,'Chat User',4);
+INSERT INTO t2 VALUES
+(261,265,1),(490,494,1);
+SELECT t2.faq_id
+FROM t1 INNER JOIN t2 IGNORE INDEX (idx1)
+ON (t1.access_id = t2.access_id)
+LEFT JOIN t2 t
+ON (t.faq_group_id = t2.faq_group_id AND
+find_in_set(t.access_id, '1,4') < find_in_set(t2.access_id, '1,4'))
+WHERE
+t2.access_id IN (1,4) AND t.access_id IS NULL AND t2.faq_id in (265);
+faq_id
+265
+SELECT t2.faq_id
+FROM t1 INNER JOIN t2
+ON (t1.access_id = t2.access_id)
+LEFT JOIN t2 t
+ON (t.faq_group_id = t2.faq_group_id AND
+find_in_set(t.access_id, '1,4') < find_in_set(t2.access_id, '1,4'))
+WHERE
+t2.access_id IN (1,4) AND t.access_id IS NULL AND t2.faq_id in (265);
+faq_id
+265
+DROP TABLE t1,t2;
+CREATE TABLE t1 (a INT, b INT, KEY inx (b,a));
+INSERT INTO t1 VALUES (1,1), (1,2), (1,3), (1,4), (1,5), (1, 6), (1,7);
+EXPLAIN SELECT COUNT(*) FROM t1 f1 INNER JOIN t1 f2
+ON ( f1.b=f2.b AND f1.a<f2.a )
+WHERE 1 AND f1.b NOT IN (100,2232,3343,51111);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE f1 index inx inx 10 NULL 7 Using where; Using index
+1 SIMPLE f2 ref inx inx 5 test.f1.b 1 Using where; Using index
+DROP TABLE t1;
+CREATE TABLE t1 (c1 INT, c2 INT);
+INSERT INTO t1 VALUES (1,11), (2,22), (2,22);
+EXPLAIN SELECT c1 FROM t1 WHERE (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT COUNT(c2)))))))))))))))))))))))))))))))) > 0;
+id select_type table type possible_keys key key_len ref rows Extra
+1 PRIMARY t1 ALL NULL NULL NULL NULL 3 Using where
+31 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL No tables used
+32 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL No tables used
+EXPLAIN SELECT c1 FROM t1 WHERE (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT (SELECT COUNT(c2))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))) > 0;
+ERROR HY000: Too high level of nesting for select
+DROP TABLE t1;
+CREATE TABLE t1 (
+c1 int(11) NOT NULL AUTO_INCREMENT,
+c2 varchar(1000) DEFAULT NULL,
+c3 bigint(20) DEFAULT NULL,
+c4 bigint(20) DEFAULT NULL,
+PRIMARY KEY (c1)
+);
+EXPLAIN EXTENDED
+SELECT join_2.c1
+FROM
+t1 AS join_0,
+t1 AS join_1,
+t1 AS join_2,
+t1 AS join_3,
+t1 AS join_4,
+t1 AS join_5,
+t1 AS join_6,
+t1 AS join_7
+WHERE
+join_0.c1=join_1.c1 AND
+join_1.c1=join_2.c1 AND
+join_2.c1=join_3.c1 AND
+join_3.c1=join_4.c1 AND
+join_4.c1=join_5.c1 AND
+join_5.c1=join_6.c1 AND
+join_6.c1=join_7.c1
+OR
+join_0.c2 < '?' AND
+join_1.c2 < '?' AND
+join_2.c2 > '?' AND
+join_2.c2 < '!' AND
+join_3.c2 > '?' AND
+join_4.c2 = '?' AND
+join_5.c2 <> '?' AND
+join_6.c2 <> '?' AND
+join_7.c2 >= '?' AND
+join_0.c1=join_1.c1 AND
+join_1.c1=join_2.c1 AND
+join_2.c1=join_3.c1 AND
+join_3.c1=join_4.c1 AND
+join_4.c1=join_5.c1 AND
+join_5.c1=join_6.c1 AND
+join_6.c1=join_7.c1
+GROUP BY
+join_3.c1,
+join_2.c1,
+join_7.c1,
+join_1.c1,
+join_0.c1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
+Warnings:
+Note 1003 select '0' AS `c1` from `test`.`t1` `join_0` join `test`.`t1` `join_1` join `test`.`t1` `join_2` join `test`.`t1` `join_3` join `test`.`t1` `join_4` join `test`.`t1` `join_5` join `test`.`t1` `join_6` join `test`.`t1` `join_7` where 0 group by '0','0','0','0','0'
+SHOW WARNINGS;
+Level Code Message
+Note 1003 select '0' AS `c1` from `test`.`t1` `join_0` join `test`.`t1` `join_1` join `test`.`t1` `join_2` join `test`.`t1` `join_3` join `test`.`t1` `join_4` join `test`.`t1` `join_5` join `test`.`t1` `join_6` join `test`.`t1` `join_7` where 0 group by '0','0','0','0','0'
+DROP TABLE t1;
+SELECT 1 AS ` `;
+
+1
+Warnings:
+Warning 1474 Name ' ' has become ''
+SELECT 1 AS ` `;
+
+1
+Warnings:
+Warning 1474 Name ' ' has become ''
+SELECT 1 AS ` x`;
+x
+1
+Warnings:
+Warning 1466 Leading spaces are removed from name ' x'
+CREATE VIEW v1 AS SELECT 1 AS ``;
+ERROR 42000: Incorrect column name ''
+CREATE VIEW v1 AS SELECT 1 AS ` `;
+ERROR 42000: Incorrect column name ' '
+CREATE VIEW v1 AS SELECT 1 AS ` `;
+ERROR 42000: Incorrect column name ' '
+CREATE VIEW v1 AS SELECT (SELECT 1 AS ` `);
+ERROR 42000: Incorrect column name ' '
+CREATE VIEW v1 AS SELECT 1 AS ` x`;
+Warnings:
+Warning 1466 Leading spaces are removed from name ' x'
+SELECT `x` FROM v1;
+x
+1
+ALTER VIEW v1 AS SELECT 1 AS ` `;
+ERROR 42000: Incorrect column name ' '
+DROP VIEW v1;
+select str_to_date('2007-10-09','%Y-%m-%d') between '2007/10/01 00:00:00 GMT'
+ and '2007/10/20 00:00:00 GMT';
+str_to_date('2007-10-09','%Y-%m-%d') between '2007/10/01 00:00:00 GMT'
+ and '2007/10/20 00:00:00 GMT'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007/10/01 00:00:00 GMT'
+Warning 1292 Truncated incorrect datetime value: '2007/10/20 00:00:00 GMT'
+select str_to_date('2007-10-09','%Y-%m-%d') > '2007/10/01 00:00:00 GMT-6';
+str_to_date('2007-10-09','%Y-%m-%d') > '2007/10/01 00:00:00 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect date value: '2007/10/01 00:00:00 GMT-6'
+select str_to_date('2007-10-09','%Y-%m-%d') <= '2007/10/2000:00:00 GMT-6';
+str_to_date('2007-10-09','%Y-%m-%d') <= '2007/10/2000:00:00 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect date value: '2007/10/2000:00:00 GMT-6'
+select str_to_date('2007-10-01','%Y-%m-%d') = '2007-10-1 00:00:00 GMT-6';
+str_to_date('2007-10-01','%Y-%m-%d') = '2007-10-1 00:00:00 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect date value: '2007-10-1 00:00:00 GMT-6'
+select str_to_date('2007-10-01','%Y-%m-%d') = '2007-10-01 x00:00:00 GMT-6';
+str_to_date('2007-10-01','%Y-%m-%d') = '2007-10-01 x00:00:00 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect date value: '2007-10-01 x00:00:00 GMT-6'
+select str_to_date('2007-10-01','%Y-%m-%d %H:%i:%s') = '2007-10-01 00:00:00 GMT-6';
+str_to_date('2007-10-01','%Y-%m-%d %H:%i:%s') = '2007-10-01 00:00:00 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-01 00:00:00 GMT-6'
+select str_to_date('2007-10-01','%Y-%m-%d %H:%i:%s') = '2007-10-01 00:x00:00 GMT-6';
+str_to_date('2007-10-01','%Y-%m-%d %H:%i:%s') = '2007-10-01 00:x00:00 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-01 00:x00:00 GMT-6'
+select str_to_date('2007-10-01','%Y-%m-%d %H:%i:%s') = '2007-10-01 x12:34:56 GMT-6';
+str_to_date('2007-10-01','%Y-%m-%d %H:%i:%s') = '2007-10-01 x12:34:56 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-01 x12:34:56 GMT-6'
+select str_to_date('2007-10-01 12:34:00','%Y-%m-%d %H:%i:%s') = '2007-10-01 12:34x:56 GMT-6';
+str_to_date('2007-10-01 12:34:00','%Y-%m-%d %H:%i:%s') = '2007-10-01 12:34x:56 GMT-6'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-01 12:34x:56 GMT-6'
+select str_to_date('2007-10-01 12:34:56','%Y-%m-%d %H:%i:%s') = '2007-10-01 12:34x:56 GMT-6';
+str_to_date('2007-10-01 12:34:56','%Y-%m-%d %H:%i:%s') = '2007-10-01 12:34x:56 GMT-6'
+0
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-01 12:34x:56 GMT-6'
+select str_to_date('2007-10-01 12:34:56','%Y-%m-%d %H:%i:%s') = '2007-10-01 12:34:56';
+str_to_date('2007-10-01 12:34:56','%Y-%m-%d %H:%i:%s') = '2007-10-01 12:34:56'
+1
+select str_to_date('2007-10-01','%Y-%m-%d') = '2007-10-01 12:00:00';
+str_to_date('2007-10-01','%Y-%m-%d') = '2007-10-01 12:00:00'
+0
+select str_to_date('2007-10-01 12','%Y-%m-%d %H') = '2007-10-01 12:00:00';
+str_to_date('2007-10-01 12','%Y-%m-%d %H') = '2007-10-01 12:00:00'
+1
+select str_to_date('2007-10-01 12:34','%Y-%m-%d %H') = '2007-10-01 12:00:00';
+str_to_date('2007-10-01 12:34','%Y-%m-%d %H') = '2007-10-01 12:00:00'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-01 12:34'
+select str_to_date('2007-02-30 12:34','%Y-%m-%d %H:%i') = '2007-02-30 12:34';
+str_to_date('2007-02-30 12:34','%Y-%m-%d %H:%i') = '2007-02-30 12:34'
+1
+select str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '2007-10-00 12:34';
+str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '2007-10-00 12:34'
+1
+select str_to_date('2007-10-00','%Y-%m-%d') between '2007/09/01 00:00:00'
+ and '2007/10/20 00:00:00';
+str_to_date('2007-10-00','%Y-%m-%d') between '2007/09/01 00:00:00'
+ and '2007/10/20 00:00:00'
+1
+set SQL_MODE=TRADITIONAL;
+select str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '2007-10-00 12:34';
+str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '2007-10-00 12:34'
+0
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-00 12:34'
+select str_to_date('2007-10-01 12:34','%Y-%m-%d %H:%i') = '2007-10-00 12:34';
+str_to_date('2007-10-01 12:34','%Y-%m-%d %H:%i') = '2007-10-00 12:34'
+0
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-00 12:34'
+select str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '2007-10-01 12:34';
+str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '2007-10-01 12:34'
+0
+Warnings:
+Warning 1292 Truncated incorrect datetime value: '2007-10-00 12:34:00'
+select str_to_date('2007-10-00','%Y-%m-%d') between '2007/09/01'
+ and '2007/10/20';
+str_to_date('2007-10-00','%Y-%m-%d') between '2007/09/01'
+ and '2007/10/20'
+0
+Warnings:
+Warning 1292 Incorrect datetime value: '2007-10-00' for column '2007/09/01' at row 1
+Warning 1292 Incorrect datetime value: '2007-10-00' for column '2007/10/20' at row 1
+set SQL_MODE=DEFAULT;
+select str_to_date('2007-10-00','%Y-%m-%d') between '' and '2007/10/20';
+str_to_date('2007-10-00','%Y-%m-%d') between '' and '2007/10/20'
+1
+Warnings:
+Warning 1292 Truncated incorrect datetime value: ''
+select str_to_date('','%Y-%m-%d') between '2007/10/01' and '2007/10/20';
+str_to_date('','%Y-%m-%d') between '2007/10/01' and '2007/10/20'
+0
+select str_to_date('','%Y-%m-%d %H:%i') = '2007-10-01 12:34';
+str_to_date('','%Y-%m-%d %H:%i') = '2007-10-01 12:34'
+0
+select str_to_date(NULL,'%Y-%m-%d %H:%i') = '2007-10-01 12:34';
+str_to_date(NULL,'%Y-%m-%d %H:%i') = '2007-10-01 12:34'
+NULL
+select str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = '';
+str_to_date('2007-10-00 12:34','%Y-%m-%d %H:%i') = ''
+0
+Warnings:
+Warning 1292 Truncated incorrect datetime value: ''
+select str_to_date('1','%Y-%m-%d') = '1';
+str_to_date('1','%Y-%m-%d') = '1'
+0
+Warnings:
+Warning 1292 Truncated incorrect date value: '1'
+select str_to_date('1','%Y-%m-%d') = '1';
+str_to_date('1','%Y-%m-%d') = '1'
+0
+Warnings:
+Warning 1292 Truncated incorrect date value: '1'
+select str_to_date('','%Y-%m-%d') = '';
+str_to_date('','%Y-%m-%d') = ''
+0
+Warnings:
+Warning 1292 Truncated incorrect date value: ''
+select str_to_date('1000-01-01','%Y-%m-%d') between '0000-00-00' and NULL;
+str_to_date('1000-01-01','%Y-%m-%d') between '0000-00-00' and NULL
+0
+select str_to_date('1000-01-01','%Y-%m-%d') between NULL and '2000-00-00';
+str_to_date('1000-01-01','%Y-%m-%d') between NULL and '2000-00-00'
+0
+select str_to_date('1000-01-01','%Y-%m-%d') between NULL and NULL;
+str_to_date('1000-01-01','%Y-%m-%d') between NULL and NULL
+0
+CREATE TABLE t1 (c11 INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY);
+CREATE TABLE t2 (c21 INT UNSIGNED NOT NULL,
+c22 INT DEFAULT NULL,
+KEY(c21, c22));
+CREATE TABLE t3 (c31 INT UNSIGNED NOT NULL DEFAULT 0,
+c32 INT DEFAULT NULL,
+c33 INT NOT NULL,
+c34 INT UNSIGNED DEFAULT 0,
+KEY (c33, c34, c32));
+INSERT INTO t1 values (),(),(),(),();
+INSERT INTO t2 SELECT a.c11, b.c11 FROM t1 a, t1 b;
+INSERT INTO t3 VALUES (1, 1, 1, 0),
+(2, 2, 0, 0),
+(3, 3, 1, 0),
+(4, 4, 0, 0),
+(5, 5, 1, 0);
+SELECT c32 FROM t1, t2, t3 WHERE t1.c11 IN (1, 3, 5) AND
+t3.c31 = t1.c11 AND t2.c21 = t1.c11 AND
+t3.c33 = 1 AND t2.c22 in (1, 3)
+ORDER BY c32;
+c32
+1
+1
+3
+3
+5
+5
+SELECT c32 FROM t1, t2, t3 WHERE t1.c11 IN (1, 3, 5) AND
+t3.c31 = t1.c11 AND t2.c21 = t1.c11 AND
+t3.c33 = 1 AND t2.c22 in (1, 3)
+ORDER BY c32 DESC;
+c32
+5
+5
+3
+3
+1
+1
+DROP TABLE t1, t2, t3;
+
+#
+# Bug#30736: Row Size Too Large Error Creating a Table and
+# Inserting Data.
+#
+DROP TABLE IF EXISTS t1;
+DROP TABLE IF EXISTS t2;
+
+CREATE TABLE t1(
+c1 DECIMAL(10, 2),
+c2 FLOAT);
+
+INSERT INTO t1 VALUES (0, 1), (2, 3), (4, 5);
+
+CREATE TABLE t2(
+c3 DECIMAL(10, 2))
+SELECT
+c1 * c2 AS c3
+FROM t1;
+
+SELECT * FROM t1;
+c1 c2
+0.00 1
+2.00 3
+4.00 5
+
+SELECT * FROM t2;
+c3
+0.00
+6.00
+20.00
+
+DROP TABLE t1;
+DROP TABLE t2;
+
+CREATE TABLE t1 (c1 BIGINT NOT NULL);
+INSERT INTO t1 (c1) VALUES (1);
+SELECT * FROM t1 WHERE c1 > NULL + 1;
+c1
+DROP TABLE t1;
+
+CREATE TABLE t1 (a VARCHAR(10) NOT NULL PRIMARY KEY);
+INSERT INTO t1 (a) VALUES ('foo0'), ('bar0'), ('baz0');
+SELECT * FROM t1 WHERE a IN (CONCAT('foo', 0), 'bar');
+a
+foo0
+DROP TABLE t1;
+CREATE TABLE t1 (a INT, b INT);
+CREATE TABLE t2 (a INT, c INT, KEY(a));
+INSERT INTO t1 VALUES (1, 1), (2, 2);
+INSERT INTO t2 VALUES (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
+(2, 1), (2, 2), (2, 3), (2, 4), (2, 5),
+(3, 1), (3, 2), (3, 3), (3, 4), (3, 5),
+(4, 1), (4, 2), (4, 3), (4, 4), (4, 5);
+FLUSH STATUS;
+SELECT DISTINCT b FROM t1 LEFT JOIN t2 USING(a) WHERE c <= 3;
+b
+1
+2
+SHOW STATUS LIKE 'Handler_read%';
+Variable_name Value
+Handler_read_first 0
+Handler_read_key 2
+Handler_read_next 0
+Handler_read_prev 0
+Handler_read_rnd 0
+Handler_read_rnd_next 6
+DROP TABLE t1, t2;
+CREATE TABLE t1 (f1 bigint(20) NOT NULL default '0',
+f2 int(11) NOT NULL default '0',
+f3 bigint(20) NOT NULL default '0',
+f4 varchar(255) NOT NULL default '',
+PRIMARY KEY (f1),
+KEY key1 (f4),
+KEY key2 (f2));
+CREATE TABLE t2 (f1 int(11) NOT NULL default '0',
+f2 enum('A1','A2','A3') NOT NULL default 'A1',
+f3 int(11) NOT NULL default '0',
+PRIMARY KEY (f1),
+KEY key1 (f3));
+CREATE TABLE t3 (f1 bigint(20) NOT NULL default '0',
+f2 datetime NOT NULL default '1980-01-01 00:00:00',
+PRIMARY KEY (f1));
+insert into t1 values (1, 1, 1, 'abc');
+insert into t1 values (2, 1, 2, 'def');
+insert into t1 values (3, 1, 2, 'def');
+insert into t2 values (1, 'A1', 1);
+insert into t3 values (1, '1980-01-01');
+SELECT a.f3, cr.f4, count(*) count
+FROM t2 a
+STRAIGHT_JOIN t1 cr ON cr.f2 = a.f1
+LEFT JOIN
+(t1 cr2
+JOIN t3 ae2 ON cr2.f3 = ae2.f1
+) ON a.f1 = cr2.f2 AND ae2.f2 < now() - INTERVAL 7 DAY AND
+cr.f4 = cr2.f4
+GROUP BY a.f3, cr.f4;
+f3 f4 count
+1 abc 1
+1 def 2
+drop table t1, t2, t3;
+CREATE TABLE t1 (a INT KEY, b INT);
+INSERT INTO t1 VALUES (1,1), (2,2), (3,3), (4,4);
+EXPLAIN EXTENDED SELECT a, b FROM t1 WHERE a > 1 AND a = b LIMIT 2;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 range PRIMARY PRIMARY 4 NULL 3 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` where ((`test`.`t1`.`b` = `test`.`t1`.`a`) and (`test`.`t1`.`a` > 1)) limit 2
+EXPLAIN EXTENDED SELECT a, b FROM t1 WHERE a > 1 AND b = a LIMIT 2;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 range PRIMARY PRIMARY 4 NULL 3 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` where ((`test`.`t1`.`a` = `test`.`t1`.`b`) and (`test`.`t1`.`a` > 1)) limit 2
+DROP TABLE t1;
+#
+# Bug#47019: Assertion failed: 0, file .\rt_mbr.c, line 138 when
+# forcing a spatial index
+#
+CREATE TABLE t1(a LINESTRING NOT NULL, SPATIAL KEY(a));
+INSERT INTO t1 VALUES
+(GEOMFROMTEXT('LINESTRING(-1 -1, 1 -1, -1 -1, -1 1, 1 1)')),
+(GEOMFROMTEXT('LINESTRING(-1 -1, 1 -1, -1 -1, -1 1, 1 1)'));
+EXPLAIN SELECT 1 FROM t1 NATURAL LEFT JOIN t1 AS t2;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2
+1 SIMPLE t2 ALL a NULL NULL NULL 2
+SELECT 1 FROM t1 NATURAL LEFT JOIN t1 AS t2;
+1
+1
+1
+1
+1
+EXPLAIN SELECT 1 FROM t1 NATURAL LEFT JOIN t1 AS t2 FORCE INDEX(a);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2
+1 SIMPLE t2 ALL a NULL NULL NULL 2
+SELECT 1 FROM t1 NATURAL LEFT JOIN t1 AS t2 FORCE INDEX(a);
+1
+1
+1
+1
+1
+DROP TABLE t1;
+#
+# Bug #48291 : crash with row() operator,select into @var, and
+# subquery returning multiple rows
+#
+CREATE TABLE t1(a INT);
+INSERT INTO t1 VALUES (2),(3);
+# Should not crash
+SELECT 1 FROM t1 WHERE a <> 1 AND NOT
+ROW(a,a) <=> ROW((SELECT 1 FROM t1 WHERE 1=2),(SELECT 1 FROM t1))
+INTO @var0;
+ERROR 21000: Subquery returns more than 1 row
+DROP TABLE t1;
+#
+# Bug #48458: simple query tries to allocate enormous amount of
+# memory
+#
+CREATE TABLE t1(a INT NOT NULL, b YEAR);
+INSERT INTO t1 VALUES ();
+Warnings:
+Warning 1364 Field 'a' doesn't have a default value
+CREATE TABLE t2(c INT);
+# Should not err out because of out-of-memory
+SELECT 1 FROM t2 JOIN t1 ON 1=1
+WHERE a != '1' AND NOT a >= b OR NOT ROW(b,a )<> ROW(a,a);
+1
+DROP TABLE t1,t2;
+End of 5.0 tests
+create table t1(a INT, KEY (a));
+INSERT INTO t1 VALUES (1),(2),(3),(4),(5);
+SELECT a FROM t1 ORDER BY a LIMIT 2;
+a
+1
+2
+SELECT a FROM t1 ORDER BY a LIMIT 2,4294967296;
+a
+3
+4
+5
+SELECT a FROM t1 ORDER BY a LIMIT 2,4294967297;
+a
+3
+4
+5
+DROP TABLE t1;
+CREATE TABLE A (date_key date);
+CREATE TABLE C (
+pk int,
+int_nokey int,
+int_key int,
+date_key date NOT NULL,
+date_nokey date,
+varchar_key varchar(1)
+);
+INSERT INTO C VALUES
+(1,1,1,'0000-00-00',NULL,NULL),
+(1,1,1,'0000-00-00',NULL,NULL);
+SELECT 1 FROM C WHERE pk > ANY (SELECT 1 FROM C);
+1
+SELECT COUNT(DISTINCT 1) FROM C
+WHERE date_key = (SELECT 1 FROM A WHERE C.date_key IS NULL) GROUP BY pk;
+COUNT(DISTINCT 1)
+SELECT date_nokey FROM C
+WHERE int_key IN (SELECT 1 FROM A)
+HAVING date_nokey = '10:41:7'
+ORDER BY date_key;
+date_nokey
+Warnings:
+Warning 1292 Incorrect date value: '10:41:7' for column 'date_nokey' at row 1
+DROP TABLE A,C;
+CREATE TABLE t1 (a INT NOT NULL, b INT);
+INSERT INTO t1 VALUES (1, 1);
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE (a=a AND a=a) OR b > 2;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 system NULL NULL NULL NULL 1 100.00
+Warnings:
+Note 1003 select '1' AS `a`,'1' AS `b` from `test`.`t1` where 1
+SELECT * FROM t1 WHERE (a=a AND a=a) OR b > 2;
+a b
+1 1
+DROP TABLE t1;
+CREATE TABLE t1 (a INT NOT NULL, b INT NOT NULL, c INT NOT NULL);
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE (a=a AND b=b AND c=c) OR b > 20;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 system NULL NULL NULL NULL 0 0.00 const row not found
+Warnings:
+Note 1003 select '0' AS `a`,'0' AS `b`,'0' AS `c` from `test`.`t1` where 1
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE (a=a AND a=a AND b=b) OR b > 20;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 system NULL NULL NULL NULL 0 0.00 const row not found
+Warnings:
+Note 1003 select '0' AS `a`,'0' AS `b`,'0' AS `c` from `test`.`t1` where 1
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE (a=a AND b=b AND a=a) OR b > 20;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 system NULL NULL NULL NULL 0 0.00 const row not found
+Warnings:
+Note 1003 select '0' AS `a`,'0' AS `b`,'0' AS `c` from `test`.`t1` where 1
+DROP TABLE t1;
+#
+# Bug#45266: Uninitialized variable lead to an empty result.
+#
+drop table if exists A,AA,B,BB;
+CREATE TABLE `A` (
+`pk` int(11) NOT NULL AUTO_INCREMENT,
+`date_key` date NOT NULL,
+`date_nokey` date NOT NULL,
+`datetime_key` datetime NOT NULL,
+`int_nokey` int(11) NOT NULL,
+`time_key` time NOT NULL,
+`time_nokey` time NOT NULL,
+PRIMARY KEY (`pk`),
+KEY `date_key` (`date_key`),
+KEY `time_key` (`time_key`),
+KEY `datetime_key` (`datetime_key`)
+);
+CREATE TABLE `AA` (
+`pk` int(11) NOT NULL AUTO_INCREMENT,
+`int_nokey` int(11) NOT NULL,
+`time_key` time NOT NULL,
+KEY `time_key` (`time_key`),
+PRIMARY KEY (`pk`)
+);
+CREATE TABLE `B` (
+`date_nokey` date NOT NULL,
+`date_key` date NOT NULL,
+`time_key` time NOT NULL,
+`datetime_nokey` datetime NOT NULL,
+`varchar_key` varchar(1) NOT NULL,
+KEY `date_key` (`date_key`),
+KEY `time_key` (`time_key`),
+KEY `varchar_key` (`varchar_key`)
+);
+INSERT INTO `B` VALUES ('2003-07-28','2003-07-28','15:13:38','0000-00-00 00:00:00','f'),('0000-00-00','0000-00-00','00:05:48','2004-07-02 14:34:13','x');
+CREATE TABLE `BB` (
+`pk` int(11) NOT NULL AUTO_INCREMENT,
+`int_nokey` int(11) NOT NULL,
+`date_key` date NOT NULL,
+`varchar_nokey` varchar(1) NOT NULL,
+`date_nokey` date NOT NULL,
+PRIMARY KEY (`pk`),
+KEY `date_key` (`date_key`)
+);
+INSERT INTO `BB` VALUES (10,8,'0000-00-00','i','0000-00-00'),(11,0,'2005-08-18','','2005-08-18');
+SELECT table1 . `pk` AS field1
+FROM
+(BB AS table1 INNER JOIN
+(AA AS table2 STRAIGHT_JOIN A AS table3
+ON ( table3 . `date_key` = table2 . `pk` ))
+ON ( table3 . `datetime_key` = table2 . `int_nokey` ))
+WHERE ( table3 . `date_key` <= 4 AND table2 . `pk` = table1 . `varchar_nokey`)
+GROUP BY field1 ;
+field1
+SELECT table3 .`date_key` field1
+FROM
+B table1 LEFT JOIN B table3 JOIN
+(BB table6 JOIN A table7 ON table6 .`varchar_nokey`)
+ON table6 .`int_nokey` ON table6 .`date_key`
+ WHERE NOT ( table1 .`varchar_key` AND table7 .`pk`) GROUP BY field1;
+field1
+NULL
+SELECT table4 . `time_nokey` AS field1 FROM
+(AA AS table1 CROSS JOIN
+(AA AS table2 STRAIGHT_JOIN
+(B AS table3 STRAIGHT_JOIN A AS table4
+ON ( table4 . `date_key` = table3 . `time_key` ))
+ON ( table4 . `pk` = table3 . `date_nokey` ))
+ON ( table4 . `time_key` = table3 . `datetime_nokey` ))
+WHERE ( table4 . `time_key` < table1 . `time_key` AND
+table1 . `int_nokey` != 'f')
+GROUP BY field1 ORDER BY field1 , field1;
+field1
+SELECT table1 .`time_key` field2 FROM B table1 LEFT JOIN BB JOIN A table5 ON table5 .`date_nokey` ON table5 .`int_nokey` GROUP BY field2;
+field2
+00:05:48
+15:13:38
+drop table A,AA,B,BB;
+#end of test for bug#45266
+#
+# BUG#48052: Valgrind warning - uninitialized value in init_read_record()
+#
+CREATE TABLE t1 (
+pk int(11) NOT NULL,
+i int(11) DEFAULT NULL,
+v varchar(1) DEFAULT NULL,
+PRIMARY KEY (pk)
+);
+INSERT INTO t1 VALUES (2,7,'m');
+INSERT INTO t1 VALUES (3,9,'m');
+SELECT v
+FROM t1
+WHERE NOT pk > 0
+HAVING v <= 't'
+ORDER BY pk;
+v
+DROP TABLE t1;
+#
+# Bug#49489 Uninitialized cache led to a wrong result.
+#
+CREATE TABLE t1(c1 DOUBLE(5,4));
+INSERT INTO t1 VALUES (9.1234);
+SELECT * FROM t1 WHERE c1 < 9.12345;
+c1
+9.1234
+DROP TABLE t1;
+# End of test for bug#49489.
+#
+# Bug #49517: Inconsistent behavior while using
+# NULLable BIGINT and INT columns in comparison
+#
+CREATE TABLE t1(a BIGINT UNSIGNED NOT NULL, b BIGINT NULL, c INT NULL);
+INSERT INTO t1 VALUES(105, NULL, NULL);
+SELECT * FROM t1 WHERE b < 102;
+a b c
+SELECT * FROM t1 WHERE c < 102;
+a b c
+SELECT * FROM t1 WHERE 102 < b;
+a b c
+SELECT * FROM t1 WHERE 102 < c;
+a b c
+DROP TABLE t1;
+End of 5.1 tests
=== modified file 'mysql-test/t/key_cache.test'
--- a/mysql-test/t/key_cache.test 2008-03-27 16:43:17 +0000
+++ b/mysql-test/t/key_cache.test 2010-02-16 16:41:11 +0000
@@ -1,11 +1,13 @@
#
-# Test of multiple key caches
+# Test of multiple key caches, simple and partitioned
#
--disable_warnings
drop table if exists t1, t2, t3;
--enable_warnings
-SET @save_key_buffer=@@key_buffer_size;
+SET @save_key_buffer_size=@@key_buffer_size;
+SET @save_key_cache_block_size=@@key_cache_block_size;
+SET @save_key_cache_partitions=@@key_cache_partitions;
SELECT @@key_buffer_size, @@small.key_buffer_size;
@@ -33,7 +35,7 @@ SELECT @@`default`.key_buffer_size;
SELECT @@small.key_buffer_size;
SELECT @@medium.key_buffer_size;
-SET @@global.key_buffer_size=@save_key_buffer;
+SET @@global.key_buffer_size=@save_key_buffer_size;
#
# Errors
@@ -247,3 +249,263 @@ SET GLOBAL key_cache_block_size= @bug284
DROP TABLE t1;
# End of 4.1 tests
+
+#
+# Test cases for partitioned key caches
+#
+
+# Test usage of the KEY_CACHES table from information_schema
+# for a simple key cache
+
+set global key_buffer_size=@save_key_buffer_size;
+set global key_cache_block_size=@save_key_cache_block_size;
+select @@key_buffer_size;
+select @@key_cache_block_size;
+select @@key_cache_partitions;
+
+create table t1 (
+ p int not null auto_increment primary key,
+ a char(10));
+create table t2 (
+ p int not null auto_increment primary key,
+ i int, a char(10), key k1(i), key k2(a));
+
+select @@key_cache_partitions;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+ (3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+select * from t2;
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+
+--replace_result 1808 KEY_BLOCKS_UNUSED 1670 KEY_BLOCKS_UNUSED
+show status like 'key_%';
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+delete from t2 where a='zzzz';
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+delete from t1;
+delete from t2;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+# For the key cache with 2 partitions execute the same sequence of
+# statements as for the simple cache above.
+# The statistical information on the number of i/o requests and
+# the number of reads and writes is expected to be the same.
+
+set global key_cache_partitions=2;
+select @@key_cache_partitions;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+ (3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+select * from t2;
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+
+--replace_result 1808 KEY_BLOCKS_UNUSED 1670 KEY_BLOCKS_UNUSED
+show status like 'key_%';
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+delete from t1;
+delete from t2;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+# Check that we can work with one partition with the same results
+
+set global key_cache_partitions=1;
+select @@key_cache_partitions;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+ (3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+select * from t2;
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+
+--replace_result 1808 KEY_BLOCKS_UNUSED 1670 KEY_BLOCKS_UNUSED
+show status like 'key_%';
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+delete from t1;
+delete from t2;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+flush tables; flush status;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+# Switch back to 2 partitions
+
+set global key_buffer_size=32*1024;
+select @@key_buffer_size;
+set global key_cache_partitions=2;
+select @@key_cache_partitions;
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+insert into t1 values (1, 'qqqq'), (2, 'yyyy');
+insert into t2 values (1, 1, 'qqqq'), (2, 1, 'pppp'),
+ (3, 1, 'yyyy'), (4, 3, 'zzzz');
+select * from t1;
+select * from t2;
+update t1 set p=3 where p=1;
+update t2 set i=2 where i=1;
+
+--replace_column 7 #
+select * from information_schema.key_caches;
+
+# Add more rows to tables t1 and t2
+
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+insert into t1(a) select a from t1;
+
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+insert into t2(i,a) select i,a from t2;
+
+--replace_column 6 # 7 # 10 #
+select * from information_schema.key_caches;
+
+select * from t1 where p between 1010 and 1020 ;
+select * from t2 where p between 1010 and 1020 ;
+--replace_column 6 # 7 # 10 #
+select * from information_schema.key_caches;
+
+flush tables; flush status;
+update t1 set a='zzzz' where a='qqqq';
+update t2 set i=1 where i=2;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+# Now test how we can work with 7 partitions
+
+set global keycache1.key_buffer_size=256*1024;
+select @@keycache1.key_buffer_size;
+set global keycache1.key_cache_partitions=7;
+select @@keycache1.key_cache_partitions;
+
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+--replace_column 7 #
+select * from information_schema.key_caches where key_cache_name like "key%";
+
+cache index t1 key (`primary`) in keycache1;
+
+explain select p from t1 where p between 1010 and 1020;
+select p from t1 where p between 1010 and 1020;
+explain select i from t2 where p between 1010 and 1020;
+select i from t2 where p between 1010 and 1020;
+explain select count(*) from t1, t2 where t1.p = t2.i;
+select count(*) from t1, t2 where t1.p = t2.i;
+
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+--replace_column 7 #
+select * from information_schema.key_caches where key_cache_name like "key%";
+
+cache index t2 in keycache1;
+update t2 set p=p+3000, i=2 where a='qqqq';
+--replace_column 7 #
+select * from information_schema.key_caches where key_cache_name like "key%";
+
+set global keycache2.key_buffer_size=1024*1024;
+cache index t2 in keycache2;
+insert into t2 values (2000, 3, 'yyyy');
+--replace_column 7 #
+select * from information_schema.key_caches where key_cache_name like "keycache2";
+--replace_column 7 #
+select * from information_schema.key_caches where key_cache_name like "key%";
+
+cache index t2 in keycache1;
+update t2 set p=p+5000 where a='zzzz';
+select * from t2 where p between 1010 and 1020;
+explain select p from t2 where p between 1010 and 1020;
+select p from t2 where p between 1010 and 1020;
+explain select i from t2 where a='yyyy' and i=3;
+select i from t2 where a='yyyy' and i=3;
+explain select a from t2 where a='yyyy' and i=3;
+select a from t2 where a='yyyy' and i=3 ;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_cache_block_size=2*1024;
+insert into t2 values (7000, 3, 'yyyy');
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_cache_block_size=8*1024;
+insert into t2 values (8000, 3, 'yyyy');
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_buffer_size=64*1024;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_cache_block_size=2*1024;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_cache_block_size=8*1024;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_buffer_size=0;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_cache_block_size=8*1024;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_buffer_size=0;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_buffer_size=128*1024;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+set global keycache1.key_cache_block_size=1024;
+--replace_column 6 # 7 #
+select * from information_schema.key_caches;
+
+drop table t1,t2;
+
+set global keycache1.key_buffer_size=0;
+set global keycache2.key_buffer_size=0;
+
+set global key_buffer_size=@save_key_buffer_size;
+set global key_cache_partitions=@save_key_cache_partitions;
+
+#End of 5.1 tests
=== added file 'mysql-test/t/select_pkeycache-master.opt'
--- a/mysql-test/t/select_pkeycache-master.opt 1970-01-01 00:00:00 +0000
+++ b/mysql-test/t/select_pkeycache-master.opt 2010-02-16 16:41:11 +0000
@@ -0,0 +1 @@
+--key_cache_partitions=7
=== added file 'mysql-test/t/select_pkeycache.test'
--- a/mysql-test/t/select_pkeycache.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/t/select_pkeycache.test 2010-02-16 16:41:11 +0000
@@ -0,0 +1,8 @@
+
+#
+# Run select.test with a partitioned default key cache (with 7 partitions)
+# (see the setting of the number of partitions in select_pkeycache-master.opt)
+# The result is expected to be the same as for select.test
+#
+
+--source t/select.test
=== modified file 'mysys/mf_keycache.c'
--- a/mysys/mf_keycache.c 2009-12-03 11:19:05 +0000
+++ b/mysys/mf_keycache.c 2010-02-16 16:41:11 +0000
@@ -13,8 +13,38 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+
/**
- @file
+ @file
+ The file contains the following modules:
+
+ Simple Key Cache Module
+
+ Partitioned Key Cache Module
+
+ Key Cache Interface Module
+
+*/
+
+#include "mysys_priv.h"
+#include "mysys_err.h"
+#include <keycache.h>
+#include "my_static.h"
+#include <m_string.h>
+#include <my_bit.h>
+#include <errno.h>
+#include <stdarg.h>
+
+/******************************************************************************
+ Simple Key Cache Module
+
+ The module contains implementations of all key cache interface functions
+ employed by partitioned key caches.
+
+******************************************************************************/
+
+/*
These functions handle keyblock cacheing for ISAM and MyISAM tables.
One cache can handle many files.
@@ -37,9 +67,7 @@
blocks_unused is the sum of never used blocks in the pool and of currently
free blocks. blocks_used is the number of blocks fetched from the pool and
as such gives the maximum number of in-use blocks at any time.
-*/
-/*
Key Cache Locking
=================
@@ -104,14 +132,77 @@
I/O finished.
*/
-#include "mysys_priv.h"
-#include "mysys_err.h"
-#include <keycache.h>
-#include "my_static.h"
-#include <m_string.h>
-#include <my_bit.h>
-#include <errno.h>
-#include <stdarg.h>
+/* declare structures that are used by st_key_cache */
+
+struct st_block_link;
+typedef struct st_block_link BLOCK_LINK;
+struct st_keycache_page;
+typedef struct st_keycache_page KEYCACHE_PAGE;
+struct st_hash_link;
+typedef struct st_hash_link HASH_LINK;
+
+/* info about requests in a waiting queue */
+typedef struct st_keycache_wqueue
+{
+ struct st_my_thread_var *last_thread; /* circular list of waiting threads */
+} KEYCACHE_WQUEUE;
+
+#define CHANGED_BLOCKS_HASH 128 /* must be power of 2 */
+
+/* Control block for a simple (non-partitioned) key cache */
+
+typedef struct st_s_key_cache_cb
+{
+ my_bool key_cache_inited; /* <=> control block is allocated */
+ my_bool in_resize; /* true during resize operation */
+ my_bool resize_in_flush; /* true during flush of resize operation */
+ my_bool can_be_used; /* usage of cache for read/write is allowed */
+ size_t key_cache_mem_size; /* specified size of the cache memory */
+ uint key_cache_block_size; /* size of the page buffer of a cache block */
+ ulong min_warm_blocks; /* min number of warm blocks; */
+ ulong age_threshold; /* age threshold for hot blocks */
+ ulonglong keycache_time; /* total number of block link operations */
+ uint hash_entries; /* max number of entries in the hash table */
+ int hash_links; /* max number of hash links */
+ int hash_links_used; /* number of hash links currently used */
+ int disk_blocks; /* max number of blocks in the cache */
+ ulong blocks_used; /* maximum number of concurrently used blocks */
+ ulong blocks_unused; /* number of currently unused blocks */
+ ulong blocks_changed; /* number of currently dirty blocks */
+ ulong warm_blocks; /* number of blocks in warm sub-chain */
+ ulong cnt_for_resize_op; /* counter to block resize operation */
+ long blocks_available; /* number of blocks available in the LRU chain */
+ HASH_LINK **hash_root; /* arr. of entries into hash table buckets */
+ HASH_LINK *hash_link_root; /* memory for hash table links */
+ HASH_LINK *free_hash_list; /* list of free hash links */
+ BLOCK_LINK *free_block_list; /* list of free blocks */
+ BLOCK_LINK *block_root; /* memory for block links */
+ uchar HUGE_PTR *block_mem; /* memory for block buffers */
+ BLOCK_LINK *used_last; /* ptr to the last block of the LRU chain */
+ BLOCK_LINK *used_ins; /* ptr to the insertion block in LRU chain */
+ pthread_mutex_t cache_lock; /* to lock access to the cache structure */
+ KEYCACHE_WQUEUE resize_queue; /* threads waiting during resize operation */
+ /*
+ Waiting for a zero resize count. Using a queue for symmetry though
+ only one thread can wait here.
+ */
+ KEYCACHE_WQUEUE waiting_for_resize_cnt;
+ KEYCACHE_WQUEUE waiting_for_hash_link; /* waiting for a free hash link */
+ KEYCACHE_WQUEUE waiting_for_block; /* requests waiting for a free block */
+ BLOCK_LINK *changed_blocks[CHANGED_BLOCKS_HASH]; /* hash for dirty file bl.*/
+ BLOCK_LINK *file_blocks[CHANGED_BLOCKS_HASH]; /* hash for other file bl.*/
+
+ /* Statistics variables. These are reset in reset_key_cache_counters(). */
+ ulong global_blocks_changed; /* number of currently dirty blocks */
+ ulonglong global_cache_w_requests;/* number of write requests (write hits) */
+ ulonglong global_cache_write; /* number of writes from cache to files */
+ ulonglong global_cache_r_requests;/* number of read requests (read hits) */
+ ulonglong global_cache_read; /* number of reads from files to cache */
+
+ int blocks; /* max number of blocks in the cache */
+ uint hash_factor; /* factor used to calculate hash function */
+ my_bool in_init; /* Set to 1 in MySQL during init/resize */
+} S_KEY_CACHE_CB;
/*
Some compilation flags have been added specifically for this module
@@ -223,7 +314,12 @@ KEY_CACHE *dflt_key_cache= &dflt_key_cac
#define FLUSH_CACHE 2000 /* sort this many blocks at once */
-static int flush_all_key_blocks(KEY_CACHE *keycache);
+static int flush_all_key_blocks(S_KEY_CACHE_CB *keycache);
+/*
+static void s_change_key_cache_param(void *keycache_cb, uint division_limit,
+ uint age_threshold);
+*/
+static void s_end_key_cache(void *keycache_cb, my_bool cleanup);
#ifdef THREAD
static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
pthread_mutex_t *mutex);
@@ -232,15 +328,16 @@ static void release_whole_queue(KEYCACHE
#define wait_on_queue(wqueue, mutex) do {} while (0)
#define release_whole_queue(wqueue) do {} while (0)
#endif
-static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block);
+static void free_block(S_KEY_CACHE_CB *keycache, BLOCK_LINK *block);
#if !defined(DBUG_OFF)
-static void test_key_cache(KEY_CACHE *keycache,
+static void test_key_cache(S_KEY_CACHE_CB *keycache,
const char *where, my_bool lock);
#endif
-
+#define KEYCACHE_BASE_EXPR(f, pos) \
+ ((ulong) ((pos) / keycache->key_cache_block_size) + (ulong) (f))
#define KEYCACHE_HASH(f, pos) \
-(((ulong) ((pos) / keycache->key_cache_block_size) + \
- (ulong) (f)) & (keycache->hash_entries-1))
+ ((KEYCACHE_BASE_EXPR(f, pos) / keycache->hash_factor) & \
+ (keycache->hash_entries-1))
#define FILE_HASH(f) ((uint) (f) & (CHANGED_BLOCKS_HASH-1))
#define DEFAULT_KEYCACHE_DEBUG_LOG "keycache_debug.log"
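(For readability, here is the new hashing scheme from the two macros above
restated as plain C functions; this is only an illustrative sketch, not part
of the patch, and the field names are those of S_KEY_CACHE_CB:)

static ulong keycache_bucket(S_KEY_CACHE_CB *keycache, int file,
                             my_off_t pos)
{
  /* KEYCACHE_BASE_EXPR: block number within the file plus the file number */
  ulong base= (ulong) (pos / keycache->key_cache_block_size) + (ulong) file;
  /* hash_factor scales the base expression; hash_entries is a power of
     two, so masking with hash_entries-1 yields a valid bucket index */
  return (base / keycache->hash_factor) & (keycache->hash_entries - 1);
}

static uint file_bucket(int file)
{
  /* FILE_HASH: CHANGED_BLOCKS_HASH must be a power of 2 (see above) */
  return (uint) file & (CHANGED_BLOCKS_HASH - 1);
}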
@@ -336,9 +433,10 @@ static int keycache_pthread_cond_signal(
#define inline /* disabled inline for easier debugging */
static int fail_block(BLOCK_LINK *block);
static int fail_hlink(HASH_LINK *hlink);
-static int cache_empty(KEY_CACHE *keycache);
+static int cache_empty(S_KEY_CACHE_CB *keycache);
#endif
+
static inline uint next_power(uint value)
{
return (uint) my_round_up_to_next_power((uint32) value) << 1;
@@ -346,19 +444,32 @@ static inline uint next_power(uint value
/*
- Initialize a key cache
+ Initialize a simple key cache
SYNOPSIS
- init_key_cache()
- keycache pointer to a key cache data structure
- key_cache_block_size size of blocks to keep cached data
- use_mem total memory to use for the key cache
- division_limit division limit (may be zero)
- age_threshold age threshold (may be zero)
+ s_init_key_cache()
+ keycache_cb pointer to the control block of a simple key cache
+ key_cache_block_size size of blocks to keep cached data
+ use_mem memory to use for the key cache buffers/structures
+ division_limit division limit (may be zero)
+ age_threshold age threshold (may be zero)
+
+ DESCRIPTION
+ This function is the implementation of the init_key_cache interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function builds a simple key cache and initializes the control block
+ structure of the type S_KEY_CACHE_CB that is used for this key cache.
+ The parameter keycache_cb is supposed to point to this structure.
+ The parameter key_cache_block_size specifies the size of the blocks in
+ the key cache to be built. The parameters division_limit and age_threshold
+ determine the initial values of those characteristics of the key cache
+ that are used for midpoint insertion strategy. The parameter use_mem
+ specifies the total amount of memory to be allocated for key cache blocks
+ and auxiliary structures.
RETURN VALUE
number of blocks in the key cache, if successful,
- 0 - otherwise.
+ <= 0 - otherwise.
NOTES.
if keycache->key_cache_inited != 0 we assume that the key cache
@@ -370,10 +481,12 @@ static inline uint next_power(uint value
*/
-int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
- size_t use_mem, uint division_limit,
- uint age_threshold)
+static
+int s_init_key_cache(void *keycache_cb, uint key_cache_block_size,
+ size_t use_mem, uint division_limit,
+ uint age_threshold)
{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
ulong blocks, hash_links;
size_t length;
int error;
@@ -387,12 +500,15 @@ int init_key_cache(KEY_CACHE *keycache,
DBUG_RETURN(0);
}
+ keycache->blocks_used= keycache->blocks_unused= 0;
+ keycache->global_blocks_changed= 0;
keycache->global_cache_w_requests= keycache->global_cache_r_requests= 0;
keycache->global_cache_read= keycache->global_cache_write= 0;
keycache->disk_blocks= -1;
if (! keycache->key_cache_inited)
{
keycache->key_cache_inited= 1;
+ keycache->hash_factor= 1;
/*
Initialize these variables once only.
Their value must survive re-initialization during resizing.
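(As a quick illustration of the calling convention in the synopsis above --
a sketch only, since s_init_key_cache is static and real callers reach it
through the key cache interface module; it assumes the control block is
zeroed before first use, so key_cache_inited == 0:)

  S_KEY_CACHE_CB cb;
  int blocks;
  memset(&cb, 0, sizeof(cb));      /* ensure key_cache_inited == 0 */
  blocks= s_init_key_cache(&cb,
                           1024,            /* key_cache_block_size */
                           8*1024*1024,     /* use_mem */
                           0, 0);           /* default division_limit
                                               and age_threshold */
  if (blocks <= 0)
  {
    /* initialization failed, per the RETURN VALUE contract above */
  }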
@@ -534,51 +650,43 @@ err:
/*
- Resize a key cache
+ Prepare for resizing a simple key cache
SYNOPSIS
- resize_key_cache()
- keycache pointer to a key cache data structure
- key_cache_block_size size of blocks to keep cached data
- use_mem total memory to use for the new key cache
- division_limit new division limit (if not zero)
- age_threshold new age threshold (if not zero)
+ s_prepare_resize_key_cache()
+ keycache_cb pointer to the control block of a simple key cache
+ with_resize_queue <=> resize queue is used
+ release_lock <=> release the key cache lock before return
- RETURN VALUE
- number of blocks in the key cache, if successful,
- 0 - otherwise.
+ DESCRIPTION
+ This function flushes all dirty pages from a simple key cache and then
+ destroys the key cache by calling s_end_key_cache. The function
+ considers the parameter keycache_cb as a pointer to the control block
+ structure of the type S_KEY_CACHE_CB for this key cache.
+ The parameter with_resize_queue determines whether the resize queue is
+ involved (MySQL server never uses this queue). The parameter release_lock
+ says whether the key cache lock must be released before return from
+ the function.
- NOTES.
- The function first compares the memory size and the block size parameters
- with the key cache values.
+ RETURN VALUE
+ 0 - on success,
+ 1 - otherwise.
- If they differ the function free the the memory allocated for the
- old key cache blocks by calling the end_key_cache function and
- then rebuilds the key cache with new blocks by calling
- init_key_cache.
+ NOTES
+ This function is called by s_resize_key_cache and p_resize_key_cache,
+ which resize simple and partitioned key caches respectively.
- The function starts the operation only when all other threads
- performing operations with the key cache let her to proceed
- (when cnt_for_resize=0).
*/
-int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
- size_t use_mem, uint division_limit,
- uint age_threshold)
+static
+int s_prepare_resize_key_cache(void *keycache_cb,
+ my_bool with_resize_queue,
+ my_bool release_lock)
{
- int blocks;
- DBUG_ENTER("resize_key_cache");
-
- if (!keycache->key_cache_inited)
- DBUG_RETURN(keycache->disk_blocks);
-
- if(key_cache_block_size == keycache->key_cache_block_size &&
- use_mem == keycache->key_cache_mem_size)
- {
- change_key_cache_param(keycache, division_limit, age_threshold);
- DBUG_RETURN(keycache->disk_blocks);
- }
-
+ int res= 0;
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ DBUG_ENTER("s_prepare_resize_key_cache");
+
keycache_pthread_mutex_lock(&keycache->cache_lock);
#ifdef THREAD
@@ -588,7 +696,7 @@ int resize_key_cache(KEY_CACHE *keycache
one resizer only. In set_var.cc keycache->in_init is used to block
multiple attempts.
*/
- while (keycache->in_resize)
+ while (with_resize_queue && keycache->in_resize)
{
/* purecov: begin inspected */
wait_on_queue(&keycache->resize_queue, &keycache->cache_lock);
@@ -613,8 +721,8 @@ int resize_key_cache(KEY_CACHE *keycache
{
/* TODO: if this happens, we should write a warning in the log file ! */
keycache->resize_in_flush= 0;
- blocks= 0;
keycache->can_be_used= 0;
+ res= 1;
goto finish;
}
DBUG_ASSERT(cache_empty(keycache));
@@ -640,29 +748,145 @@ int resize_key_cache(KEY_CACHE *keycache
#else
KEYCACHE_DBUG_ASSERT(keycache->cnt_for_resize_op == 0);
#endif
-
- /*
- Free old cache structures, allocate new structures, and initialize
- them. Note that the cache_lock mutex and the resize_queue are left
- untouched. We do not lose the cache_lock and will release it only at
- the end of this function.
- */
- end_key_cache(keycache, 0); /* Don't free mutex */
- /* The following will work even if use_mem is 0 */
- blocks= init_key_cache(keycache, key_cache_block_size, use_mem,
- division_limit, age_threshold);
+
+ s_end_key_cache(keycache_cb, 0);
finish:
+ if (release_lock)
+ keycache_pthread_mutex_unlock(&keycache->cache_lock);
+ DBUG_RETURN(res);
+}
+
+
+/*
+ Finalize resizing a simple key cache
+
+ SYNOPSIS
+ s_finish_resize_key_cache()
+ keycache_cb pointer to the control block of a simple key cache
+ with_resize_queue <=> resize queue is used
+ acquire_lock <=> acquire the key cache lock at start
+
+ DESCRIPTION
+ This function performs finalizing actions for the operation of
+ resizing a simple key cache. The function considers the parameter
+ keycache_cb as a pointer to the control block structure of the type
+ S_KEY_CACHE_CB for this key cache. The function sets the flag
+ in_resize in this structure to FALSE.
+ The parameter with_resize_queue determines whether the resize queue
+ is involved (MySQL server never uses this queue).
+ The parameter acquire_lock says whether the key cache lock must be
+ acquired at the start of the function.
+
+ RETURN VALUE
+ none
+
+ NOTES
+ This function is called by s_resize_key_cache and p_resize_key_cache,
+ which resize simple and partitioned key caches respectively.
+
+*/
+
+static
+void s_finish_resize_key_cache(void *keycache_cb,
+ my_bool with_resize_queue,
+ my_bool acquire_lock)
+{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ DBUG_ENTER("s_finish_resize_key_cache");
+
+ if (acquire_lock)
+ keycache_pthread_mutex_lock(&keycache->cache_lock);
+
/*
Mark the resize finished. This allows other threads to start a
resize or to request new cache blocks.
*/
keycache->in_resize= 0;
-
- /* Signal waiting threads. */
- release_whole_queue(&keycache->resize_queue);
+
+ if (with_resize_queue)
+ {
+ /* Signal waiting threads. */
+ release_whole_queue(&keycache->resize_queue);
+ }
keycache_pthread_mutex_unlock(&keycache->cache_lock);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Resize a simple key cache
+
+ SYNOPSIS
+ s_resize_key_cache()
+ keycache_cb pointer to the control block of a simple key cache
+ key_cache_block_size size of blocks to keep cached data
+ use_mem memory to use for the key cache buffers/structures
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ DESCRIPTION
+ This function is the implementation of the resize_key_cache interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for the simple key
+ cache to be resized.
+ The parameter key_cache_block_size specifies the new size of the blocks in
+ the key cache. The parameters division_limit and age_threshold
+ determine the new initial values of those characteristics of the key cache
+ that are used for midpoint insertion strategy. The parameter use_mem
+ specifies the total amount of memory to be allocated for key cache blocks
+ and auxiliary structures in the new key cache.
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES
+ The function first calls s_prepare_resize_key_cache to flush all dirty
+ blocks from the key cache and to free the memory used for key cache
+ blocks and auxiliary structures. After this the function builds a new
+ key cache with the new parameters.
+
+ This implementation doesn't block the calls and executions of other
+ functions from the key cache interface. However it assumes that the
+ calls of s_resize_key_cache itself are serialized.
+
+ The function starts the operation only when all other threads
+ performing operations with the key cache let it proceed
+ (when cnt_for_resize=0).
+
+*/
+
+static
+int s_resize_key_cache(void *keycache_cb, uint key_cache_block_size,
+ size_t use_mem, uint division_limit,
+ uint age_threshold)
+{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ int blocks= 0;
+ DBUG_ENTER("s_resize_key_cache");
+
+ if (!keycache->key_cache_inited)
+ DBUG_RETURN(keycache->disk_blocks);
+
+ /*
+ Note that the cache_lock mutex and the resize_queue are left untouched.
+ We do not lose the cache_lock and will release it only at the end of
+ this function.
+ */
+ if (s_prepare_resize_key_cache(keycache_cb, 1, 0))
+ goto finish;
+
+ /* The following will work even if use_mem is 0 */
+ blocks= s_init_key_cache(keycache, key_cache_block_size, use_mem,
+ division_limit, age_threshold);
+
+finish:
+ s_finish_resize_key_cache(keycache_cb, 1, 0);
+
DBUG_RETURN(blocks);
}
@@ -670,7 +894,7 @@ finish:
/*
Increment counter blocking resize key cache operation
*/
-static inline void inc_counter_for_resize_op(KEY_CACHE *keycache)
+static inline void inc_counter_for_resize_op(S_KEY_CACHE_CB *keycache)
{
keycache->cnt_for_resize_op++;
}
@@ -680,35 +904,49 @@ static inline void inc_counter_for_resiz
Decrement counter blocking resize key cache operation;
Signal the operation to proceed when counter becomes equal zero
*/
-static inline void dec_counter_for_resize_op(KEY_CACHE *keycache)
+static inline void dec_counter_for_resize_op(S_KEY_CACHE_CB *keycache)
{
if (!--keycache->cnt_for_resize_op)
release_whole_queue(&keycache->waiting_for_resize_cnt);
}
+
/*
- Change the key cache parameters
+ Change key cache parameters of a simple key cache
SYNOPSIS
- change_key_cache_param()
- keycache pointer to a key cache data structure
- division_limit new division limit (if not zero)
- age_threshold new age threshold (if not zero)
+ s_change_key_cache_param()
+ keycache_cb pointer to the control block of a simple key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ DESCRIPTION
+ This function is the implementation of the change_key_cache_param interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for the simple key
+ cache where new values of the division limit and the age threshold used
+ for midpoint insertion strategy are to be set. The parameters
+ division_limit and age_threshold provide these new values.
RETURN VALUE
none
NOTES.
- Presently the function resets the key cache parameters
- concerning midpoint insertion strategy - division_limit and
- age_threshold.
+ Presently the function resets the key cache parameters concerning
+ midpoint insertion strategy - division_limit and age_threshold.
+ This function changes some parameters of a given key cache without
+ reformatting it. The function does not touch the contents of the key
+ cache blocks.
+
*/
-void change_key_cache_param(KEY_CACHE *keycache, uint division_limit,
- uint age_threshold)
+static
+void s_change_key_cache_param(void *keycache_cb, uint division_limit,
+ uint age_threshold)
{
- DBUG_ENTER("change_key_cache_param");
-
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ DBUG_ENTER("s_change_key_cache_param");
keycache_pthread_mutex_lock(&keycache->cache_lock);
if (division_limit)
keycache->min_warm_blocks= (keycache->disk_blocks *
@@ -722,20 +960,32 @@ void change_key_cache_param(KEY_CACHE *k
/*
- Remove key_cache from memory
+ Destroy a simple key cache
SYNOPSIS
- end_key_cache()
- keycache key cache handle
- cleanup Complete free (Free also mutex for key cache)
+ s_end_key_cache()
+ keycache_cb pointer to the control block of a simple key cache
+ cleanup <=> complete free (free also mutex for key cache)
+
+ DESCRIPTION
+ This function is the implementation of the end_key_cache interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for the simple key
+ cache to be destroyed.
+ The function frees the memory allocated for the key cache blocks and
+ auxiliary structures. If the value of the parameter cleanup is TRUE
+ then even the key cache mutex is freed.
RETURN VALUE
none
*/
-void end_key_cache(KEY_CACHE *keycache, my_bool cleanup)
+static
+void s_end_key_cache(void *keycache_cb, my_bool cleanup)
{
- DBUG_ENTER("end_key_cache");
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ DBUG_ENTER("s_end_key_cache");
DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) keycache));
if (!keycache->key_cache_inited)
@@ -1026,7 +1276,7 @@ static inline void link_changed(BLOCK_LI
void
*/
-static void link_to_file_list(KEY_CACHE *keycache,
+static void link_to_file_list(S_KEY_CACHE_CB *keycache,
BLOCK_LINK *block, int file,
my_bool unlink_block)
{
@@ -1067,7 +1317,7 @@ static void link_to_file_list(KEY_CACHE
void
*/
-static void link_to_changed_list(KEY_CACHE *keycache,
+static void link_to_changed_list(S_KEY_CACHE_CB *keycache,
BLOCK_LINK *block)
{
DBUG_ASSERT(block->status & BLOCK_IN_USE);
@@ -1122,7 +1372,7 @@ static void link_to_changed_list(KEY_CAC
not linked in the LRU ring.
*/
-static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
+static void link_block(S_KEY_CACHE_CB *keycache, BLOCK_LINK *block, my_bool hot,
my_bool at_end)
{
BLOCK_LINK *ins;
@@ -1243,7 +1493,7 @@ static void link_block(KEY_CACHE *keycac
See NOTES for link_block
*/
-static void unlink_block(KEY_CACHE *keycache, BLOCK_LINK *block)
+static void unlink_block(S_KEY_CACHE_CB *keycache, BLOCK_LINK *block)
{
DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
@@ -1301,7 +1551,7 @@ static void unlink_block(KEY_CACHE *keyc
RETURN
void
*/
-static void reg_requests(KEY_CACHE *keycache, BLOCK_LINK *block, int count)
+static void reg_requests(S_KEY_CACHE_CB *keycache, BLOCK_LINK *block, int count)
{
DBUG_ASSERT(block->status & BLOCK_IN_USE);
DBUG_ASSERT(block->hash_link);
@@ -1344,7 +1594,7 @@ static void reg_requests(KEY_CACHE *keyc
not linked in the LRU ring.
*/
-static void unreg_request(KEY_CACHE *keycache,
+static void unreg_request(S_KEY_CACHE_CB *keycache,
BLOCK_LINK *block, int at_end)
{
DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
@@ -1433,7 +1683,7 @@ static void remove_reader(BLOCK_LINK *bl
signals on its termination
*/
-static void wait_for_readers(KEY_CACHE *keycache,
+static void wait_for_readers(S_KEY_CACHE_CB *keycache,
BLOCK_LINK *block)
{
#ifdef THREAD
@@ -1482,7 +1732,7 @@ static inline void link_hash(HASH_LINK *
Remove a hash link from the hash table
*/
-static void unlink_hash(KEY_CACHE *keycache, HASH_LINK *hash_link)
+static void unlink_hash(S_KEY_CACHE_CB *keycache, HASH_LINK *hash_link)
{
KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u pos_ %lu #requests=%u",
(uint) hash_link->file,(ulong) hash_link->diskpos, hash_link->requests));
@@ -1538,7 +1788,7 @@ static void unlink_hash(KEY_CACHE *keyca
Get the hash link for a page
*/
-static HASH_LINK *get_hash_link(KEY_CACHE *keycache,
+static HASH_LINK *get_hash_link(S_KEY_CACHE_CB *keycache,
int file, my_off_t filepos)
{
reg1 HASH_LINK *hash_link, **start;
@@ -1659,7 +1909,7 @@ restart:
waits until first of this operations links any block back.
*/
-static BLOCK_LINK *find_key_block(KEY_CACHE *keycache,
+static BLOCK_LINK *find_key_block(S_KEY_CACHE_CB *keycache,
File file, my_off_t filepos,
int init_hits_left,
int wrmode, int *page_st)
@@ -2419,7 +2669,7 @@ restart:
portion is less than read_length, but not less than min_length.
*/
-static void read_block(KEY_CACHE *keycache,
+static void read_block(S_KEY_CACHE_CB *keycache,
BLOCK_LINK *block, uint read_length,
uint min_length, my_bool primary)
{
@@ -2507,43 +2757,62 @@ static void read_block(KEY_CACHE *keycac
/*
- Read a block of data from a cached file into a buffer;
+ Read a block of data from a simple key cache into a buffer
SYNOPSIS
- key_cache_read()
- keycache pointer to a key cache data structure
- file handler for the file for the block of data to be read
- filepos position of the block of data in the file
- level determines the weight of the data
- buff buffer to where the data must be placed
- length length of the buffer
- block_length length of the block in the key cache buffer
- return_buffer return pointer to the key cache buffer with the data
+ s_key_cache_read()
+ keycache_cb pointer to the control block of a simple key cache
+ file handler for the file for the block of data to be read
+ filepos position of the block of data in the file
+ level determines the weight of the data
+ buff buffer to where the data must be placed
+ length length of the buffer
+ block_length length of the read data from a key cache block
+ return_buffer return pointer to the key cache buffer with the data
+ DESCRIPTION
+ This function is the implementation of the key_cache_read interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key
+ cache.
+ In the general case the function reads a block of data from the key cache
+ into the buffer buff of the size specified by the parameter length. The
+ beginning of the block of data to be read is specified by the parameters
+ file and filepos. The length of the read data is the same as the length
+ of the buffer. The data is read into the buffer in key_cache_block_size
+ increments. If the next portion of the data is not found in any key cache
+ block, it is first read from the file into the key cache.
+ If the parameter return_buffer is not ignored and its value is TRUE, and
+ the data to be read of the specified size block_length can be read from one
+ key cache buffer, then the function returns a pointer to the data in the
+ key cache buffer.
+ The function takes into account the parameters block_length and
+ return_buffer only in a single-threaded environment.
+ The parameter 'level' is used only by the midpoint insertion strategy
+ when the data or its portion cannot be found in the key cache.
+
RETURN VALUE
- Returns address from where the data is placed if sucessful, 0 - otherwise.
+ Returns address from where the data is placed if successful, 0 - otherwise.
- NOTES.
- The function ensures that a block of data of size length from file
- positioned at filepos is in the buffers for some key cache blocks.
- Then the function either copies the data into the buffer buff, or,
- if return_buffer is TRUE, it just returns the pointer to the key cache
- buffer with the data.
+ NOTES
Filepos must be a multiple of 'block_length', but it doesn't
have to be a multiple of key_cache_block_size;
+
*/
-uchar *key_cache_read(KEY_CACHE *keycache,
- File file, my_off_t filepos, int level,
- uchar *buff, uint length,
- uint block_length __attribute__((unused)),
- int return_buffer __attribute__((unused)))
+uchar *s_key_cache_read(void *keycache_cb,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length __attribute__((unused)),
+ int return_buffer __attribute__((unused)))
{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
my_bool locked_and_incremented= FALSE;
int error=0;
uchar *start= buff;
- DBUG_ENTER("key_cache_read");
+ DBUG_ENTER("s_key_cache_read");
DBUG_PRINT("enter", ("fd: %u pos: %lu length: %u",
(uint) file, (ulong) filepos, length));
@@ -2738,29 +3007,49 @@ end:
/*
- Insert a block of file data from a buffer into key cache
+ Insert a block of file data from a buffer into a simple key cache
SYNOPSIS
- key_cache_insert()
- keycache pointer to a key cache data structure
+ s_key_cache_insert()
+ keycache_cb pointer to the control block of a simple key cache
file handler for the file to insert data from
filepos position of the block of data in the file to insert
level determines the weight of the data
buff buffer to read data from
length length of the data in the buffer
- NOTES
- This is used by MyISAM to move all blocks from a index file to the key
- cache
-
+ DESCRIPTION
+ This function is the implementation of the key_cache_insert interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key
+ cache.
+ The function writes a block of file data from a buffer into the key cache.
+ The buffer is specified with the parameters buff and length - the pointer
+ to the beginning of the buffer and its size respectively. It's assumed
+ the buffer contains the data from 'file' starting at the position
+ filepos. The data is copied from the buffer in key_cache_block_size
+ increments.
+ The parameter level is used to set one characteristic for the key buffers
+ loaded with the data from buff. The characteristic is used only by the
+ midpoint insertion strategy.
+
RETURN VALUE
0 if a success, 1 - otherwise.
+
+ NOTES
+ The function is used by MyISAM to move all blocks from an index file to
+ the key cache. It can be performed in parallel with reading the file data
+ from the key buffers by other threads.
+
*/
-int key_cache_insert(KEY_CACHE *keycache,
- File file, my_off_t filepos, int level,
- uchar *buff, uint length)
+static
+int s_key_cache_insert(void *keycache_cb,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length)
{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
int error= 0;
DBUG_ENTER("key_cache_insert");
DBUG_PRINT("enter", ("fd: %u pos: %lu length: %u",
@@ -2979,43 +3268,65 @@ int key_cache_insert(KEY_CACHE *keycache
/*
- Write a buffer into a cached file.
+ Write a buffer into a simple key cache
SYNOPSIS
- key_cache_write()
- keycache pointer to a key cache data structure
- file handler for the file to write data to
- filepos position in the file to write data to
- level determines the weight of the data
- buff buffer with the data
- length length of the buffer
- dont_write if is 0 then all dirty pages involved in writing
- should have been flushed from key cache
+ s_key_cache_write()
+ keycache_cb pointer to the control block of a simple key cache
+ file handler for the file to write data to
+ file_extra maps of key cache partitions containing
+ dirty pages from file
+ filepos position in the file to write data to
+ level determines the weight of the data
+ buff buffer with the data
+ length length of the buffer
+ dont_write if is 0 then all dirty pages involved in writing
+ should have been flushed from key cache
+ DESCRIPTION
+ This function is the implementation of the key_cache_write interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key
+ cache.
+ In the general case the function copies data from a buffer into the key
+ cache. The buffer is specified with the parameters buff and length -
+ the pointer to the beginning of the buffer and its size respectively.
+ It's assumed the buffer contains the data to be written into 'file'
+ starting from the position filepos. The data is copied from the buffer
+ in key_cache_block_size increments.
+ If the value of the parameter dont_write is FALSE then the function
+ also writes the data into file.
+ The parameter level is used to set one characteristic for the key buffers
+ filled with the data from buff. The characteristic is employed only by
+ the midpoint insertion strategy.
+ The parameter file_extra currently makes sense only for simple key caches
+ that are elements of a partitioned key cache. It provides a pointer to the
+ shared bitmap of the partitions that may contain dirty pages for the file.
+ This bitmap is used to optimize the function p_flush_key_blocks.
+
RETURN VALUE
0 if a success, 1 - otherwise.
- NOTES.
- The function copies the data of size length from buff into buffers
- for key cache blocks that are assigned to contain the portion of
- the file starting with position filepos.
- It ensures that this data is flushed to the file if dont_write is FALSE.
- Filepos must be a multiple of 'block_length', but it doesn't
- have to be a multiple of key_cache_block_size;
+ NOTES
+ This implementation exploits the fact that the function is called only
+ when a thread has got an exclusive lock for the key file.
- dont_write is always TRUE in the server (info->lock_type is never F_UNLCK).
*/
-int key_cache_write(KEY_CACHE *keycache,
- File file, my_off_t filepos, int level,
- uchar *buff, uint length,
- uint block_length __attribute__((unused)),
- int dont_write)
+static
+int s_key_cache_write(void *keycache_cb,
+ File file, void *file_extra __attribute__((unused)),
+ my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length __attribute__((unused)),
+ int dont_write)
{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
my_bool locked_and_incremented= FALSE;
int error=0;
- DBUG_ENTER("key_cache_write");
+ DBUG_ENTER("s_key_cache_write");
DBUG_PRINT("enter",
("fd: %u pos: %lu length: %u block_length: %u"
" key_block_length: %u",
@@ -3330,7 +3641,7 @@ end:
Block must have a request registered on it.
*/
-static void free_block(KEY_CACHE *keycache, BLOCK_LINK *block)
+static void free_block(S_KEY_CACHE_CB *keycache, BLOCK_LINK *block)
{
KEYCACHE_THREAD_TRACE("free block");
KEYCACHE_DBUG_PRINT("free_block",
@@ -3470,7 +3781,7 @@ static int cmp_sec_link(BLOCK_LINK **a,
free used blocks if requested
*/
-static int flush_cached_blocks(KEY_CACHE *keycache,
+static int flush_cached_blocks(S_KEY_CACHE_CB *keycache,
File file, BLOCK_LINK **cache,
BLOCK_LINK **end,
enum flush_type type)
@@ -3514,9 +3825,9 @@ static int flush_cached_blocks(KEY_CACHE
(BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
block->status|= BLOCK_IN_FLUSHWRITE;
keycache_pthread_mutex_unlock(&keycache->cache_lock);
- error= my_pwrite(file, block->buffer+block->offset,
+ error= my_pwrite(file, block->buffer + block->offset,
block->length - block->offset,
- block->hash_link->diskpos+ block->offset,
+ block->hash_link->diskpos + block->offset,
MYF(MY_NABP | MY_WAIT_IF_FULL));
keycache_pthread_mutex_lock(&keycache->cache_lock);
keycache->global_cache_write++;
@@ -3576,7 +3887,7 @@ static int flush_cached_blocks(KEY_CACHE
/*
- Flush all key blocks for a file to disk, but don't do any mutex locks.
+ Flush all key blocks for a file to disk, but don't do any mutex locks
SYNOPSIS
flush_key_blocks_int()
@@ -3598,7 +3909,7 @@ static int flush_cached_blocks(KEY_CACHE
1 error
*/
-static int flush_key_blocks_int(KEY_CACHE *keycache,
+static int flush_key_blocks_int(S_KEY_CACHE_CB *keycache,
File file, enum flush_type type)
{
BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
@@ -4034,23 +4345,49 @@ err:
/*
- Flush all blocks for a file to disk
+ Flush all blocks for a file from key buffers of a simple key cache
SYNOPSIS
- flush_key_blocks()
- keycache pointer to a key cache data structure
- file handler for the file to flush to
- flush_type type of the flush
+ s_flush_key_blocks()
+ keycache_cb pointer to the control block of a simple key cache
+ file handler for the file to flush to
+ file_extra maps of key cache partitions containing
+ dirty pages from file (not used)
+ flush_type type of the flush operation
+ DESCRIPTION
+ This function is the implementation of the flush_key_blocks interface
+ function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key
+ cache.
+ In the general case the function flushes the data from all dirty key
+ buffers related to the file 'file' into this file. The function does
+ exactly this if the value of the parameter type is FLUSH_KEEP. If the
+ value of this parameter is FLUSH_RELEASE, the function additionally
+ releases the key buffers containing data from 'file' for new usage.
+ If the value of the parameter type is FLUSH_IGNORE_CHANGED the function
+ just releases the key buffers containing data from 'file'.
+ The parameter file_extra currently is not used by this function.
+
RETURN
0 ok
1 error
+
+ NOTES
+ This implementation exploits the fact that the function is called only
+ when a thread has got an exclusive lock for the key file.
+
*/
-int flush_key_blocks(KEY_CACHE *keycache,
- File file, enum flush_type type)
+static
+int s_flush_key_blocks(void *keycache_cb,
+ File file,
+ void *file_extra __attribute__((unused)),
+ enum flush_type type)
{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
int res= 0;
DBUG_ENTER("flush_key_blocks");
DBUG_PRINT("enter", ("keycache: 0x%lx", (long) keycache));
@@ -4103,7 +4440,7 @@ int flush_key_blocks(KEY_CACHE *keycache
!= 0 Error
*/
-static int flush_all_key_blocks(KEY_CACHE *keycache)
+static int flush_all_key_blocks(S_KEY_CACHE_CB *keycache)
{
BLOCK_LINK *block;
uint total_found;
@@ -4206,37 +4543,45 @@ static int flush_all_key_blocks(KEY_CACH
/*
- Reset the counters of a key cache.
+ Reset the counters of a simple key cache
SYNOPSIS
- reset_key_cache_counters()
- name the name of a key cache
- key_cache pointer to the key kache to be reset
+ s_reset_key_cache_counters()
+ name the name of a key cache
+ keycache_cb pointer to the control block of a simple key cache
DESCRIPTION
- This procedure is used by process_key_caches() to reset the counters of all
- currently used key caches, both the default one and the named ones.
+ This function is the implementation of the reset_key_cache_counters
+ interface function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key cache.
+ This function resets the values of all statistical counters for the key
+ cache to 0.
+ The parameter name is currently not used.
RETURN
0 on success (always because it can't fail)
+
*/
-int reset_key_cache_counters(const char *name __attribute__((unused)),
- KEY_CACHE *key_cache)
+static
+int s_reset_key_cache_counters(const char *name __attribute__((unused)),
+ void *keycache_cb)
{
- DBUG_ENTER("reset_key_cache_counters");
- if (!key_cache->key_cache_inited)
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ DBUG_ENTER("s_reset_key_cache_counters");
+ if (!keycache->key_cache_inited)
{
DBUG_PRINT("info", ("Key cache %s not initialized.", name));
DBUG_RETURN(0);
}
DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
- key_cache->global_blocks_changed= 0; /* Key_blocks_not_flushed */
- key_cache->global_cache_r_requests= 0; /* Key_read_requests */
- key_cache->global_cache_read= 0; /* Key_reads */
- key_cache->global_cache_w_requests= 0; /* Key_write_requests */
- key_cache->global_cache_write= 0; /* Key_writes */
+ keycache->global_blocks_changed= 0; /* Key_blocks_not_flushed */
+ keycache->global_cache_r_requests= 0; /* Key_read_requests */
+ keycache->global_cache_read= 0; /* Key_reads */
+ keycache->global_cache_w_requests= 0; /* Key_write_requests */
+ keycache->global_cache_write= 0; /* Key_writes */
DBUG_RETURN(0);
}
@@ -4245,7 +4590,7 @@ int reset_key_cache_counters(const char
/*
Test if disk-cache is ok
*/
-static void test_key_cache(KEY_CACHE *keycache __attribute__((unused)),
+static void test_key_cache(S_KEY_CACHE_CB *keycache __attribute__((unused)),
const char *where __attribute__((unused)),
my_bool lock __attribute__((unused)))
{
@@ -4259,7 +4604,7 @@ static void test_key_cache(KEY_CACHE *ke
#define MAX_QUEUE_LEN 100
-static void keycache_dump(KEY_CACHE *keycache)
+static void keycache_dump(S_KEY_CACHE_CB *keycache)
{
FILE *keycache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w");
struct st_my_thread_var *last;
@@ -4499,7 +4844,7 @@ static int fail_hlink(HASH_LINK *hlink)
return 0; /* Let the assert fail. */
}
-static int cache_empty(KEY_CACHE *keycache)
+static int cache_empty(S_KEY_CACHE_CB *keycache)
{
int errcnt= 0;
int idx;
@@ -4537,3 +4882,1675 @@ static int cache_empty(KEY_CACHE *keycac
}
#endif
+
+/*
+ Get statistics for a simple key cache
+
+ SYNOPSIS
+ get_key_cache_statistics()
+ keycache_cb pointer to the control block of a simple key cache
+ partition_no partition number (not used)
+ key_cache_stats OUT pointer to the structure for the returned statistics
+
+ DESCRIPTION
+ This function is the implementation of the get_key_cache_statistics
+ interface function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key cache.
+ This function returns the statistical data for the key cache.
+ The parameter partition_no is not used by this function.
+
+ RETURN
+ none
+
+*/
+
+static
+void s_get_key_cache_statistics(void *keycache_cb,
+ uint partition_no __attribute__((unused)),
+ KEY_CACHE_STATISTICS *key_cache_stats)
+{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ DBUG_ENTER("s_get_key_cache_statistics");
+
+ key_cache_stats->mem_size= (longlong) keycache->key_cache_mem_size;
+ key_cache_stats->block_size= (longlong) keycache->key_cache_block_size;
+ key_cache_stats->blocks_used= keycache->blocks_used;
+ key_cache_stats->blocks_unused= keycache->blocks_unused;
+ key_cache_stats->blocks_changed= keycache->global_blocks_changed;
+ key_cache_stats->read_requests= keycache->global_cache_r_requests;
+ key_cache_stats->reads= keycache->global_cache_read;
+ key_cache_stats->write_requests= keycache->global_cache_w_requests;
+ key_cache_stats->writes= keycache->global_cache_write;
+ DBUG_VOID_RETURN;
+}
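
A usage sketch (illustrative only, not part of the patch): the returned
statistics can be turned into a miss rate. Only the KEY_CACHE_STATISTICS
fields assigned above are relied upon; 'keycache_cb' is assumed to be a
valid control block, and fprintf requires <stdio.h>.

  KEY_CACHE_STATISTICS stats;
  s_get_key_cache_statistics(keycache_cb, 0, &stats);
  if (stats.read_requests > 0)              /* avoid division by zero */
    fprintf(stderr, "key cache miss rate: %.2f%%\n",
            100.0 * (double) stats.reads / (double) stats.read_requests);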
+
+
+static size_t s_key_cache_stat_var_offsets[]=
+{
+ offsetof(S_KEY_CACHE_CB, blocks_used),
+ offsetof(S_KEY_CACHE_CB, blocks_unused),
+ offsetof(S_KEY_CACHE_CB, global_blocks_changed),
+ offsetof(S_KEY_CACHE_CB, global_cache_w_requests),
+ offsetof(S_KEY_CACHE_CB, global_cache_write),
+ offsetof(S_KEY_CACHE_CB, global_cache_r_requests),
+ offsetof(S_KEY_CACHE_CB, global_cache_read)
+};
+
+
+/*
+ Get the value of a statistical variable for a simple key cache
+
+ SYNOPSIS
+ s_get_key_cache_stat_value()
+ keycache_cb pointer to the control block of a simple key cache
+ var_no the ordered number of a statistical variable
+
+ DESCRIPTION
+ This function is the implementation of the get_key_cache_stat_value
+ interface function that is employed by simple (non-partitioned) key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type S_KEY_CACHE_CB for a simple key cache.
+ This function returns the value of the statistical variable var_no
+ for this key cache. The variables are numbered from 0 to 6.
+
+ RETURN
+ The value of the specified statistical variable
+
+*/
+
+static
+ulonglong s_get_key_cache_stat_value(void *keycache_cb, uint var_no)
+{
+ S_KEY_CACHE_CB *keycache= (S_KEY_CACHE_CB *) keycache_cb;
+ size_t var_ofs= s_key_cache_stat_var_offsets[var_no];
+ ulonglong res= 0;
+ DBUG_ENTER("s_get_key_cache_stat_value");
+
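+ /* offsets 0-2 refer to long-sized counters, 3-6 to ulonglong counters */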
+ if (var_no < 3)
+ res= (ulonglong) (*(long *) ((char *) keycache + var_ofs));
+ else
+ res= *(ulonglong *) ((char *) keycache + var_ofs);
+
+ DBUG_RETURN(res);
+}
+
+
+/*
+ The array of pointers to the key cache interface functions used for simple
+ key caches. Any simple key cache object, including those incorporated into
+ partitioned key caches, exploits this array.
+
+ The current implementation of these functions allows them to be called
+ directly from the MySQL server code. We don't do that, though.
+*/
+
+static KEY_CACHE_FUNCS s_key_cache_funcs =
+{
+ s_init_key_cache,
+ s_resize_key_cache,
+ s_change_key_cache_param,
+ s_key_cache_read,
+ s_key_cache_insert,
+ s_key_cache_write,
+ s_flush_key_blocks,
+ s_reset_key_cache_counters,
+ s_end_key_cache,
+ s_get_key_cache_statistics,
+ s_get_key_cache_stat_value
+};
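
To illustrate how such a table is meant to be consumed, here is a sketch
under assumptions: the wrapper struct and the 'resize_key_cache' slot name
are invented for the illustration; only the s_* functions above come from
the patch.

  typedef struct st_generic_key_cache
  {
    KEY_CACHE_FUNCS *funcs;   /* e.g. &s_key_cache_funcs */
    void *keycache_cb;        /* S_KEY_CACHE_CB or P_KEY_CACHE_CB */
  } GENERIC_KEY_CACHE;

  static int generic_resize_key_cache(GENERIC_KEY_CACHE *kc,
                                      uint block_size, size_t use_mem,
                                      uint division_limit,
                                      uint age_threshold)
  {
    /* one call path serves both simple and partitioned caches */
    return kc->funcs->resize_key_cache(kc->keycache_cb, block_size, use_mem,
                                       division_limit, age_threshold);
  }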
+
+
+/******************************************************************************
+ Partitioned Key Cache Module
+
+ The module contains implementations of all key cache interface functions
+ employed by partitioned key caches.
+
+ A partitioned key cache is a collection of structures for simple key caches
+ called key cache partitions. Any page from a file can be placed into a buffer
+ of only one partition. The number of the partition is calculated from
+ the file number and the position of the page in the file, and it's always the
+ same for the page. The function that maps pages into partitions takes care
+ of even distribution of pages among partitions.
+
+ Partitioned key caches mitigate one of the major problems of the simple
+ key cache: thread contention for the key cache lock (mutex). Every call
+ of a key cache interface function must acquire this lock. So threads
+ compete for this lock even when they have acquired shared locks for the
+ file and the pages they want to read from are in the key cache buffers.
+ When working with a partitioned key cache any key cache interface
+ function that needs only one page has to acquire the key cache lock only
+ for the partition the page is ascribed to. This reduces the chances that
+ threads compete for the same key cache lock. Unfortunately, if we use a
+ partitioned key cache with N partitions for B-tree indexes, we can't say
+ that the chances become N times smaller. The fact is that any index lookup
+ operation requires reading the root page which, for any index, is always
+ ascribed to the same partition. To resolve this problem we would have to
+ employ more sophisticated mechanisms of working with root pages.
+
+ Currently the number of partitions in a partitioned key cache is limited
+ to 64. We could increase this limit, but then we would also have to
+ increase accordingly the size of the bitmap dirty_part_map in the
+ MYISAM_SHARE structure.
+
+******************************************************************************/
+
+/* Control block for a partitioned key cache */
+
+typedef struct st_p_key_cache_cb
+{
+ my_bool key_cache_inited; /*<=> control block is allocated */
+ S_KEY_CACHE_CB **partition_array; /* array of the key cache partitions */
+ uint partitions; /* number of partitions in the key cache */
+ size_t key_cache_mem_size; /* specified size of the cache memory */
+ uint key_cache_block_size; /* size of the page buffer of a cache block */
+} P_KEY_CACHE_CB;
+
+static
+void p_end_key_cache(void *keycache_cb, my_bool cleanup);
+
+/*
+ Determine the partition to which the index block to read is ascribed
+
+ SYNOPSIS
+ get_key_cache_partition()
+ keycache pointer to the control block of a partitioned key cache
+ file handler for the file for the block of data to be read
+ filepos position of the block of data in the file
+
+ DESCRIPTION
+ The function determines the number of the partition in whose buffer the
+ block from 'file' at the position filepos has to be placed for reading.
+ The function returns the control block of the simple key cache for this
+ partition to the caller.
+
+ RETURN VALUE
+ The pointer to the control block of the partition to which the specified
+ file block is ascribed.
+*/
+
+static
+S_KEY_CACHE_CB *get_key_cache_partition(P_KEY_CACHE_CB *keycache,
+ File file, my_off_t filepos)
+{
+ uint i= KEYCACHE_BASE_EXPR(file, filepos) % keycache->partitions;
+ return keycache->partition_array[i];
+}
+
+
+/*
+ Determine the partition to which the index block to write is ascribed
+
+ SYNOPSIS
+ get_key_cache_partition_for_write()
+ keycache pointer to the control block of a partitioned key cache
+ file handler for the file for the block of data to be read
+ filepos position of the block of data in the file
+ dirty_part_map pointer to the bitmap of dirty partitions for the file
+
+ DESCRIPTION
+ The function determines the number of the partition in whose buffer the
+ block from 'file' at the position filepos has to be placed for writing and
+ marks the partition as dirty in the dirty_part_map bitmap.
+ The function returns the control block of the simple key cache for this
+ partition to the caller.
+
+ RETURN VALUE
+ The pointer to the control block of the partition to which the specified
+ file block is ascribed.
+*/
+
+static
+S_KEY_CACHE_CB *get_key_cache_partition_for_write(P_KEY_CACHE_CB *keycache,
+ File file, my_off_t filepos,
+ ulonglong* dirty_part_map)
+{
+ uint i= KEYCACHE_BASE_EXPR(file, filepos) % keycache->partitions;
+ *dirty_part_map|= ((ulonglong) 1) << i; /* up to 64 partitions */
+ return keycache->partition_array[i];
+}
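
For illustration (a sketch: 'keycache', 'file' and 'filepos' are assumed
to be in scope), a write path would maintain the per-file map like this:

  ulonglong dirty_part_map= 0;       /* bit i <=> partition i may be dirty */
  S_KEY_CACHE_CB *partition=
    get_key_cache_partition_for_write(keycache, file, filepos,
                                      &dirty_part_map);
  /* write through 'partition'; p_flush_key_blocks() later skips the
     partitions whose bits are clear and then zeroes the whole map */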
+
+
+/*
+ Initialize a partitioned key cache
+
+ SYNOPSIS
+ p_init_key_cache()
+ keycache_cb pointer to the control block of a partitioned key cache
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for all key cache partitions
+ division_limit division limit (may be zero)
+ age_threshold age threshold (may be zero)
+
+ DESCRIPTION
+ This function is the implementation of the init_key_cache interface function
+ that is employed by partitioned key caches.
+ The function builds and initializes an array of simple key caches, and then
+ initializes the control block structure of the type P_KEY_CACHE_CB that is
+ used for a partitioned key cache. The parameter keycache_cb is supposed to
+ point to this structure. The number of partitions in the partitioned key
+ cache to be built must be passed through the field 'partitions' of this
+ structure. The parameter key_cache_block_size specifies the size of the
+ blocks in the simple key caches to be built. The parameters
+ division_limit and age_threshold determine the initial values of those
+ characteristics of the simple key caches that are used for midpoint
+ insertion strategy. The parameter use_mem specifies the total amount of
+ memory to be allocated for the key cache blocks in all simple key caches
+ and for all auxiliary structures.
+
+ RETURN VALUE
+ total number of blocks in key cache partitions, if successful,
+ <= 0 - otherwise.
+
+ NOTES
+ If keycache->key_cache_inited != 0 then we assume that the memory for
+ the array of partitions has already been allocated.
+
+ It's assumed that no two threads call this function simultaneously
+ referring to the same key cache handle.
+*/
+
+static
+int p_init_key_cache(void *keycache_cb, uint key_cache_block_size,
+ size_t use_mem, uint division_limit,
+ uint age_threshold)
+{
+ int i;
+ size_t mem_per_cache;
+ int cnt;
+ S_KEY_CACHE_CB *partition;
+ S_KEY_CACHE_CB **partition_ptr;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ int blocks= -1;
+ DBUG_ENTER("p_init_key_cache");
+
+ keycache->key_cache_block_size= key_cache_block_size;
+
+ if (keycache->key_cache_inited)
+ partition_ptr= keycache->partition_array;
+ else
+ {
+ if (!(partition_ptr=
+ (S_KEY_CACHE_CB **) my_malloc(sizeof(S_KEY_CACHE_CB *) * partitions,
+ MYF(0))))
+ DBUG_RETURN(blocks);
+ keycache->partition_array= partition_ptr;
+ }
+
+ mem_per_cache= use_mem / partitions;
+
+ for (i= 0; i < (int) partitions; i++)
+ {
+ my_bool key_cache_inited= keycache->key_cache_inited;
+ if (key_cache_inited)
+ partition= *partition_ptr;
+ else
+ {
+ if (!(partition= (S_KEY_CACHE_CB *) my_malloc(sizeof(S_KEY_CACHE_CB),
+ MYF(0))))
+ continue;
+ partition->key_cache_inited= 0;
+ }
+
+ if ((cnt= s_init_key_cache(partition,
+ key_cache_block_size, mem_per_cache,
+ division_limit, age_threshold)) <= 0)
+ {
+ s_end_key_cache(partition, 1);
+ my_free((uchar *) partition, MYF(0));
+ partition= 0;
+ if (key_cache_inited)
+ {
+ memmove(partition_ptr, partition_ptr+1,
+ sizeof(partition_ptr)*(partitions-i-1));
+ }
+ if (i == 0)
+ {
+ i--;
+ partitions--;
+ if (partitions)
+ mem_per_cache= use_mem / partitions;
+ }
+ continue;
+ }
+
+ if (blocks < 0)
+ blocks= 0;
+ blocks+= cnt;
+ *partition_ptr++= partition;
+ }
+
+ keycache->partitions= partitions= partition_ptr-keycache->partition_array;
+ keycache->key_cache_mem_size= mem_per_cache * partitions;
+ for (i= 0; i < (int) partitions; i++)
+ keycache->partition_array[i]->hash_factor= partitions;
+
+ keycache->key_cache_inited= 1;
+
+ DBUG_RETURN(blocks);
+}
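
A minimal initialization sketch (the parameter values and the direct
my_malloc of the control block are assumptions for the illustration; in
the server the control block is managed by the generic key cache layer,
and the NULL check after my_malloc is omitted for brevity):

  P_KEY_CACHE_CB *kc= (P_KEY_CACHE_CB *) my_malloc(sizeof(P_KEY_CACHE_CB),
                                                   MYF(MY_ZEROFILL));
  int blocks;
  kc->partitions= 4;             /* must be set before p_init_key_cache() */
  blocks= p_init_key_cache(kc, 1024, (size_t) 16*1024*1024, 100, 300);
  if (blocks <= 0)
    my_free((uchar *) kc, MYF(0));   /* initialization failed */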
+
+
+/*
+ Resize a partitioned key cache
+
+ SYNOPSIS
+ p_resize_key_cache()
+ keycache_cb pointer to the control block of a partitioned key cache
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for the new key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ DESCRIPTION
+ This function is the implementation of the resize_key_cache interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for the partitioned
+ key cache to be resized.
+ The parameter key_cache_block_size specifies the new size of the blocks in
+ the simple key caches that comprise the partitioned key cache.
+ The parameters division_limit and age_threshold determine the new initial
+ values of those characteristics of the simple key cache that are used for
+ midpoint insertion strategy. The parameter use_mem specifies the total
+ amount of memory to be allocated for the key cache blocks in all new
+ simple key caches and for all auxiliary structures.
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES
+ The function first calls s_prepare_resize_key_cache for each simple
+ key cache, effectively flushing all dirty pages from it and destroying
+ the key cache. Then p_init_key_cache is called. This call builds a new
+ array of simple key caches containing the same number of elements as
+ the old one. After this the function calls s_finish_resize_key_cache
+ for each simple key cache from this array.
+
+ This implementation doesn't block the calls and executions of other
+ functions from the key cache interface. However it assumes that the
+ calls of p_resize_key_cache itself are serialized.
+
+*/
+
+static
+int p_resize_key_cache(void *keycache_cb, uint key_cache_block_size,
+ size_t use_mem, uint division_limit,
+ uint age_threshold)
+{
+ uint i;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ my_bool cleanup= use_mem == 0;
+ int blocks= -1;
+ int err= 0;
+ DBUG_ENTER("p_resize_key_cache");
+ if (use_mem == 0)
+ {
+ p_end_key_cache(keycache_cb, 0);
+ DBUG_RETURN(blocks);
+ }
+ for (i= 0; i < partitions; i++)
+ {
+ err|= s_prepare_resize_key_cache(keycache->partition_array[i], 0, 1);
+ }
+ if (!err && use_mem)
+ blocks= p_init_key_cache(keycache_cb, key_cache_block_size, use_mem,
+ division_limit, age_threshold);
+ if (blocks > 0 && !cleanup)
+ {
+ for (i= 0; i < partitions; i++)
+ {
+ s_finish_resize_key_cache(keycache->partition_array[i], 0, 1);
+ }
+ }
+ DBUG_RETURN(blocks);
+}
+
+
+/*
+ Change key cache parameters of a partitioned key cache
+
+ SYNOPSIS
+ p_change_key_cache_param()
+ keycache_cb pointer to the control block of a partitioned key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ DESCRIPTION
+ This function is the implementation of the change_key_cache_param interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for the partitioned
+ key cache where new values of the division limit and the age threshold used
+ for midpoint insertion strategy are to be set. The parameters
+ division_limit and age_threshold provide these new values.
+
+ RETURN VALUE
+ none
+
+ NOTES
+ The function just calls s_change_key_cache_param for each element of the
+ array of simple key caches that comprise the partitioned key cache.
+
+*/
+
+static
+void p_change_key_cache_param(void *keycache_cb, uint division_limit,
+ uint age_threshold)
+{
+ uint i;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ DBUG_ENTER("p_change_key_cache_param");
+ for (i= 0; i < partitions; i++)
+ {
+ s_change_key_cache_param(keycache->partition_array[i], division_limit,
+ age_threshold);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Destroy a partitioned key cache
+
+ SYNOPSIS
+ p_end_key_cache()
+ keycache_cb pointer to the control block of a partitioned key cache
+ cleanup <=> complete free (free also control block structures
+ for all simple key caches)
+
+ DESCRIPTION
+ This function is the implementation of the end_key_cache interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for the partitioned
+ key cache to be destroyed.
+ The function frees the memory allocated for the cache blocks and
+ auxiliary structures used by simple key caches that comprise the
+ partitioned key cache. If the value of the parameter cleanup is TRUE
+ then even the memory used for the control blocks of the simple key
+ caches and the array of pointers to them is freed.
+
+ RETURN VALUE
+ none
+
+*/
+
+static
+void p_end_key_cache(void *keycache_cb, my_bool cleanup)
+{
+ uint i;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ DBUG_ENTER("p_end_key_cache");
+ DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) keycache));
+
+ for (i= 0; i < partitions; i++)
+ {
+ s_end_key_cache(keycache->partition_array[i], cleanup);
+ }
+ if (cleanup)
+ {
+ for (i= 0; i < partitions; i++)
+ my_free((uchar*) keycache->partition_array[i], MYF(0));
+ my_free((uchar*) keycache->partition_array, MYF(0));
+ keycache->key_cache_inited= 0;
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Read a block of data from a partitioned key cache into a buffer
+
+ SYNOPSIS
+
+ p_key_cache_read()
+ keycache_cb pointer to the control block of a partitioned key cache
+ file handler for the file for the block of data to be read
+ filepos position of the block of data in the file
+ level determines the weight of the data
+ buff buffer to where the data must be placed
+ length length of the buffer
+ block_length length of the read data from a key cache block
+ return_buffer return pointer to the key cache buffer with the data
+
+ DESCRIPTION
+ This function is the implementation of the key_cache_read interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned
+ key cache.
+ In the general case the function reads a block of data from the key cache
+ into the buffer buff of the size specified by the parameter length. The
+ beginning of the block of data to be read is specified by the parameters
+ file and filepos. The length of the read data is the same as the length
+ of the buffer. The data is read into the buffer in key_cache_block_size
+ increments. To read each portion the function first finds out in what
+ partition of the key cache this portion (page) is to be saved, and calls
+ s_key_cache_read with the pointer to the corresponding simple key cache
+ as its first parameter.
+ If the parameter return_buffer is not ignored and its value is TRUE, and
+ the data to be read of the specified size block_length can be read from one
+ key cache buffer, then the function returns a pointer to the data in the
+ key cache buffer.
+ The function takes into account the parameters block_length and
+ return_buffer only in a single-threaded environment.
+ The parameter 'level' is used only by the midpoint insertion strategy
+ when the data or its portion cannot be found in the key cache.
+
+ RETURN VALUE
+ Returns address from where the data is placed if successful, 0 - otherwise.
+
+*/
+
+static
+uchar *p_key_cache_read(void *keycache_cb,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length __attribute__((unused)),
+ int return_buffer __attribute__((unused)))
+{
+ uint r_length;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint offset= (uint) (filepos % keycache->key_cache_block_size);
+ uchar *start= buff;
+ DBUG_ENTER("p_key_cache_read");
+ DBUG_PRINT("enter", ("fd: %u pos: %lu length: %u",
+ (uint) file, (ulong) filepos, length));
+
+#ifndef THREAD
+ if (block_length > keycache->key_cache_block_size || offset)
+ return_buffer=0;
+#endif
+
+ /* Read data in key_cache_block_size increments */
+ do
+ {
+ S_KEY_CACHE_CB *partition= get_key_cache_partition(keycache,
+ file, filepos);
+ uchar *ret_buff= 0;
+ r_length= length;
+ set_if_smaller(r_length, keycache->key_cache_block_size - offset);
+ ret_buff= s_key_cache_read((void *) partition,
+ file, filepos, level,
+ buff, r_length,
+ block_length, return_buffer);
+ if (ret_buff == 0)
+ DBUG_RETURN(0);
+#ifndef THREAD
+ /* This is only true if we were able to read everything in one block */
+ if (return_buffer)
+ DBUG_RETURN(ret_buff);
+#endif
+ filepos+= r_length;
+ buff+= r_length;
+ offset= 0;
+ } while ((length-= r_length));
+
+ DBUG_RETURN(start);
+}
+
+
+/*
+ Insert a block of file data from a buffer into a partitioned key cache
+
+ SYNOPSIS
+ p_key_cache_insert()
+ keycache_cb pointer to the control block of a partitioned key cache
+ file handler for the file to insert data from
+ filepos position of the block of data in the file to insert
+ level determines the weight of the data
+ buff buffer to read data from
+ length length of the data in the buffer
+
+ DESCRIPTION
+ This function is the implementation of the key_cache_insert interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned key
+ cache.
+ The function writes a block of file data from a buffer into the key cache.
+ The buffer is specified with the parameters buff and length - the pointer
+ to the beginning of the buffer and its size respectively. It's assumed
+ that the buffer contains the data from 'file' starting at the position
+ filepos. The data is copied from the buffer in key_cache_block_size
+ increments. For every portion of data the function finds out in what simple
+ key cache from the array of partitions the data must be stored, and after
+ this calls s_key_cache_insert to copy the data into a key buffer of this
+ simple key cache.
+ The parameter level is used to set one characteristic for the key buffers
+ loaded with the data from buff. The characteristic is used only by the
+ midpoint insertion strategy.
+
+ RETURN VALUE
+ 0 if a success, 1 - otherwise.
+
+ NOTES
+ The function is used by MyISAM to move all blocks from an index file to
+ the key cache. It can be performed in parallel with reading the file data
+ from the key buffers by other threads.
+
+*/
+
+static
+int p_key_cache_insert(void *keycache_cb,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length)
+{
+ uint w_length;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint offset= (uint) (filepos % keycache->key_cache_block_size);
+ DBUG_ENTER("p_key_cache_insert");
+ DBUG_PRINT("enter", ("fd: %u pos: %lu length: %u",
+ (uint) file,(ulong) filepos, length));
+
+
+ /* Write data in key_cache_block_size increments */
+ do
+ {
+ S_KEY_CACHE_CB *partition= get_key_cache_partition(keycache,
+ file, filepos);
+ w_length= length;
+ set_if_smaller(w_length, keycache->key_cache_block_size - offset);
+ if (s_key_cache_insert((void *) partition,
+ file, filepos, level,
+ buff, w_length))
+ DBUG_RETURN(1);
+
+ filepos+= w_length;
+ buff+= w_length;
+ offset= 0;
+ } while ((length-= w_length));
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Write data from a buffer into a partitioned key cache
+
+ SYNOPSIS
+
+ p_key_cache_write()
+ keycache_cb pointer to the control block of a partitioned key cache
+ file handler for the file to write data to
+ filepos position in the file to write data to
+ level determines the weight of the data
+ buff buffer with the data
+ length length of the buffer
+ dont_write if is 0 then all dirty pages involved in writing
+ should have been flushed from key cache
+ file_extra maps of key cache partitions containing
+ dirty pages from file
+
+ DESCRIPTION
+ This function is the implementation of the key_cache_write interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned
+ key cache.
+ In the general case the function copies data from a buffer into the key
+ cache. The buffer is specified with the parameters buff and length -
+ the pointer to the beginning of the buffer and its size respectively.
+ It's assumed the buffer contains the data to be written into 'file'
+ starting from the position filepos. The data is copied from the buffer
+ in key_cache_block_size increments. For every portion of data the
+ function finds out in what simple key cache from the array of partitions
+ the data must be stored, and after this calls s_key_cache_write to copy
+ the data into a key buffer of this simple key cache.
+ If the value of the parameter dont_write is FALSE then the function
+ also writes the data into file.
+ The parameter level is used to set one characteristic for the key buffers
+ filled with the data from buff. The characteristic is employed only by
+ the midpoint insertion strategy.
+ The parameter file_extra provides a pointer to the shared bitmap of
+ the partitions that may contain dirty pages for the file. This bitmap
+ is used to optimize the function p_flush_key_blocks.
+
+ RETURN VALUE
+ 0 if a success, 1 - otherwise.
+
+ NOTES
+ This implementation exploits the fact that the function is called only
+ when a thread has got an exclusive lock for the key file.
+
+*/
+
+static
+int p_key_cache_write(void *keycache_cb,
+ File file, void *file_extra,
+ my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length __attribute__((unused)),
+ int dont_write)
+{
+ uint w_length;
+ ulonglong *part_map= (ulonglong *) file_extra;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint offset= (uint) (filepos % keycache->key_cache_block_size);
+ DBUG_ENTER("p_key_cache_write");
+ DBUG_PRINT("enter",
+ ("fd: %u pos: %lu length: %u block_length: %u"
+ " key_block_length: %u",
+ (uint) file, (ulong) filepos, length, block_length,
+ keycache ? keycache->key_cache_block_size : 0));
+
+
+ /* Write data in key_cache_block_size increments */
+ do
+ {
+ S_KEY_CACHE_CB *partition= get_key_cache_partition_for_write(keycache,
+ file, filepos,
+ part_map);
+ w_length= length;
+ set_if_smaller(w_length, keycache->key_cache_block_size - offset);
+ if (s_key_cache_write(partition,
+ file, 0, filepos, level,
+ buff, w_length, block_length,
+ dont_write))
+ DBUG_RETURN(1);
+
+ filepos+= w_length;
+ buff+= w_length;
+ offset= 0;
+ } while ((length-= w_length));
+
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Flush all blocks for a file from key buffers of a partitioned key cache
+
+ SYNOPSIS
+
+ p_flush_key_blocks()
+ keycache_cb pointer to the control block of a partitioned key cache
+ file handler for the file to flush to
+ file_extra maps of key cache partitions containing
+ dirty pages from file (not used)
+ flush_type type of the flush operation
+
+ DESCRIPTION
+ This function is the implementation of the flush_key_blocks interface
+ function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned
+ key cache.
+ In the general case the function flushes the data from all dirty key
+ buffers related to the file 'file' into this file. The function does
+ exactly this if the value of the parameter type is FLUSH_KEEP. If the
+ value of this parameter is FLUSH_RELEASE, the function additionally
+ releases the key buffers containing data from 'file' for new usage.
+ If the value of the parameter type is FLUSH_IGNORE_CHANGED the function
+ just releases the key buffers containing data from 'file'.
+ The function performs the operation by calling s_flush_key_blocks
+ for the elements of the array of the simple key caches that comprise
+ the partitioned key_cache. If the value of the parameter type is
+ FLUSH_KEEP s_flush_key_blocks is called only for the partitions with
+ possibly dirty pages marked in the bitmap pointed to by the parameter
+ file_extra.
+
+ RETURN
+ 0 ok
+ 1 error
+
+ NOTES
+ This implementation exploits the fact that the function is called only
+ when a thread has got an exclusive lock for the key file.
+
+*/
+
+static
+int p_flush_key_blocks(void *keycache_cb,
+ File file, void *file_extra,
+ enum flush_type type)
+{
+ uint i;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ int err= 0;
+ ulonglong *dirty_part_map= (ulonglong *) file_extra;
+ DBUG_ENTER("p_flush_key_blocks");
+ DBUG_PRINT("enter", ("keycache: 0x%lx", (long) keycache));
+
+ for (i= 0; i < partitions; i++)
+ {
+ S_KEY_CACHE_CB *partition= keycache->partition_array[i];
+ if ((type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE) &&
+ !((*dirty_part_map) & (((ulonglong) 1) << i)))
+ continue;
+ err+= test(s_flush_key_blocks(partition, file, 0, type));
+ }
+ *dirty_part_map= 0;
+
+ if (err > 0)
+ err= 1;
+
+ DBUG_RETURN(err);
+}
+
+
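The bitmap optimization above is a plain bit test per partition: bit i of the shared map says whether partition i may hold dirty pages for the file. A self-contained sketch of the scheme (flush_partition() is a hypothetical stand-in for s_flush_key_blocks):

  #include <stdint.h>

  int flush_partition(unsigned i);  /* hypothetical per-partition flush */

  int flush_dirty(uint64_t *dirty_map, unsigned partitions)
  {
    int err= 0;
    for (unsigned i= 0; i < partitions; i++)
    {
      if (!(*dirty_map & ((uint64_t) 1 << i)))
        continue;                   /* partition i has no dirty pages */
      err|= flush_partition(i);
    }
    *dirty_map= 0;                  /* everything flushed: clear the map */
    return err;
  }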
+/*
+ Reset the counters of a partitioned key cache
+
+ SYNOPSIS
+ p_reset_key_cache_counters()
+ name the name of a key cache
+ keycache_cb pointer to the control block of a partitioned key cache
+
+ DESCRIPTION
+ This function is the implementation of the reset_key_cache_counters
+ interface function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned
+ key cache.
+ This function resets the values of the statistical counters of the simple
+ key caches comprising partitioned key cache to 0. It does it by calling
+ s_reset_key_cache_counters for each key cache partition.
+ The parameter name is currently not used.
+
+ RETURN
+ 0 on success (always because it can't fail)
+
+*/
+
+static
+int p_reset_key_cache_counters(const char *name __attribute__((unused)),
+ void *keycache_cb)
+{
+ uint i;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ DBUG_ENTER("p_reset_key_cache_counters");
+
+ for (i = 0; i < partitions; i++)
+ {
+ s_reset_key_cache_counters(name, keycache->partition_array[i]);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Get statistics for a partitioned key cache
+
+ SYNOPSIS
+ p_get_key_cache_statistics()
+ keycache_cb pointer to the control block of a partitioned key cache
+ partition_no partition number to get statistics for
+ key_cache_stats OUT pointer to the structure for the returned statistics
+
+ DESCRIPTION
+ This function is the implementation of the get_key_cache_statistics
+ interface function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned
+ key cache.
+ If the value of the parameter partition_no is equal to 0 then aggregated
+ statistics for all partitions are returned in the fields of the
+ structure key_cache_stats of the type KEY_CACHE_STATISTICS. Otherwise
+ the function returns data for the partition number partition_no of the
+ key cache in the structure key_cache_stats. (Here partitions are numbered
+ starting from 1.)
+
+ RETURN
+ none
+
+*/
+
+static
+void p_get_key_cache_statistics(void *keycache_cb, uint partition_no,
+ KEY_CACHE_STATISTICS *key_cache_stats)
+{
+ uint i;
+ S_KEY_CACHE_CB *partition;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ DBUG_ENTER("p_get_key_cache_statistics_");
+
+ if (partition_no != 0)
+ {
+ partition= keycache->partition_array[partition_no-1];
+ s_get_key_cache_statistics((void *) partition, 0, key_cache_stats);
+ DBUG_VOID_RETURN;
+ }
+ key_cache_stats->mem_size= (longlong) keycache->key_cache_mem_size;
+ key_cache_stats->block_size= (longlong) keycache->key_cache_block_size;
+ for (i = 0; i < partitions; i++)
+ {
+ partition= keycache->partition_array[i];
+ key_cache_stats->blocks_used+= partition->blocks_used;
+ key_cache_stats->blocks_unused+= partition->blocks_unused;
+ key_cache_stats->blocks_changed+= partition->global_blocks_changed;
+ key_cache_stats->read_requests+= partition->global_cache_r_requests;
+ key_cache_stats->reads+= partition->global_cache_read;
+ key_cache_stats->write_requests+= partition->global_cache_w_requests;
+ key_cache_stats->writes+= partition->global_cache_write;
+ }
+ DBUG_VOID_RETURN;
+}
+
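Note that the loop above accumulates the counters with += and only assigns mem_size and block_size outright, so it relies on the output structure arriving zeroed; the public wrapper get_key_cache_statistics, later in this listing, does that bzero before dispatching here. A caller-side usage sketch of the public entry point:

  KEY_CACHE_STATISTICS stats;

  /* partition_no == 0: aggregated figures for the whole cache */
  get_key_cache_statistics(keycache, 0, &stats);

  /* partition_no == 1: figures for the first partition only
     (partitions are numbered starting from 1) */
  get_key_cache_statistics(keycache, 1, &stats);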
+/*
+ Get the value of a statistical variable for a partitioned key cache
+
+ SYNOPSIS
+ p_get_key_cache_stat_value()
+ keycache_cb pointer to the control block of a partitioned key cache
+ var_no the ordered number of a statistical variable
+
+ DESCRIPTION
+ This function is the implementation of the get_key_cache_stat_value
+ interface function that is employed by partitioned key caches.
+ The function considers the parameter keycache_cb as a pointer to the
+ control block structure of the type P_KEY_CACHE_CB for a partitioned
+ key cache.
+ This function returns the value of the statistical variable var_no
+ for this key cache. The variables are numbered from 0 to 6.
+ The returned value is calculated as the sum of the values of the
+ statistical variable with number var_no for all simple key caches that
+ comprise the partitioned key cache.
+
+ RETURN
+ The value of the specified statistical variable
+
+*/
+
+static
+ulonglong p_get_key_cache_stat_value(void *keycache_cb, uint var_no)
+{
+ uint i;
+ P_KEY_CACHE_CB *keycache= (P_KEY_CACHE_CB *) keycache_cb;
+ uint partitions= keycache->partitions;
+ size_t var_ofs= s_key_cache_stat_var_offsets[var_no];
+ ulonglong res= 0;
+ DBUG_ENTER("p_get_key_cache_stat_value");
+
+ if (var_no < 3)
+ {
+ for (i = 0; i < partitions; i++)
+ {
+ S_KEY_CACHE_CB *partition= keycache->partition_array[i];
+ res+= (ulonglong) (*(long *) ((char *) partition + var_ofs));
+ }
+ }
+ else
+ {
+ for (i = 0; i < partitions; i++)
+ {
+ S_KEY_CACHE_CB *partition= keycache->partition_array[i];
+ res+= *(ulonglong *) ((char *) partition + var_ofs);
+ }
+ }
+ DBUG_RETURN(res);
+}
+
+
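The lookup above is the offset-table idiom: an array of offsetof() values maps a variable's ordinal number to a struct member, with the branch on var_no picking the right load width. A reduced sketch of the idiom (the struct and offsets are hypothetical, standing in for s_key_cache_stat_var_offsets):

  #include <stddef.h>

  struct stats
  {
    unsigned long      blocks_used;   /* a narrow counter (var_no < 3)  */
    unsigned long long reads;         /* a wide counter   (var_no >= 3) */
  };

  static const size_t var_offsets[]=
  {
    offsetof(struct stats, blocks_used),
    offsetof(struct stats, reads)
  };

  unsigned long long get_stat(const struct stats *s, unsigned var_no)
  {
    const char *base= (const char *) s;
    if (var_no == 0)
      return *(const unsigned long *) (base + var_offsets[0]);
    return *(const unsigned long long *) (base + var_offsets[1]);
  }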
+/*
+ The array of pointers to the key cache interface functions used by
+ partitioned key caches. Any partitioned key cache object exploits
+ this array.
+
+ The current implementation of these functions does not allow them to be
+ called from the MySQL server code directly. The key cache interface
+ wrappers must be used for this purpose.
+*/
+
+static KEY_CACHE_FUNCS p_key_cache_funcs =
+{
+ p_init_key_cache,
+ p_resize_key_cache,
+ p_change_key_cache_param,
+ p_key_cache_read,
+ p_key_cache_insert,
+ p_key_cache_write,
+ p_flush_key_blocks,
+ p_reset_key_cache_counters,
+ p_end_key_cache,
+ p_get_key_cache_statistics,
+ p_get_key_cache_stat_value
+};
+
+
+/******************************************************************************
+ Key Cache Interface Module
+
+ The module contains wrappers for all key cache interface functions.
+
+ Currently there are key caches of two types: simple key caches and
+ partitioned key caches. Each type (class) has its own implementation of the
+ basic key cache operations used by the MyISAM storage engine. The pointers
+ to the implementation functions are stored in two static structures of the
+ type KEY_CACHE_FUNCS: s_key_cache_funcs - for simple key caches, and
+ p_key_cache_funcs - for partitioned key caches. When a key cache object is
+ created the constructor procedure init_key_cache places a pointer to the
+ corresponding table into one of its fields. The procedure also initializes
+ a control block for the key cache object and saves the pointer to this
+ block in another field of the key cache object.
+ When a key cache wrapper function is invoked for a key cache object to
+ perform a basic key cache operation it looks into the interface table
+ associated with the key cache object and calls the corresponding
+ implementation of the operation. It passes the saved key cache control
+ block to this implementation. If, for some reason, the control block
+ has not been fully initialized yet, the wrapper function either does
+ nothing or, in the case of a read/write operation, performs it directly
+ through the system i/o functions.
+
+ As we can see, the model with which the key cache interface is supported
+ is quite conventional for interfaces in general.
+
+******************************************************************************/
+
+
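The model described above is the classic C 'virtual table' pattern: a struct of function pointers chosen once at construction time, with every wrapper forwarding through it. A minimal sketch, with all names hypothetical:

  typedef struct st_ops
  {
    int  (*init)(void *cb, unsigned block_size);
    void (*end)(void *cb);
  } OPS;

  typedef struct st_cache
  {
    const OPS *ops;   /* set by the constructor: &simple_ops or &part_ops */
    void      *cb;    /* type-specific control block */
  } CACHE;

  static int cache_init(CACHE *c, unsigned block_size)
  {
    /* Each wrapper forwards through the table with the saved block. */
    return c->ops->init(c->cb, block_size);
  }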
+/*
+ Initialize a key cache
+
+ SYNOPSIS
+ init_key_cache()
+ keycache pointer to the key cache to be initialized
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for cache buffers/structures
+ division_limit division limit (may be zero)
+ age_threshold age threshold (may be zero)
+ partitions number of partitions in the key cache
+
+ DESCRIPTION
+ The function creates a control block structure for a key cache and
+ places the pointer to this block in the structure keycache.
+ If the value of the parameter 'partitions' is 0 then a simple key cache
+ is created. Otherwise a partitioned key cache with the specified number
+ of partitions is created.
+ The parameter key_cache_block_size specifies the size of the blocks in
+ the key cache to be created. The parameters division_limit and
+ age_threshold determine the initial values of those characteristics of
+ the key cache that are used for midpoint insertion strategy. The parameter
+ use_mem specifies the total amount of memory to be allocated for the
+ key cache buffers and for all auxiliary structures.
+
+ RETURN VALUE
+ total number of blocks in key cache partitions, if successful,
+ <= 0 - otherwise.
+
+ NOTES
+ If keycache->key_cache_inited != 0 we assume that the memory
+ for the control block of the key cache has already been allocated.
+
+ It's assumed that no two threads call this function simultaneously
+ referring to the same key cache handle.
+
+*/
+
+int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
+ size_t use_mem, uint division_limit,
+ uint age_threshold, uint partitions)
+{
+ void *keycache_cb;
+ int blocks;
+ if (keycache->key_cache_inited)
+ keycache_cb= keycache->keycache_cb;
+ else
+ {
+ if (partitions == 0)
+ {
+ if (!(keycache_cb= (void *) my_malloc(sizeof(S_KEY_CACHE_CB), MYF(0))))
+ return 0;
+ ((S_KEY_CACHE_CB *) keycache_cb)->key_cache_inited= 0;
+ keycache->key_cache_type= SIMPLE_KEY_CACHE;
+ keycache->interface_funcs= &s_key_cache_funcs;
+ }
+ else
+ {
+ if (!(keycache_cb= (void *) my_malloc(sizeof(P_KEY_CACHE_CB), MYF(0))))
+ return 0;
+ ((P_KEY_CACHE_CB *) keycache_cb)->key_cache_inited= 0;
+ keycache->key_cache_type= PARTITIONED_KEY_CACHE;
+ keycache->interface_funcs= &p_key_cache_funcs;
+ }
+ keycache->keycache_cb= keycache_cb;
+ keycache->key_cache_inited= 1;
+ }
+
+ if (partitions != 0)
+ {
+ ((P_KEY_CACHE_CB *) keycache_cb)->partitions= partitions;
+ }
+ keycache->can_be_used= 0;
+ blocks= keycache->interface_funcs->init(keycache_cb, key_cache_block_size,
+ use_mem, division_limit,
+ age_threshold);
+ keycache->partitions= partitions ?
+ ((P_KEY_CACHE_CB *) keycache_cb)->partitions : 0;
+ DBUG_ASSERT(partitions <= MAX_KEY_CACHE_PARTITIONS);
+ if (blocks > 0)
+ keycache->can_be_used= 1;
+ return blocks;
+}
+
+
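For illustration, the two construction paths the function distinguishes look like this from a caller's perspective (the sizes are arbitrary example values; the KEY_CACHE objects are assumed zero-initialized so key_cache_inited starts out 0):

  KEY_CACHE simple_kc= {0};
  KEY_CACHE part_kc= {0};

  /* partitions == 0: creates a simple key cache */
  init_key_cache(&simple_kc, 1024, 16*1024*1024, 0, 0, 0);

  /* partitions == 4: creates a partitioned key cache with 4 partitions */
  init_key_cache(&part_kc, 1024, 16*1024*1024, 0, 0, 4);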
+/*
+ Resize a key cache
+
+ SYNOPSIS
+ resize_key_cache()
+ keycache pointer to the key cache to be resized
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for the new key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ DESCRIPTION
+ The function operates over the key cache keycache.
+ The parameter key_cache_block_size specifies the new size of the block
+ buffers in the key cache. The parameters division_limit and age_threshold
+ determine the new initial values of those characteristics of the key cache
+ that are used for midpoint insertion strategy. The parameter use_mem
+ specifies the total amount of memory to be allocated for the key cache
+ buffers and for all auxiliary structures.
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES
+ The function does not block the calls and executions of other functions
+ from the key cache interface. However it assumes that the calls of
+ resize_key_cache itself are serialized.
+
+ Currently the function is called when the values of the variables
+ key_buffer_size and/or key_cache_block_size are being reset for
+ the key cache keycache.
+
+*/
+
+int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
+ size_t use_mem, uint division_limit, uint age_threshold)
+{
+ int blocks= -1;
+ if (keycache->key_cache_inited)
+ {
+ if ((uint) keycache->param_partitions != keycache->partitions && use_mem)
+ blocks= repartition_key_cache (keycache,
+ key_cache_block_size, use_mem,
+ division_limit, age_threshold,
+ (uint) keycache->param_partitions);
+ else
+ {
+ blocks= keycache->interface_funcs->resize(keycache->keycache_cb,
+ key_cache_block_size,
+ use_mem, division_limit,
+ age_threshold);
+
+ if (keycache->partitions)
+ keycache->partitions=
+ ((P_KEY_CACHE_CB *)(keycache->keycache_cb))->partitions;
+ }
+ if (blocks <= 0)
+ keycache->can_be_used= 0;
+ }
+ return blocks;
+}
+
+
+/*
+ Change key cache parameters of a key cache
+
+ SYNOPSIS
+ change_key_cache_param()
+ keycache pointer to the key cache to change parameters for
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+
+ DESCRIPTION
+ The function sets new values of the division limit and the age threshold
+ used when the key cache keycache employs the midpoint insertion strategy.
+ The parameters division_limit and age_threshold provide these new values.
+
+ RETURN VALUE
+ none
+
+ NOTES
+ Currently the function is called when the values of the variables
+ key_cache_division_limit and/or key_cache_age_threshold are being reset
+ for the key cache keycache.
+
+*/
+
+void change_key_cache_param(KEY_CACHE *keycache, uint division_limit,
+ uint age_threshold)
+{
+ if (keycache->key_cache_inited)
+ {
+
+ keycache->interface_funcs->change_param(keycache->keycache_cb,
+ division_limit,
+ age_threshold);
+ }
+}
+
+
+/*
+ Destroy a key cache
+
+ SYNOPSIS
+ end_key_cache()
+ keycache pointer to the key cache to be destroyed
+ cleanup <=> complete free
+
+ DESCRIPTION
+ The function frees the memory allocated for the cache blocks and
+ auxiliary structures used by the key cache keycache. If the value
+ of the parameter cleanup is TRUE then all resources used by the key
+ cache are to be freed.
+
+ RETURN VALUE
+ none
+*/
+
+void end_key_cache(KEY_CACHE *keycache, my_bool cleanup)
+{
+ if (keycache->key_cache_inited)
+ {
+ keycache->interface_funcs->end(keycache->keycache_cb, cleanup);
+ if (cleanup)
+ {
+ if (keycache->keycache_cb)
+ {
+ my_free((uchar *) keycache->keycache_cb, MYF(0));
+ keycache->keycache_cb= 0;
+ }
+ keycache->key_cache_inited= 0;
+ }
+ keycache->can_be_used= 0;
+ }
+}
+
+
+/*
+ Read a block of data from a key cache into a buffer
+
+ SYNOPSIS
+
+ key_cache_read()
+ keycache pointer to the key cache to read data from
+ file handler for the file for the block of data to be read
+ filepos position of the block of data in the file
+ level determines the weight of the data
+ buff buffer to where the data must be placed
+ length length of the buffer
+ block_length length of the data read from a key cache block
+ return_buffer return pointer to the key cache buffer with the data
+
+ DESCRIPTION
+ The function operates over buffers of the key cache keycache.
+ In a general case the function reads a block of data from the key cache
+ into the buffer buff of the size specified by the parameter length. The
+ beginning of the block of data to be read is specified by the parameters
+ file and filepos. The length of the read data is the same as the length
+ of the buffer.
+ If the parameter return_buffer is not ignored and its value is TRUE, and
+ the data to be read, of the specified size block_length, can be read from
+ a single key cache buffer, then the function returns a pointer to the data
+ in that key cache buffer.
+ The parameter 'level' is used only by the midpoint insertion strategy
+ when the data or its portion cannot be found in the key cache.
+ The function reads data into the buffer directly from file if the control
+ block of the key cache has not been initialized yet.
+
+ RETURN VALUE
+ Returns the address where the data is placed if successful, 0 - otherwise.
+
+ NOTES
+ Filepos must be a multiple of 'block_length', but it doesn't
+ have to be a multiple of key_cache_block_size.
+*/
+
+uchar *key_cache_read(KEY_CACHE *keycache,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length, int return_buffer)
+{
+ if (keycache->key_cache_inited && keycache->can_be_used)
+ return keycache->interface_funcs->read(keycache->keycache_cb,
+ file, filepos, level,
+ buff, length,
+ block_length, return_buffer);
+
+ /* We can't use mutex here as the key cache may not be initialized */
+ keycache->global_cache_r_requests++;
+ keycache->global_cache_read++;
+
+ if (my_pread(file, (uchar*) buff, length, filepos, MYF(MY_NABP)))
+ return (uchar *) 0;
+
+ return buff;
+}
+
+
+/*
+ Insert a block of file data from a buffer into a key cache
+
+ SYNOPSIS
+ key_cache_insert()
+ keycache pointer to the key cache to insert data into
+ file handler for the file to insert data from
+ filepos position of the block of data in the file to insert
+ level determines the weight of the data
+ buff buffer to read data from
+ length length of the data in the buffer
+
+ DESCRIPTION
+ The function operates over buffers of the key cache keycache.
+ The function writes a block of file data from a buffer into the key cache.
+ The buffer is specified with the parameters buff and length - the pointer
+ to the beginning of the buffer and its size respectively. It's assumed
+ that the buffer contains the data from 'file' allocated from the position
+ filepos.
+ The parameter level is used to set one characteristic for the key buffers
+ loaded with the data from buff. The characteristic is used only by the
+ midpoint insertion strategy.
+
+ RETURN VALUE
+ 0 on success, 1 - otherwise.
+
+ NOTES
+ The function is used by MyISAM to move all blocks from an index file to
+ the key cache.
+ It is assumed that it may be performed in parallel with reading the file
+ data from the key buffers by other threads.
+
+*/
+
+int key_cache_insert(KEY_CACHE *keycache,
+ File file, my_off_t filepos, int level,
+ uchar *buff, uint length)
+{
+ if (keycache->key_cache_inited && keycache->can_be_used)
+ return keycache->interface_funcs->insert(keycache->keycache_cb,
+ file, filepos, level,
+ buff, length);
+ return 0;
+}
+
+
+/*
+ Write data from a buffer into a key cache
+
+ SYNOPSIS
+
+ key_cache_write()
+ keycache pointer to the key cache to write data to
+ file handler for the file to write data to
+ filepos position in the file to write data to
+ level determines the weight of the data
+ buff buffer with the data
+ length length of the buffer
+ dont_write if it is 0 then all dirty pages involved in writing
+ should have been flushed from key cache
+ file_extra pointer to optional file attributes
+
+ DESCRIPTION
+ The function operates over buffers of the key cache keycache.
+ In a general case the function writes data from a buffer into the key
+ cache. The buffer is specified with the parameters buff and length -
+ the pointer to the beginning of the buffer and its size respectively.
+ It's assumed the buffer contains the data to be written into 'file'
+ starting from the position filepos.
+ If the value of the parameter dont_write is FALSE then the function
+ also writes the data into file.
+ The parameter level is used to set one characteristic for the key buffers
+ filled with the data from buff. The characteristic is employed only by
+ the midpoint insertion strategy.
+ The parameter file_extra may point to additional file attributes used
+ for optimization or other purposes.
+ The function writes data from the buffer directly into file if the control
+ block of the key cache has not been initialized yet.
+
+ RETURN VALUE
+ 0 on success, 1 - otherwise.
+
+ NOTES
+ This implementation may exploit the fact that the function is called only
+ when a thread has got an exclusive lock for the key file.
+
+*/
+
+int key_cache_write(KEY_CACHE *keycache,
+ File file, void *file_extra,
+ my_off_t filepos, int level,
+ uchar *buff, uint length,
+ uint block_length, int force_write)
+{
+ if (keycache->key_cache_inited && keycache->can_be_used)
+ return keycache->interface_funcs->write(keycache->keycache_cb,
+ file, file_extra,
+ filepos, level,
+ buff, length,
+ block_length, force_write);
+
+ /* We can't use mutex here as the key cache may not be initialized */
+ keycache->global_cache_w_requests++;
+ keycache->global_cache_write++;
+ if (my_pwrite(file, buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL)))
+ return 1;
+
+ return 0;
+}
+
+
+/*
+ Flush all blocks for a file from key buffers of a key cache
+
+ SYNOPSIS
+
+ flush_key_blocks()
+ keycache pointer to the key cache whose blocks are to be flushed
+ file handler for the file to flush to
+ file_extra map of the key cache partitions containing dirty
+ pages from file (used for partitioned key caches)
+ flush_type type of the flush operation
+
+ DESCRIPTION
+ The function operates over buffers of the key cache keycache.
+ In a general case the function flushes the data from all dirty key
+ buffers related to the file 'file' into this file. The function does
+ exactly this if the value of the parameter type is FLUSH_KEEP. If the
+ value of this parameter is FLUSH_RELEASE, the function additionally
+ releases the key buffers containing data from 'file' for new usage.
+ If the value of the parameter type is FLUSH_IGNORE_CHANGED the function
+ just releases the key buffers containing data from 'file'.
+ If the value of the parameter type is FLUSH_KEEP the function may use
+ the value of the parameter file_extra pointing to possibly dirty
+ partitions to optimize the operation for partitioned key caches.
+
+ RETURN
+ 0 ok
+ 1 error
+
+ NOTES
+ Any implementation of the function may exploit the fact that the function
+ is called only when a thread has got an exclusive lock for the key file.
+
+*/
+
+int flush_key_blocks(KEY_CACHE *keycache,
+ int file, void *file_extra,
+ enum flush_type type)
+{
+ if (keycache->key_cache_inited)
+ return keycache->interface_funcs->flush(keycache->keycache_cb,
+ file, file_extra, type);
+ return 0;
+}
+
+
+/*
+ Reset the counters of a key cache
+
+ SYNOPSIS
+ reset_key_cache_counters()
+ name the name of a key cache (unused)
+ keycache pointer to the key cache for which to reset counters
+
+ DESCRIPTION
+ This function resets the values of the statistical counters for the key
+ cache keycache.
+ The parameter name is currently not used.
+
+ RETURN
+ 0 on success (always because it can't fail)
+
+ NOTES
+ This procedure is used by process_key_caches() to reset the counters of all
+ currently used key caches, both the default one and the named ones.
+
+*/
+
+int reset_key_cache_counters(const char *name __attribute__((unused)),
+ KEY_CACHE *keycache)
+{
+ if (keycache->key_cache_inited)
+ {
+
+ return keycache->interface_funcs->reset_counters(name,
+ keycache->keycache_cb);
+ }
+ return 0;
+}
+
+
+/*
+ Get statistics for a key cache
+
+ SYNOPSIS
+ get_key_cache_statistics()
+ keycache pointer to the key cache to get statistics for
+ partition_no partition number to get statistics for
+ key_cache_stats OUT pointer to the structure for the returned statistics
+
+ DESCRIPTION
+ If the value of the parameter partition_no is equal to 0 then statistics
+ for the whole key cache keycache (aggregated statistics) are returned in the
+ fields of the structure key_cache_stats of the type KEY_CACHE_STATISTICS.
+ Otherwise the value of the parameter partition_no makes sense only for
+ a partitioned key cache. In this case the function returns statistics
+ for the partition with the specified number partition_no.
+
+ RETURN
+ none
+
+*/
+
+void get_key_cache_statistics(KEY_CACHE *keycache, uint partition_no,
+ KEY_CACHE_STATISTICS *key_cache_stats)
+{
+ bzero(key_cache_stats, sizeof(KEY_CACHE_STATISTICS));
+ if (keycache->key_cache_inited)
+ {
+ keycache->interface_funcs->get_stats(keycache->keycache_cb,
+ partition_no, key_cache_stats);
+ }
+}
+
+
+/*
+ Get the value of a statistical variable for a key cache
+
+ SYNOPSIS
+ get_key_cache_stat_value()
+ keycache pointer to the key cache to get statistics for
+ var_no the ordered number of a statistical variable
+
+ DESCRIPTION
+ This function returns the value of the statistical variable var_no for
+ the key cache keycache. The variables are numbered from 0 to 6.
+
+ RETURN
+ The value of the specified statistical variable.
+
+ NOTES
+ Currently for any key cache the function can return values for the
+ following 7 statistical variables:
+
+ Name Number
+
+ blocks_used 0
+ blocks_unused 1
+ blocks_changed 2
+ read_requests 3
+ reads 4
+ write_requests 5
+ writes 6
+
+*/
+
+ulonglong get_key_cache_stat_value(KEY_CACHE *keycache, uint var_no)
+{
+ if (keycache->key_cache_inited)
+ {
+ return keycache->interface_funcs->get_stat_val(keycache->keycache_cb,
+ var_no);
+ }
+ else
+ return 0;
+}
+
+
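Since the numbering in the table above is fixed by the order of the counters, a caller-side enum keeps the ordinals readable; the enum below is illustrative only, not part of the interface:

  enum key_cache_stat_no
  {
    KC_BLOCKS_USED= 0, KC_BLOCKS_UNUSED, KC_BLOCKS_CHANGED,
    KC_READ_REQUESTS, KC_READS, KC_WRITE_REQUESTS, KC_WRITES
  };

  /* e.g. the two figures needed for a read hit ratio: */
  ulonglong reads= get_key_cache_stat_value(keycache, KC_READS);
  ulonglong requests= get_key_cache_stat_value(keycache, KC_READ_REQUESTS);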
+/*
+ Repartition a key cache
+
+ SYNOPSIS
+ repartition_key_cache()
+ keycache pointer to the key cache to be repartitioned
+ key_cache_block_size size of blocks to keep cached data
+ use_mem total memory to use for the new key cache
+ division_limit new division limit (if not zero)
+ age_threshold new age threshold (if not zero)
+ partitions new number of partitions in the key cache
+
+ DESCRIPTION
+ The function operates over the key cache keycache.
+ The parameter partitions specifies the number of partitions in the key
+ cache after repartitioning. If the value of this parameter is 0 then
+ a simple key cache must be created instead of the old one.
+ The parameter key_cache_block_size specifies the new size of the block
+ buffers in the key cache. The parameters division_limit and age_threshold
+ determine the new initial values of those characteristics of the key cache
+ that are used for midpoint insertion strategy. The parameter use_mem
+ specifies the total amount of memory to be allocated for the new key
+ cache buffers and for all auxiliary structures.
+
+ RETURN VALUE
+ number of blocks in the key cache, if successful,
+ 0 - otherwise.
+
+ NOTES
+ The function does not block the calls and executions of other functions
+ from the key cache interface. However it assumes that the calls of
+ repartition_key_cache itself are serialized.
+
+ Currently the function is called when the value of the variable
+ key_cache_partitions is being reset for the key cache keycache.
+
+*/
+
+int repartition_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
+ size_t use_mem, uint division_limit,
+ uint age_threshold, uint partitions)
+{
+ int blocks= -1;
+ if (keycache->key_cache_inited)
+ {
+ keycache->interface_funcs->resize(keycache->keycache_cb,
+ key_cache_block_size, 0,
+ division_limit, age_threshold);
+ end_key_cache(keycache, 1);
+ blocks= init_key_cache(keycache, key_cache_block_size, use_mem,
+ division_limit, age_threshold, partitions);
+ }
+ return blocks;
+}
+
=== modified file 'sql/handler.cc'
--- a/sql/handler.cc 2010-02-01 06:14:12 +0000
+++ b/sql/handler.cc 2010-02-16 16:41:11 +0000
@@ -3828,11 +3828,13 @@ int ha_init_key_cache(const char *name,
uint tmp_block_size= (uint) key_cache->param_block_size;
uint division_limit= key_cache->param_division_limit;
uint age_threshold= key_cache->param_age_threshold;
+ uint partitions= key_cache->param_partitions;
pthread_mutex_unlock(&LOCK_global_system_variables);
DBUG_RETURN(!init_key_cache(key_cache,
tmp_block_size,
tmp_buff_size,
- division_limit, age_threshold));
+ division_limit, age_threshold,
+ partitions));
}
DBUG_RETURN(0);
}
@@ -3862,10 +3864,12 @@ int ha_resize_key_cache(KEY_CACHE *key_c
/**
- Change parameters for key cache (like size)
+ Change parameters for key cache (like division_limit)
*/
int ha_change_key_cache_param(KEY_CACHE *key_cache)
{
+ DBUG_ENTER("ha_change_key_cache_param");
+
if (key_cache->key_cache_inited)
{
pthread_mutex_lock(&LOCK_global_system_variables);
@@ -3874,9 +3878,35 @@ int ha_change_key_cache_param(KEY_CACHE
pthread_mutex_unlock(&LOCK_global_system_variables);
change_key_cache_param(key_cache, division_limit, age_threshold);
}
- return 0;
+ DBUG_RETURN(0);
}
+
+/**
+ Repartition key cache
+*/
+int ha_repartition_key_cache(KEY_CACHE *key_cache)
+{
+ DBUG_ENTER("ha_repartition_key_cache");
+
+ if (key_cache->key_cache_inited)
+ {
+ pthread_mutex_lock(&LOCK_global_system_variables);
+ size_t tmp_buff_size= (size_t) key_cache->param_buff_size;
+ long tmp_block_size= (long) key_cache->param_block_size;
+ uint division_limit= key_cache->param_division_limit;
+ uint age_threshold= key_cache->param_age_threshold;
+ uint partitions= key_cache->param_partitions;
+ pthread_mutex_unlock(&LOCK_global_system_variables);
+ DBUG_RETURN(!repartition_key_cache(key_cache, tmp_block_size,
+ tmp_buff_size,
+ division_limit, age_threshold,
+ partitions));
+ }
+ DBUG_RETURN(0);
+}
+
+
/**
Free memory allocated by a key cache.
*/
=== modified file 'sql/handler.h'
--- a/sql/handler.h 2010-02-01 06:14:12 +0000
+++ b/sql/handler.h 2010-02-16 16:41:11 +0000
@@ -2188,6 +2188,7 @@ int ha_table_exists_in_engine(THD* thd,
extern "C" int ha_init_key_cache(const char *name, KEY_CACHE *key_cache);
int ha_resize_key_cache(KEY_CACHE *key_cache);
int ha_change_key_cache_param(KEY_CACHE *key_cache);
+int ha_repartition_key_cache(KEY_CACHE *key_cache);
int ha_change_key_cache(KEY_CACHE *old_key_cache, KEY_CACHE *new_key_cache);
int ha_end_key_cache(KEY_CACHE *key_cache);
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc 2010-02-12 08:47:31 +0000
+++ b/sql/mysqld.cc 2010-02-16 16:41:11 +0000
@@ -5825,6 +5825,7 @@ enum options_mysqld
OPT_INTERACTIVE_TIMEOUT, OPT_JOIN_BUFF_SIZE,
OPT_KEY_BUFFER_SIZE, OPT_KEY_CACHE_BLOCK_SIZE,
OPT_KEY_CACHE_DIVISION_LIMIT, OPT_KEY_CACHE_AGE_THRESHOLD,
+ OPT_KEY_CACHE_PARTITIONS,
OPT_LONG_QUERY_TIME,
OPT_LOWER_CASE_TABLE_NAMES, OPT_MAX_ALLOWED_PACKET,
OPT_MAX_BINLOG_CACHE_SIZE, OPT_MAX_BINLOG_SIZE,
@@ -6915,6 +6916,12 @@ log and this option does nothing anymore
(uchar**) 0,
0, (GET_ULONG | GET_ASK_ADDR) , REQUIRED_ARG, 100,
1, 100, 0, 1, 0},
+ {"key_cache_partitions", OPT_KEY_CACHE_PARTITIONS,
+ "The number of partitions in key cache",
+ (uchar**) &dflt_key_cache_var.param_partitions,
+ (uchar**) 0,
+ 0, (GET_ULONG | GET_ASK_ADDR), REQUIRED_ARG, DEFAULT_KEY_CACHE_PARTITIONS,
+ 0, MAX_KEY_CACHE_PARTITIONS, 0, 1, 0},
{"log-slow-filter", OPT_LOG_SLOW_FILTER,
"Log only the queries that followed certain execution plan. Multiple flags allowed in a comma-separated string. [admin, filesort, filesort_on_disk, full_join, full_scan, query_cache, query_cache_miss, tmp_table, tmp_table_on_disk]. Sets log-slow-admin-command to ON",
0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, QPLAN_ALWAYS_SET, 0, 0},
@@ -8837,6 +8844,7 @@ mysql_getopt_value(const char *keyname,
case OPT_KEY_CACHE_BLOCK_SIZE:
case OPT_KEY_CACHE_DIVISION_LIMIT:
case OPT_KEY_CACHE_AGE_THRESHOLD:
+ case OPT_KEY_CACHE_PARTITIONS:
{
KEY_CACHE *key_cache;
if (!(key_cache= get_or_create_key_cache(keyname, key_length)))
@@ -8854,6 +8862,8 @@ mysql_getopt_value(const char *keyname,
return (uchar**) &key_cache->param_division_limit;
case OPT_KEY_CACHE_AGE_THRESHOLD:
return (uchar**) &key_cache->param_age_threshold;
+ case OPT_KEY_CACHE_PARTITIONS:
+ return (uchar**) &key_cache->param_partitions;
}
}
}
=== modified file 'sql/set_var.cc'
--- a/sql/set_var.cc 2010-02-01 06:14:12 +0000
+++ b/sql/set_var.cc 2010-02-16 16:41:11 +0000
@@ -317,15 +317,18 @@ static sys_var_thd_ulong sys_interactive
static sys_var_thd_ulong sys_join_buffer_size(&vars, "join_buffer_size",
&SV::join_buff_size);
static sys_var_key_buffer_size sys_key_buffer_size(&vars, "key_buffer_size");
-static sys_var_key_cache_long sys_key_cache_block_size(&vars, "key_cache_block_size",
- offsetof(KEY_CACHE,
- param_block_size));
-static sys_var_key_cache_long sys_key_cache_division_limit(&vars, "key_cache_division_limit",
- offsetof(KEY_CACHE,
- param_division_limit));
-static sys_var_key_cache_long sys_key_cache_age_threshold(&vars, "key_cache_age_threshold",
- offsetof(KEY_CACHE,
- param_age_threshold));
+static sys_var_key_cache_long sys_key_cache_block_size(&vars,
+ "key_cache_block_size",
+ offsetof(KEY_CACHE,param_block_size));
+static sys_var_key_cache_long sys_key_cache_division_limit(&vars,
+ "key_cache_division_limit",
+ offsetof(KEY_CACHE, param_division_limit));
+static sys_var_key_cache_long sys_key_cache_age_threshold(&vars,
+ "key_cache_age_threshold",
+ offsetof(KEY_CACHE, param_age_threshold));
+static sys_var_key_cache_long sys_key_cache_partitions(&vars,
+ "key_cache_partitions",
+ offsetof(KEY_CACHE, param_partitions));
static sys_var_const sys_language(&vars, "language",
OPT_GLOBAL, SHOW_CHAR,
(uchar*) language);
@@ -2540,7 +2543,21 @@ bool sys_var_key_cache_long::update(THD
pthread_mutex_unlock(&LOCK_global_system_variables);
- error= (bool) (ha_resize_key_cache(key_cache));
+ switch (offset) {
+
+ case offsetof(KEY_CACHE, param_block_size):
+ error= (bool) (ha_resize_key_cache(key_cache));
+ break;
+
+ case offsetof(KEY_CACHE, param_division_limit):
+ case offsetof(KEY_CACHE, param_age_threshold):
+ error= (bool) (ha_change_key_cache_param(key_cache));
+ break;
+
+ case offsetof(KEY_CACHE, param_partitions):
+ error= (bool) (ha_repartition_key_cache(key_cache));
+ break;
+ }
pthread_mutex_lock(&LOCK_global_system_variables);
key_cache->in_init= 0;
@@ -4142,6 +4159,7 @@ static KEY_CACHE *create_key_cache(const
key_cache->param_block_size= dflt_key_cache_var.param_block_size;
key_cache->param_division_limit= dflt_key_cache_var.param_division_limit;
key_cache->param_age_threshold= dflt_key_cache_var.param_age_threshold;
+ key_cache->param_partitions= dflt_key_cache_var.param_partitions;
}
}
DBUG_RETURN(key_cache);
=== modified file 'sql/set_var.h'
--- a/sql/set_var.h 2009-12-03 11:19:05 +0000
+++ b/sql/set_var.h 2010-02-16 16:41:11 +0000
@@ -1427,6 +1427,7 @@ public:
my_free((uchar*) name, MYF(0));
}
friend bool process_key_caches(process_key_cache_t func);
+ friend int fill_key_cache_tables(THD *thd, TABLE_LIST *tables, COND *cond);
friend void delete_elements(I_List<NAMED_LIST> *list,
void (*free_element)(const char*, uchar*));
};
=== modified file 'sql/sql_show.cc'
--- a/sql/sql_show.cc 2010-02-01 06:14:12 +0000
+++ b/sql/sql_show.cc 2010-02-16 16:41:11 +0000
@@ -2220,6 +2220,31 @@ void remove_status_vars(SHOW_VAR *list)
}
+
+static void update_key_cache_stat_var(KEY_CACHE *key_cache, size_t ofs)
+{
+ uint var_no;
+ switch (ofs) {
+ case offsetof(KEY_CACHE, blocks_used):
+ case offsetof(KEY_CACHE, blocks_unused):
+ case offsetof(KEY_CACHE, global_blocks_changed):
+ var_no= (ofs-offsetof(KEY_CACHE, blocks_used))/sizeof(ulong);
+ *(ulong *)((char *) key_cache + ofs)=
+ (ulong) get_key_cache_stat_value(key_cache, var_no);
+ break;
+ case offsetof(KEY_CACHE, global_cache_r_requests):
+ case offsetof(KEY_CACHE, global_cache_read):
+ case offsetof(KEY_CACHE, global_cache_w_requests):
+ case offsetof(KEY_CACHE, global_cache_write):
+ var_no= 3+(ofs-offsetof(KEY_CACHE, global_cache_w_requests))/
+ sizeof(ulonglong);
+ *(ulonglong *)((char *) key_cache + ofs)=
+ get_key_cache_stat_value(key_cache, var_no);
+ break;
+ }
+}
+
+
static bool show_status_array(THD *thd, const char *wild,
SHOW_VAR *variables,
enum enum_var_type value_type,
@@ -2352,10 +2377,12 @@ static bool show_status_array(THD *thd,
break;
}
case SHOW_KEY_CACHE_LONG:
+ update_key_cache_stat_var(dflt_key_cache, (size_t) value);
value= (char*) dflt_key_cache + (ulong)value;
end= int10_to_str(*(long*) value, buff, 10);
break;
case SHOW_KEY_CACHE_LONGLONG:
+ update_key_cache_stat_var(dflt_key_cache, (size_t) value);
value= (char*) dflt_key_cache + (ulong)value;
end= longlong10_to_str(*(longlong*) value, buff, 10);
break;
@@ -6611,6 +6638,90 @@ int fill_schema_files(THD *thd, TABLE_LI
}
+static
+int store_key_cache_table_record(THD *thd, TABLE *table,
+ const char *name, uint name_length,
+ KEY_CACHE *key_cache,
+ uint partitions, uint partition_no)
+{
+ KEY_CACHE_STATISTICS key_cache_stats;
+ uint err;
+ DBUG_ENTER("store_key_cache_table_record");
+
+ get_key_cache_statistics(key_cache, partition_no, &key_cache_stats);
+
+ if (key_cache_stats.mem_size == 0)
+ DBUG_RETURN(0);
+
+ restore_record(table, s->default_values);
+ table->field[0]->store(name, name_length, system_charset_info);
+ if (partitions == 0)
+ table->field[1]->set_null();
+ else
+ {
+ table->field[1]->set_notnull();
+ table->field[1]->store((long) partitions, TRUE);
+ }
+
+ if (partition_no == 0)
+ table->field[2]->set_null();
+ else
+ {
+ table->field[2]->set_notnull();
+ table->field[2]->store((long) partition_no, TRUE);
+ }
+ table->field[3]->store(key_cache_stats.mem_size, TRUE);
+ table->field[4]->store(key_cache_stats.block_size, TRUE);
+ table->field[5]->store(key_cache_stats.blocks_used, TRUE);
+ table->field[6]->store(key_cache_stats.blocks_unused, TRUE);
+ table->field[7]->store(key_cache_stats.blocks_changed, TRUE);
+ table->field[8]->store(key_cache_stats.read_requests, TRUE);
+ table->field[9]->store(key_cache_stats.reads, TRUE);
+ table->field[10]->store(key_cache_stats.write_requests, TRUE);
+ table->field[11]->store(key_cache_stats.writes, TRUE);
+
+ err= schema_table_store_record(thd, table);
+ DBUG_RETURN(err);
+}
+
+
+int fill_key_cache_tables(THD *thd, TABLE_LIST *tables, COND *cond)
+{
+ TABLE *table= tables->table;
+ I_List_iterator<NAMED_LIST> it(key_caches);
+ NAMED_LIST *element;
+ DBUG_ENTER("fill_key_cache_tables");
+
+ while ((element= it++))
+ {
+ KEY_CACHE *key_cache= (KEY_CACHE *) element->data;
+
+ if (!key_cache->key_cache_inited)
+ continue;
+
+ uint partitions= key_cache->partitions;
+ DBUG_ASSERT(partitions <= MAX_KEY_CACHE_PARTITIONS);
+
+ if (partitions)
+ {
+ for (uint i= 0; i < partitions; i++)
+ {
+ if (store_key_cache_table_record(thd, table,
+ element->name, element->name_length,
+ key_cache, partitions, i+1))
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (store_key_cache_table_record(thd, table,
+ element->name, element->name_length,
+ key_cache, partitions, 0))
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
ST_FIELD_INFO schema_fields_info[]=
{
{"CATALOG_NAME", FN_REFLEN, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
@@ -7188,6 +7299,35 @@ ST_FIELD_INFO referential_constraints_fi
};
+ST_FIELD_INFO keycache_fields_info[]=
+{
+ {"KEY_CACHE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE},
+ {"PARTITIONS", 3, MYSQL_TYPE_LONG, 0,
+ (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED) , 0, SKIP_OPEN_TABLE},
+ {"PARTITION_NUMBER", 3, MYSQL_TYPE_LONG, 0,
+ (MY_I_S_MAYBE_NULL | MY_I_S_UNSIGNED), 0, SKIP_OPEN_TABLE},
+ {"FULL_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), 0, SKIP_OPEN_TABLE},
+ {"BLOCK_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), 0, SKIP_OPEN_TABLE },
+ {"USED_BLOCKS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_blocks_used", SKIP_OPEN_TABLE},
+ {"UNUSED_BLOCKS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_blocks_unused", SKIP_OPEN_TABLE},
+ {"DIRTY_BLOCKS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_blocks_not_flushed", SKIP_OPEN_TABLE},
+ {"READ_REQUESTS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_read_requests", SKIP_OPEN_TABLE},
+ {"READS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_reads", SKIP_OPEN_TABLE},
+ {"WRITE_REQUESTS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_write_requests", SKIP_OPEN_TABLE},
+ {"WRITES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONGLONG, 0,
+ (MY_I_S_UNSIGNED), "Key_writes", SKIP_OPEN_TABLE},
+ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE}
+};
+
+
/*
Description of ST_FIELD_INFO in table.h
@@ -7227,6 +7367,8 @@ ST_SCHEMA_TABLE schema_tables[]=
fill_variables, make_old_format, 0, 0, -1, 0, 0},
{"INDEX_STATISTICS", index_stats_fields_info, create_schema_table,
fill_schema_index_stats, make_old_format, 0, -1, -1, 0, 0},
+ {"KEY_CACHES", keycache_fields_info, create_schema_table,
+ fill_key_cache_tables, make_old_format, 0, -1,-1, 0, 0},
{"KEY_COLUMN_USAGE", key_column_usage_fields_info, create_schema_table,
get_all_tables, 0, get_schema_key_column_usage_record, 4, 5, 0,
OPEN_TABLE_ONLY},
=== modified file 'sql/sql_test.cc'
--- a/sql/sql_test.cc 2009-09-07 20:50:10 +0000
+++ b/sql/sql_test.cc 2010-02-16 16:41:11 +0000
@@ -435,7 +435,8 @@ static int print_key_cache_status(const
Buffer_size: %10lu\n\
Block_size: %10lu\n\
Division_limit: %10lu\n\
-Age_limit: %10lu\n\
+Age_threshold: %10lu\n\
+Partitions: %10lu\n\
blocks used: %10lu\n\
not flushed: %10lu\n\
w_requests: %10s\n\
@@ -445,6 +446,7 @@ reads: %10s\n\n",
name,
(ulong) key_cache->param_buff_size, key_cache->param_block_size,
key_cache->param_division_limit, key_cache->param_age_threshold,
+ key_cache->param_partitions,
key_cache->blocks_used,key_cache->global_blocks_changed,
llstr(key_cache->global_cache_w_requests,llbuff1),
llstr(key_cache->global_cache_write,llbuff2),
=== modified file 'sql/table.h'
--- a/sql/table.h 2010-02-12 08:47:31 +0000
+++ b/sql/table.h 2010-02-16 16:41:11 +0000
@@ -953,6 +953,7 @@ enum enum_schema_tables
SCH_GLOBAL_STATUS,
SCH_GLOBAL_VARIABLES,
SCH_INDEX_STATS,
+ SCH_KEY_CACHES,
SCH_KEY_COLUMN_USAGE,
SCH_OPEN_TABLES,
SCH_PARTITIONS,
=== modified file 'storage/myisam/mi_check.c'
--- a/storage/myisam/mi_check.c 2010-01-14 16:51:00 +0000
+++ b/storage/myisam/mi_check.c 2010-02-16 16:41:11 +0000
@@ -332,7 +332,8 @@ int chk_size(HA_CHECK *param, register M
/* The following is needed if called externally (not from myisamchk) */
flush_key_blocks(info->s->key_cache,
- info->s->kfile, FLUSH_FORCE_WRITE);
+ info->s->kfile, &info->s->dirty_part_map,
+ FLUSH_FORCE_WRITE);
size= my_seek(info->s->kfile, 0L, MY_SEEK_END, MYF(MY_THREADSAFE));
if ((skr=(my_off_t) info->state->key_file_length) != size)
@@ -1474,6 +1475,7 @@ static int mi_drop_all_indexes(HA_CHECK
*/
DBUG_PRINT("repair", ("all disabled are empty: create missing"));
error= flush_key_blocks(share->key_cache, share->kfile,
+ &share->dirty_part_map,
FLUSH_FORCE_WRITE);
goto end;
}
@@ -1488,6 +1490,7 @@ static int mi_drop_all_indexes(HA_CHECK
/* Remove all key blocks of this index file from key cache. */
if ((error= flush_key_blocks(share->key_cache, share->kfile,
+ &share->dirty_part_map,
FLUSH_IGNORE_CHANGED)))
goto end; /* purecov: inspected */
@@ -1549,7 +1552,7 @@ int mi_repair(HA_CHECK *param, register
if (!param->using_global_keycache)
VOID(init_key_cache(dflt_key_cache, param->key_cache_block_size,
- (size_t) param->use_buffers, 0, 0));
+ (size_t) param->use_buffers, 0, 0, 0));
if (init_io_cache(¶m->read_cache,info->dfile,
(uint) param->read_buffer_length,
@@ -1762,7 +1765,8 @@ err:
VOID(end_io_cache(¶m->read_cache));
info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
VOID(end_io_cache(&info->rec_cache));
- got_error|=flush_blocks(param, share->key_cache, share->kfile);
+ got_error|=flush_blocks(param, share->key_cache, share->kfile,
+ &share->dirty_part_map);
if (!got_error && param->testflag & T_UNPACK)
{
share->state.header.options[0]&= (uchar) ~HA_OPTION_COMPRESS_RECORD;
@@ -1908,9 +1912,10 @@ void lock_memory(HA_CHECK *param __attri
/* Flush all changed blocks to disk */
-int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file)
+int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file,
+ ulonglong *dirty_part_map)
{
- if (flush_key_blocks(key_cache, file, FLUSH_RELEASE))
+ if (flush_key_blocks(key_cache, file, dirty_part_map, FLUSH_RELEASE))
{
mi_check_print_error(param,"%d when trying to write bufferts",my_errno);
return(1);
@@ -1977,7 +1982,8 @@ int mi_sort_index(HA_CHECK *param, regis
}
/* Flush key cache for this file if we are calling this outside myisamchk */
- flush_key_blocks(share->key_cache,share->kfile, FLUSH_IGNORE_CHANGED);
+ flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+ FLUSH_IGNORE_CHANGED);
share->state.version=(ulong) time((time_t*) 0);
old_state= share->state; /* save state if not stored */
@@ -2535,7 +2541,8 @@ int mi_repair_by_sort(HA_CHECK *param, r
memcpy( &share->state.state, info->state, sizeof(*info->state));
err:
- got_error|= flush_blocks(param, share->key_cache, share->kfile);
+ got_error|= flush_blocks(param, share->key_cache, share->kfile,
+ &share->dirty_part_map);
VOID(end_io_cache(&info->rec_cache));
if (!got_error)
{
@@ -3059,7 +3066,8 @@ int mi_repair_parallel(HA_CHECK *param,
memcpy(&share->state.state, info->state, sizeof(*info->state));
err:
- got_error|= flush_blocks(param, share->key_cache, share->kfile);
+ got_error|= flush_blocks(param, share->key_cache, share->kfile,
+ &share->dirty_part_map);
/*
Destroy the write cache. The master thread did already detach from
the share by remove_io_thread() or it was not yet started (if the
=== modified file 'storage/myisam/mi_close.c'
--- a/storage/myisam/mi_close.c 2010-02-10 19:06:24 +0000
+++ b/storage/myisam/mi_close.c 2010-02-16 16:41:11 +0000
@@ -64,6 +64,7 @@ int mi_close(register MI_INFO *info)
if (share->kfile >= 0) abort(););
if (share->kfile >= 0 &&
flush_key_blocks(share->key_cache, share->kfile,
+ &share->dirty_part_map,
((share->temporary || share->deleting) ?
FLUSH_IGNORE_CHANGED :
FLUSH_RELEASE)))
=== modified file 'storage/myisam/mi_delete_all.c'
--- a/storage/myisam/mi_delete_all.c 2008-04-28 16:24:05 +0000
+++ b/storage/myisam/mi_delete_all.c 2010-02-16 16:41:11 +0000
@@ -52,7 +52,8 @@ int mi_delete_all_rows(MI_INFO *info)
If we are using delayed keys or if the user has done changes to the tables
since it was locked then there may be key blocks in the key cache
*/
- flush_key_blocks(share->key_cache, share->kfile, FLUSH_IGNORE_CHANGED);
+ flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+ FLUSH_IGNORE_CHANGED);
#ifdef HAVE_MMAP
if (share->file_map)
_mi_unmap_file(info);
=== modified file 'storage/myisam/mi_extra.c'
--- a/storage/myisam/mi_extra.c 2010-02-10 19:06:24 +0000
+++ b/storage/myisam/mi_extra.c 2010-02-16 16:41:11 +0000
@@ -268,6 +268,7 @@ int mi_extra(MI_INFO *info, enum ha_extr
pthread_mutex_lock(&share->intern_lock);
/* Flush pages that we don't need anymore */
if (flush_key_blocks(share->key_cache, share->kfile,
+ &share->dirty_part_map,
(function == HA_EXTRA_PREPARE_FOR_DROP ?
FLUSH_IGNORE_CHANGED : FLUSH_RELEASE)))
{
@@ -326,7 +327,8 @@ int mi_extra(MI_INFO *info, enum ha_extr
break;
case HA_EXTRA_FLUSH:
if (!share->temporary)
- flush_key_blocks(share->key_cache, share->kfile, FLUSH_KEEP);
+ flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+ FLUSH_KEEP);
#ifdef HAVE_PWRITE
_mi_decrement_open_count(info);
#endif
=== modified file 'storage/myisam/mi_keycache.c'
--- a/storage/myisam/mi_keycache.c 2008-03-29 15:56:33 +0000
+++ b/storage/myisam/mi_keycache.c 2010-02-16 16:41:11 +0000
@@ -75,7 +75,8 @@ int mi_assign_to_key_cache(MI_INFO *info
in the old key cache.
*/
- if (flush_key_blocks(share->key_cache, share->kfile, FLUSH_RELEASE))
+ if (flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+ FLUSH_RELEASE))
{
error= my_errno;
mi_print_error(info->s, HA_ERR_CRASHED);
@@ -90,7 +91,8 @@ int mi_assign_to_key_cache(MI_INFO *info
(This can never fail as there is never any not written data in the
new key cache)
*/
- (void) flush_key_blocks(key_cache, share->kfile, FLUSH_RELEASE);
+ (void) flush_key_blocks(key_cache, share->kfile, &share->dirty_part_map,
+ FLUSH_RELEASE);
/*
ensure that setting the key cache and changing the multi_key_cache
@@ -102,6 +104,7 @@ int mi_assign_to_key_cache(MI_INFO *info
This should be seen at the lastes for the next call to an myisam function.
*/
share->key_cache= key_cache;
+ share->dirty_part_map= 0;
/* store the key cache in the global hash structure for future opens */
if (multi_key_cache_set((uchar*) share->unique_file_name,
=== modified file 'storage/myisam/mi_locking.c'
--- a/storage/myisam/mi_locking.c 2009-10-06 06:57:22 +0000
+++ b/storage/myisam/mi_locking.c 2010-02-16 16:41:11 +0000
@@ -68,7 +68,9 @@ int mi_lock_database(MI_INFO *info, int
--share->tot_locks;
if (info->lock_type == F_WRLCK && !share->w_locks &&
!share->delay_key_write && flush_key_blocks(share->key_cache,
- share->kfile,FLUSH_KEEP))
+ share->kfile,
+ &share->dirty_part_map,
+ FLUSH_KEEP))
{
error=my_errno;
mi_print_error(info->s, HA_ERR_CRASHED);
@@ -513,7 +515,8 @@ int _mi_test_if_changed(register MI_INFO
{ /* Keyfile has changed */
DBUG_PRINT("info",("index file changed"));
if (share->state.process != share->this_process)
- VOID(flush_key_blocks(share->key_cache, share->kfile, FLUSH_RELEASE));
+ VOID(flush_key_blocks(share->key_cache, share->kfile,
+ &share->dirty_part_map, FLUSH_RELEASE));
share->last_process=share->state.process;
info->last_unique= share->state.unique;
info->last_loop= share->state.update_count;
=== modified file 'storage/myisam/mi_page.c'
--- a/storage/myisam/mi_page.c 2009-05-06 12:03:24 +0000
+++ b/storage/myisam/mi_page.c 2010-02-16 16:41:11 +0000
@@ -94,10 +94,11 @@ int _mi_write_keypage(register MI_INFO *
}
#endif
DBUG_RETURN((key_cache_write(info->s->key_cache,
- info->s->kfile,page, level, (uchar*) buff,length,
- (uint) keyinfo->block_length,
- (int) ((info->lock_type != F_UNLCK) ||
- info->s->delay_key_write))));
+ info->s->kfile, &info->s->dirty_part_map,
+ page, level, (uchar*) buff, length,
+ (uint) keyinfo->block_length,
+ (int) ((info->lock_type != F_UNLCK) ||
+ info->s->delay_key_write))));
} /* mi_write_keypage */
@@ -116,7 +117,8 @@ int _mi_dispose(register MI_INFO *info,
mi_sizestore(buff,old_link);
info->s->state.changed|= STATE_NOT_SORTED_PAGES;
DBUG_RETURN(key_cache_write(info->s->key_cache,
- info->s->kfile, pos , level, buff,
+ info->s->kfile, &info->s->dirty_part_map,
+ pos , level, buff,
sizeof(buff),
(uint) keyinfo->block_length,
(int) (info->lock_type != F_UNLCK)));
=== modified file 'storage/myisam/mi_panic.c'
--- a/storage/myisam/mi_panic.c 2006-12-31 00:32:21 +0000
+++ b/storage/myisam/mi_panic.c 2010-02-16 16:41:11 +0000
@@ -47,7 +47,8 @@ int mi_panic(enum ha_panic_function flag
if (info->s->options & HA_OPTION_READ_ONLY_DATA)
break;
#endif
- if (flush_key_blocks(info->s->key_cache, info->s->kfile, FLUSH_RELEASE))
+ if (flush_key_blocks(info->s->key_cache, info->s->kfile,
+ &info->s->dirty_part_map, FLUSH_RELEASE))
error=my_errno;
if (info->opt_flag & WRITE_CACHE_USED)
if (flush_io_cache(&info->rec_cache))
=== modified file 'storage/myisam/mi_preload.c'
--- a/storage/myisam/mi_preload.c 2009-09-09 15:13:13 +0000
+++ b/storage/myisam/mi_preload.c 2010-02-16 16:41:11 +0000
@@ -65,7 +65,7 @@ int mi_preload(MI_INFO *info, ulonglong
}
}
else
- block_length= share->key_cache->key_cache_block_size;
+ block_length= share->key_cache->param_block_size;
length= info->preload_buff_size/block_length * block_length;
set_if_bigger(length, block_length);
@@ -73,7 +73,8 @@ int mi_preload(MI_INFO *info, ulonglong
if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME))))
DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
- if (flush_key_blocks(share->key_cache,share->kfile, FLUSH_RELEASE))
+ if (flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+ FLUSH_RELEASE))
goto err;
do
=== modified file 'storage/myisam/mi_test1.c'
--- a/storage/myisam/mi_test1.c 2008-04-28 16:24:05 +0000
+++ b/storage/myisam/mi_test1.c 2010-02-16 16:41:11 +0000
@@ -49,7 +49,8 @@ int main(int argc,char *argv[])
MY_INIT(argv[0]);
my_init();
if (key_cacheing)
- init_key_cache(dflt_key_cache,KEY_CACHE_BLOCK_SIZE,IO_SIZE*16,0,0);
+ init_key_cache(dflt_key_cache,KEY_CACHE_BLOCK_SIZE,IO_SIZE*16,0,0,
+ DEFAULT_KEY_CACHE_PARTITIONS);
get_options(argc,argv);
exit(run_test("test1"));
=== modified file 'storage/myisam/mi_test2.c'
--- a/storage/myisam/mi_test2.c 2008-04-28 16:24:05 +0000
+++ b/storage/myisam/mi_test2.c 2010-02-16 16:41:11 +0000
@@ -215,7 +215,8 @@ int main(int argc, char *argv[])
if (!silent)
printf("- Writing key:s\n");
if (key_cacheing)
- init_key_cache(dflt_key_cache,key_cache_block_size,key_cache_size,0,0);
+ init_key_cache(dflt_key_cache,key_cache_block_size,key_cache_size,0,0,
+ DEFAULT_KEY_CACHE_PARTITIONS);
if (do_locking)
mi_lock_database(file,F_WRLCK);
if (write_cacheing)
=== modified file 'storage/myisam/mi_test3.c'
--- a/storage/myisam/mi_test3.c 2010-01-06 21:27:53 +0000
+++ b/storage/myisam/mi_test3.c 2010-02-16 16:41:11 +0000
@@ -177,7 +177,8 @@ void start_test(int id)
exit(1);
}
if (key_cacheing && rnd(2) == 0)
- init_key_cache(dflt_key_cache, KEY_CACHE_BLOCK_SIZE, 65536L, 0, 0);
+ init_key_cache(dflt_key_cache, KEY_CACHE_BLOCK_SIZE, 65536L, 0, 0,
+ DEFAULT_KEY_CACHE_PARTITIONS);
printf("Process %d, pid: %ld\n", id, (long) getpid());
fflush(stdout);
=== modified file 'storage/myisam/myisam_ftdump.c'
--- a/storage/myisam/myisam_ftdump.c 2009-11-30 13:36:06 +0000
+++ b/storage/myisam/myisam_ftdump.c 2010-02-16 16:41:11 +0000
@@ -83,7 +83,7 @@ int main(int argc,char *argv[])
usage();
}
- init_key_cache(dflt_key_cache,MI_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0);
+ init_key_cache(dflt_key_cache,MI_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0, 0);
if (!(info=mi_open(argv[0], O_RDONLY,
HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
=== modified file 'storage/myisam/myisamchk.c'
--- a/storage/myisam/myisamchk.c 2009-12-03 11:34:11 +0000
+++ b/storage/myisam/myisamchk.c 2010-02-16 16:41:11 +0000
@@ -1102,7 +1102,7 @@ static int myisamchk(HA_CHECK *param, ch
{
if (param->testflag & (T_EXTEND | T_MEDIUM))
VOID(init_key_cache(dflt_key_cache,opt_key_cache_block_size,
- (size_t) param->use_buffers, 0, 0));
+ (size_t) param->use_buffers, 0, 0, 0));
VOID(init_io_cache(¶m->read_cache,datafile,
(uint) param->read_buffer_length,
READ_CACHE,
@@ -1116,7 +1116,8 @@ static int myisamchk(HA_CHECK *param, ch
HA_OPTION_COMPRESS_RECORD)) ||
(param->testflag & (T_EXTEND | T_MEDIUM)))
error|=chk_data_link(param, info, test(param->testflag & T_EXTEND));
- error|=flush_blocks(param, share->key_cache, share->kfile);
+ error|=flush_blocks(param, share->key_cache, share->kfile,
+ &share->dirty_part_map);
VOID(end_io_cache(¶m->read_cache));
}
if (!error)
@@ -1526,7 +1527,7 @@ static int mi_sort_records(HA_CHECK *par
DBUG_RETURN(0); /* Nothing to do */
init_key_cache(dflt_key_cache, opt_key_cache_block_size,
- (size_t) param->use_buffers, 0, 0);
+ (size_t) param->use_buffers, 0, 0, 0);
if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length,
WRITE_CACHE,share->pack.header_length,1,
MYF(MY_WME | MY_WAIT_IF_FULL)))
@@ -1641,8 +1642,8 @@ err:
my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
sort_info.buff=0;
share->state.sortkey=sort_key;
- DBUG_RETURN(flush_blocks(param, share->key_cache, share->kfile) |
- got_error);
+ DBUG_RETURN(flush_blocks(param, share->key_cache, share->kfile,
+ &share->dirty_part_map) | got_error);
} /* sort_records */
=== modified file 'storage/myisam/myisamdef.h'
--- a/storage/myisam/myisamdef.h 2010-02-10 19:06:24 +0000
+++ b/storage/myisam/myisamdef.h 2010-02-16 16:41:11 +0000
@@ -174,6 +174,8 @@ typedef struct st_mi_isam_share
*index_file_name;
uchar *file_map; /* mem-map of file if possible */
KEY_CACHE *key_cache; /* ref to the current key cache */
+ /* To mark the key cache partitions containing dirty pages for this file */
+ ulonglong dirty_part_map;
MI_DECODE_TREE *decode_trees;
uint16 *decode_tables;
/* Function to use for a row checksum. */
@@ -733,7 +735,8 @@ void mi_check_print_info _VARARGS((HA_CH
#ifdef THREAD
pthread_handler_t thr_find_all_keys(void *arg);
#endif
-int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file);
+int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file,
+ ulonglong *dirty_part_map);
#ifdef __cplusplus
}
#endif
=== modified file 'storage/myisam/myisamlog.c'
--- a/storage/myisam/myisamlog.c 2009-11-29 23:08:56 +0000
+++ b/storage/myisam/myisamlog.c 2010-02-16 16:41:11 +0000
@@ -333,7 +333,7 @@ static int examine_log(char * file_name,
init_tree(&tree,0,0,sizeof(file_info),(qsort_cmp2) file_info_compare,1,
(tree_element_free) file_info_free, NULL);
VOID(init_key_cache(dflt_key_cache,KEY_CACHE_BLOCK_SIZE,KEY_CACHE_SIZE,
- 0, 0));
+ 0, 0, 0));
files_open=0; access_time=0;
while (access_time++ != number_of_commands &&
[Maria-developers] Updated (by Guest): Add a mysqlbinlog option to filter updates to certain tables (40)
by worklog-noreply@askmonty.org 16 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Add a mysqlbinlog option to filter updates to certain tables
CREATION DATE..: Mon, 10 Aug 2009, 13:25
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Psergey
CATEGORY.......: Server-Sprint
TASK ID........: 40 (http://askmonty.org/worklog/?tid=40)
VERSION........: Server-9.x
STATUS.........: Cancelled
PRIORITY.......: 60
WORKED HOURS...: 32
ESTIMATE.......: 32 (hours remain)
ORIG. ESTIMATE.: 48
PROGRESS NOTES:
-=-=(Guest - Tue, 16 Feb 2010, 10:23)=-=-
Status updated.
--- /tmp/wklog.40.old.18300 2010-02-16 10:23:20.000000000 +0200
+++ /tmp/wklog.40.new.18300 2010-02-16 10:23:20.000000000 +0200
@@ -1 +1 @@
-Assigned
+Cancelled
-=-=(Guest - Wed, 25 Nov 2009, 11:41)=-=-
Status updated.
--- /tmp/wklog.40.old.5760 2009-11-25 11:41:09.000000000 +0200
+++ /tmp/wklog.40.new.5760 2009-11-25 11:41:09.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+Assigned
-=-=(Guest - Wed, 25 Nov 2009, 11:41)=-=-
Category updated.
--- /tmp/wklog.40.old.5737 2009-11-25 11:41:03.000000000 +0200
+++ /tmp/wklog.40.new.5737 2009-11-25 11:41:03.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Bothorsen - Tue, 17 Nov 2009, 17:20)=-=-
Alex is closer to a working patch now.
Worked 14 hours and estimate 32 hours remain (original estimate unchanged).
-=-=(Bothorsen - Thu, 12 Nov 2009, 13:13)=-=-
Work hours by Alexi and Bo + estimated time for the task.
Worked 16 hours and estimate 46 hours remain (original estimate increased by 14 hours).
-=-=(Alexi - Sun, 08 Nov 2009, 15:18)=-=-
Low Level Design modified.
--- /tmp/wklog.40.old.15787 2009-11-08 15:18:11.000000000 +0200
+++ /tmp/wklog.40.new.15787 2009-11-08 15:18:11.000000000 +0200
@@ -62,7 +62,7 @@
it considers the query to extent to the end of the event.
2. For 'db' (current db) the trailing zero is redundant since the length
is already known.
-3. db_len = 0 means that this is the current db.
+3. In tables_info, db_len = 0 means that this is the current db.
When reading Query events from binary log, we can recognize its format
by its post-header length: in extended case the post-header includes 4
@@ -75,6 +75,77 @@
+ #define Q_QUERY_LEN_OFFSET Q_STATUS_VARS_LEN_OFFSET + 2
+ #define Q_QUERY_TABLES_INFO_LEN_OFFSET Q_QUERY_LEN_OFFSET + 2
+
+***********************************************************************
+HELP NEEDED
+***********************************************************************
+The QUERY_HEADER_LEN is used in the definition of MAX_LOG_EVENT_HEADER:
+
+log_event.h
+~~~~~~~~~~~
+#define MAX_LOG_EVENT_HEADER ( /* in order of Query_log_event::write */ \
+ LOG_EVENT_HEADER_LEN + /* write_header */ \
+ QUERY_HEADER_LEN + /* write_data */ \
+ EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN + /*write_post_header_for_derived */ \
+ MAX_SIZE_LOG_EVENT_STATUS + /* status */ \
+ NAME_LEN + 1)
+
+which is used only for setting
+
+ thd->variables.max_allowed_packet
+ mysql->net.max_packet_size
+
+Looks like (but I am not quite sure) that QUERY_HEADER_LEN can simply
+(without making any other changes) be substituted in this definition by
+QUERY_HEADER_LEN_EXT.
+
+Below I list all places where MAX_LOG_EVENT_HEADER is used:
+
+slave.cc
+~~~~~~~~
+static int init_slave_thread(...)
+{ ...
+ /*
+ Adding MAX_LOG_EVENT_HEADER_LEN to the max_allowed_packet on all
+ slave threads, since a replication event can become this much larger
+ than the corresponding packet (query) sent from client to master.
+ */
+ thd->variables.max_allowed_packet= global_system_variables.max_allowed_packet
+ + MAX_LOG_EVENT_HEADER; /* note, incr over the global not session var */
+ ...
+}
+pthread_handler_t handle_slave_io(...)
+{ ...
+ /*
+ Adding MAX_LOG_EVENT_HEADER_LEN to the max_packet_size on the I/O
+ thread, since a replication event can become this much larger than
+ the corresponding packet (query) sent from client to master.
+ */
+ mysql->net.max_packet_size= thd->net.max_packet_size+= MAX_LOG_EVENT_HEADER;
+ ...
+}
+
+sql_repl.cc
+~~~~~~~~~~~
+void mysql_binlog_send(...)
+{ ...
+ /*
+ Adding MAX_LOG_EVENT_HEADER_LEN, since a binlog event can become
+ this larger than the corresponding packet (query) sent
+ from client to master.
+ */
+ thd->variables.max_allowed_packet+= MAX_LOG_EVENT_HEADER;
+ ...
+}
+bool mysql_show_binlog_events(...)
+{ ...
+ /*
+ to account binlog event header size
+ */
+ thd->variables.max_allowed_packet+= MAX_LOG_EVENT_HEADER;
+ ...
+}
+
3. Changes in log events
************************
@@ -84,7 +155,7 @@
This setting is done in Format description event constructor which creates
the event for writing to binary log:
- if (binlog_with_tables_info)
+ if (opt_binlog_with_tables_info)
post_header_len[QUERY_EVENT - 1] = QUERY_HEADER_LEN_EXT;
else
post_header_len[QUERY_EVENT - 1] = QUERY_HEADER_LEN;
@@ -99,12 +170,12 @@
following manner:
switch (binlog_ver) {
- case 4: /* MySQL 5.0 and higher */
+ #ifndef MYSQL_CLIENT
+ case 4: /* MySQL 5.0 and higher */
...
-+ #else
-+ <error>
+ break;
+ #endif
+
case 1:
case 3:
...
@@ -132,7 +203,7 @@
--------------------------------
[Creates the event for binlogging]
-In case of binlog_with_tables_info = TRUE, set additionally query_len,
+In case of opt_binlog_with_tables_info = TRUE, set additionally query_len,
tables_info_len, and tables_info members (the constructor is to have
an additional 'tables_info' argument).
@@ -140,7 +211,7 @@
----------------
[Writes the event to binlog]
-In case of binlog_with_tables_info = TRUE, write additional members
+In case of opt_binlog_with_tables_info = TRUE, write additional members
(query_len, tables_info_len, and tables_info) to binary log. Also
write corresponding whole event length to the common-header.
-=-=(Alexi - Sun, 08 Nov 2009, 10:40)=-=-
Low Level Design modified.
--- /tmp/wklog.40.old.5055 2009-11-08 08:40:02.000000000 +0000
+++ /tmp/wklog.40.new.5055 2009-11-08 08:40:02.000000000 +0000
@@ -3,6 +3,7 @@
1. Adding --binlog-with-tables-info option
******************************************
+GLOBAL, read-only option.
When set, Query events are to be written in the extended binary
format which contains tables_info. When not set, Query events
-=-=(Alexi - Thu, 05 Nov 2009, 12:37)=-=-
Low Level Design modified.
--- /tmp/wklog.40.old.11441 2009-11-05 12:37:16.000000000 +0200
+++ /tmp/wklog.40.new.11441 2009-11-05 12:37:16.000000000 +0200
@@ -1,9 +1,18 @@
OPTION: 2.5 Extend Query Events With Tables Info
================================================
-1. Query_log_event Binary Format
-********************************
-Changes to be done:
+1. Adding --binlog-with-tables-info option
+******************************************
+
+When set, Query events are to be written in the extended binary
+format which contains tables_info. When not set, Query events
+are to be written in usual format (without any changes).
+
+2. Query event extended binary format
+*************************************
+
+When --binlog-with-tables-info is set, Query events are writen
+to binary log in the following (extended) format.
Query_log_event binary format
---------------------------------
@@ -24,12 +33,12 @@
error_code 2
status_vars_len 2
+ query_len 2 (see Note 1)
-+ tables_info_len 2 (see Note 2)
++ tables_info_len 2
---------------------------------
BODY:
status_vars status_vars_len
- db db_len + 1
-+ db db_len (see Note 3)
++ db db_len (see Note 2)
query query_len
+ tables_info
@@ -37,7 +46,7 @@
---------------------------------
Name Size (bytes)
---------------------------------
- db_len 1 (see Note 4)
+ db_len 1 (see Note 3)
db db_len
table_name_len 1
table_name table_name_len
@@ -48,19 +57,99 @@
table_name table_name_len
NOTES
-1. Currently Query_log_event format doesn't include 'query_len' because
+1. In usual format, Query_log_event doesn't include 'query_len' because
it considers the query to extent to the end of the event.
-2. If tables_info is not included in the event (--binlog-with-tables-info
- option), tables_info_len = 0.
-3. The trailing zero is redundant since the length is already known.
-4. In case of db = current db, db_len = 0 and db = empty, because
- current db is already included in the current event format.
+2. For 'db' (current db) the trailing zero is redundant since the length
+ is already known.
+3. db_len = 0 means that this is the current db.
+
+When reading Query events from binary log, we can recognize its format
+by its post-header length: in extended case the post-header includes 4
+additional bytes.
+
+ #define QUERY_HEADER_LEN (QUERY_HEADER_MINIMAL_LEN + 4)
++ #define QUERY_HEADER_LEN_EXT (QUERY_HEADER_LEN + 4)
+ ...
+ #define Q_STATUS_VARS_LEN_OFFSET 11
++ #define Q_QUERY_LEN_OFFSET Q_STATUS_VARS_LEN_OFFSET + 2
++ #define Q_QUERY_TABLES_INFO_LEN_OFFSET Q_QUERY_LEN_OFFSET + 2
+
+3. Changes in log events
+************************
+
+3.1. Format description event
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Changes needed here concern setting post-header length for Query events.
+This setting is done in Format description event constructor which creates
+the event for writing to binary log:
+
+ if (binlog_with_tables_info)
+ post_header_len[QUERY_EVENT - 1] = QUERY_HEADER_LEN_EXT;
+ else
+ post_header_len[QUERY_EVENT - 1] = QUERY_HEADER_LEN;
+
+This change is to be done only for case binlog_ver = 4.
+
+NOTE. The refered above constructor is allowed to be invoked in a client
+context for creating "artificial" Format description events in case of
+MySQL < 5.0 (e.g. see mysqlbinlog code). To avoid compilation problems
+(because of 'binlog_with_tables_info') and taking into account the
+"MySQL < 5.0" restriction, we have to #ifdef out the above code in
+following manner:
+
+ switch (binlog_ver) {
+ case 4: /* MySQL 5.0 and higher */
++ #ifndef MYSQL_CLIENT
+ ...
++ #else
++ <error>
++ #endif
+ case 1:
+ case 3:
+ ...
+ }
+
+3.2. Query event
+~~~~~~~~~~~~~~~~
+Changes needed here include adding tables_info and tables_info_len
+members (member for query length already exists) and modifying the
+following function-members:
+
+Query_log_event(buf) constructor
+--------------------------------
+[Parses binary format written to the 'buf']
+
+Getting post-header length from the Format description event (passed
+to the constructor as an argument), define whether buf contains an
+extended or usual Query event and parse the buf contents accordingly.
+
+NOTE. Defining Query event format here should be done with taking into
+account that this constructor can be called within a Query-derived
+event with the event_type argument != QUERY_EVENT.
+
+Query_log_event(thd) constructor
+--------------------------------
+[Creates the event for binlogging]
+
+In case of binlog_with_tables_info = TRUE, set additionally query_len,
+tables_info_len, and tables_info members (the constructor is to have
+an additional 'tables_info' argument).
+
+write() function
+----------------
+[Writes the event to binlog]
+
+In case of binlog_with_tables_info = TRUE, write additional members
+(query_len, tables_info_len, and tables_info) to binary log. Also
+write corresponding whole event length to the common-header.
+
+<To be continued>
-2. Where to get tables info from?
+4. Where to get tables info from?
*********************************
-2.1. Case study: CREATE TABLE
-******************************
+4.1. Case study: CREATE TABLE
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** CREATE TABLE table [SELECT ...]
@@ -129,4 +218,4 @@
}
}
-To be continued
+<To be continued>
-=-=(Alexi - Wed, 04 Nov 2009, 10:21)=-=-
Low Level Design modified.
--- /tmp/wklog.40.old.6734 2009-11-04 10:21:20.000000000 +0200
+++ /tmp/wklog.40.new.6734 2009-11-04 10:21:20.000000000 +0200
@@ -21,9 +21,9 @@
slave_proxy_id 4
exec_time 4
db_len 1
-+ query_len 2 (see Note 1)
error_code 2
status_vars_len 2
++ query_len 2 (see Note 1)
+ tables_info_len 2 (see Note 2)
---------------------------------
BODY:
-=-=(Alexi - Tue, 03 Nov 2009, 11:19)=-=-
Low Level Design modified.
--- /tmp/wklog.40.old.7187 2009-11-03 11:19:22.000000000 +0200
+++ /tmp/wklog.40.new.7187 2009-11-03 11:19:22.000000000 +0200
@@ -1 +1,132 @@
+OPTION: 2.5 Extend Query Events With Tables Info
+================================================
+1. Query_log_event Binary Format
+********************************
+Changes to be done:
+
+ Query_log_event binary format
+ ---------------------------------
+ Name Size (bytes)
+ ---------------------------------
+ COMMON HEADER:
+ timestamp 4
+ type 1
+ server_id 4
+ total_size 4
+ master_position 4
+ flags 2
+ ---------------------------------
+ POST HEADER:
+ slave_proxy_id 4
+ exec_time 4
+ db_len 1
++ query_len 2 (see Note 1)
+ error_code 2
+ status_vars_len 2
++ tables_info_len 2 (see Note 2)
+ ---------------------------------
+ BODY:
+ status_vars status_vars_len
+- db db_len + 1
++ db db_len (see Note 3)
+ query query_len
++ tables_info
+
+ tables_info binary format
+ ---------------------------------
+ Name Size (bytes)
+ ---------------------------------
+ db_len 1 (see Note 4)
+ db db_len
+ table_name_len 1
+ table_name table_name_len
+ ...
+ db_len 1
+ db db_len
+ table_name_len 1
+ table_name table_name_len
+
+NOTES
+1. Currently Query_log_event format doesn't include 'query_len' because
+ it considers the query to extent to the end of the event.
+2. If tables_info is not included in the event (--binlog-with-tables-info
+ option), tables_info_len = 0.
+3. The trailing zero is redundant since the length is already known.
+4. In case of db = current db, db_len = 0 and db = empty, because
+ current db is already included in the current event format.
+
+2. Where to get tables info from?
+*********************************
+
+2.1. Case study: CREATE TABLE
+******************************
+
+*** CREATE TABLE table [SELECT ...]
+
+ bool mysql_create_table_no_lock(
+ THD *thd,
+ const char *db,
+ const char *table_name, ...)
+ {
+ ...
+ // -------------------------------------
+ // WL40: To be included in tables_info:
+ // * db, table_name
+ // * thd->lex->query_tables (tables refered to in
+ // the select-part; empty if no select-part)
+ // -------------------------------------
+ write_bin_log(thd, TRUE, thd->query, thd->query_length);
+ }
+
+*** CREATE TABLE table LIKE src-table
+
+ bool mysql_create_like_table(
+ ...
+ TABLE_LIST *table,
+ TABLE_LIST *src_table,
+ ...)
+ {
+ ...
+ if (thd->current_stmt_binlog_row_based)
+ { // RBR: In this case we don't replicate temp tables
+ if (!(create_info->options & HA_LEX_CREATE_TMP_TABLE))
+ {
+ if (src_table->table->s->tmp_table)
+ { // CREATE normal-table LIKE temp-table:
+
+ // Generate new query without LIKE-part
+ store_create_info(thd, table, &query, create_info, FALSE);
+
+ // -------------------------------------
+ // WL40: To include to tables_info:
+ // * table (src_table is not included)
+ // -------------------------------------
+ write_bin_log(thd, TRUE, query.ptr(), query.length());
+ }
+ else
+ { // CREATE normal-table LIKE normal-table
+
+ // -------------------------------------
+ // WL40: To include to log_tables_info:
+ // * table
+ // * src_table
+ // -------------------------------------
+ write_bin_log(thd, TRUE, thd->query, thd->query_length);
+ }
+ }
+ // CREATE temp-table LIKE ...
+ // This case is not replicated
+ }
+ else
+ { // SBR:
+ // -------------------------------------
+ // WL40: To include to tables_info:
+ // * table
+ // * src_table
+ // -------------------------------------
+ write_bin_log(thd, TRUE, thd->query, thd->query_length);
+ }
+ }
+
+To be continued
------------------------------------------------------------
-=-=(View All Progress Notes, 17 total)=-=-
http://askmonty.org/worklog/index.pl?tid=40&nolimit=1
DESCRIPTION:
A replication slave can be configured to filter updates to certain tables with
the --replicate-[wild-]{do,ignore}-table options.
This task is about adding similar functionality to mysqlbinlog.
HIGH-LEVEL SPECIFICATION:
1. Context
----------
(See http://askmonty.org/wiki/index.php/Scratch/ReplicationOptions for global
overview)
At the moment, the server has these replication slave options:
--replicate-do-table=db.tbl
--replicate-ignore-table=db.tbl
--replicate-wild-do-table=pattern.pattern
--replicate-wild-ignore-table=pattern.pattern
They affect both RBR and SBR events. SBR events are checked after the
statement has been parsed: the server iterates over the list of used tables and
checks them against the --replicate instructions.
What is interesting is that this scheme still allows updates to an ignored
table through a VIEW.
2. Table filtering in mysqlbinlog
---------------------------------
Per-table filtering of RBR events is easy, as the name of the table that an
event applies to is straightforward to extract.
Per-table filtering of SBR events is hard, as generally it is not apparent
which tables a statement refers to.
This leaves the following options:
2.1 Put the parser into mysqlbinlog
-----------------------------------
Once we have a full parser in mysqlbinlog, we'll be able to check which tables
are used by a statement, and can then provide behaviour identical to what one
obtains when using the --replicate-* slave options.
(It is not clear how much effort is needed to put the parser into mysqlbinlog.
Any guesses?)
2.2 Use dumb regexp match
-------------------------
Use a really dumb approach. A query is considered to be modifying table X if
it matches one of the following expressions:
CREATE TABLE $tablename
DROP $tablename
UPDATE ...$tablename ... SET // here '...' can't contain the word 'SET'
DELETE ...$tablename ... WHERE // same as above
ALTER TABLE $tablename
.. etc. (derive the full list from the grammar) ..
The advantage over doing the same in awk is that mysqlbinlog will also process
RBR events, and so this provides a working solution for those who are careful
that their table names do not mix with string constants and the like.
(TODO: string constants are of particular concern as they come from
[potentially hostile] users, unlike e.g. table aliases, which come from
[not hostile] developers. Should we also remove all string constants before
attempting the match?)
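
As a rough, hypothetical sketch of this approach (not mysqlbinlog code: the
helper name, the use of std::regex, and the pattern set are illustrative
assumptions; real code would cover the full grammar, escape regex
metacharacters in the table name, and strip string constants first):

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Hypothetical helper: true if 'query' appears to modify 'table' under the
// dumb-regexp scheme above.
static bool query_touches_table(const std::string &query,
                                const std::string &table)
{
  const std::vector<std::regex> patterns= {
    std::regex("\\bCREATE\\s+TABLE\\s+" + table + "\\b", std::regex::icase),
    std::regex("\\bDROP\\s+TABLE\\s+" + table + "\\b", std::regex::icase),
    std::regex("\\bALTER\\s+TABLE\\s+" + table + "\\b", std::regex::icase),
    // UPDATE ... table ... SET, where '...' must not contain the word SET
    std::regex("\\bUPDATE\\s+(?:(?!\\bSET\\b).)*\\b" + table +
               "\\b(?:(?!\\bSET\\b).)*\\bSET\\b", std::regex::icase)
  };
  for (const std::regex &re : patterns)
    if (std::regex_search(query, re))
      return true;
  return false;
}

int main()
{
  std::cout << query_touches_table("UPDATE db1.t1 SET a=1", "t1") << "\n"; // 1
  std::cout << query_touches_table("SELECT * FROM t1", "t1") << "\n";      // 0
}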
2.3 Have the master put annotations
-----------------------------------
We could add a master option that makes the server inject into the query a mark
telling which tables the query will affect, e.g. for the query
UPDATE t1 LEFT JOIN db3.t2 ON ... WHERE ...
the binlog will have
/* !mysqlbinlog: updates t1,db3.t2 */ UPDATE t1 LEFT JOIN ...
and further processing in mysqlbinlog will be trivial.
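
As a hypothetical sketch of that trivial processing (the helper name and the
exact marker syntax are assumptions based on the example above):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: extract the table list from an annotated query such as
// "/* !mysqlbinlog: updates t1,db3.t2 */ UPDATE t1 LEFT JOIN ..."
static std::vector<std::string> annotated_tables(const std::string &query)
{
  std::vector<std::string> tables;
  const std::string prefix= "/* !mysqlbinlog: updates ";
  const std::string suffix= " */";
  if (query.compare(0, prefix.size(), prefix) != 0)
    return tables;                                   // no annotation present
  std::string::size_type end= query.find(suffix, prefix.size());
  if (end == std::string::npos)
    return tables;                                   // malformed annotation
  std::istringstream list(query.substr(prefix.size(), end - prefix.size()));
  std::string name;
  while (std::getline(list, name, ','))              // comma-separated names
    tables.push_back(name);
  return tables;
}

int main()
{
  for (const std::string &t :
       annotated_tables("/* !mysqlbinlog: updates t1,db3.t2 */ UPDATE t1 ..."))
    std::cout << t << "\n";                          // prints t1 and db3.t2
}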
2.4 Implement server functionality to ignore certain tables
-----------------------------------------------------------
We could add a general facility in the server to ignore certain tables:
SET SESSION ignored_tables = "db1.t1,db2.t2";
This would work similarly to --replicate-ignore-table, but in a general way,
not restricted to the slave SQL thread.
It would then be trivial for mysqlbinlog to add such statements at the start
of the output, or the user could simply do it manually, with no need for
additional mysqlbinlog options.
It might be useful to integrate this with the code that already handles
--replicate-ignore-db and similar slave options.
2.5 Extend Query Events With Tables Info
----------------------------------------
We could extend the Query event structure with tables info: a list of the
tables which the query refers to:
<current query event structure>
tables_info_len
dbase_len dbase
table_len table
...
dbase_len dbase
table_len table
Note. When <dbase> is the current database, we can set dbase_len = 0
and leave dbase empty, because the current Query event structure already
includes the current database name.
Note. It may also be reasonable to add a --binlog-with-tables-info
option which defines whether tables info is included in the
Query events.
LOW-LEVEL DESIGN:
OPTION: 2.5 Extend Query Events With Tables Info
================================================
1. Adding --binlog-with-tables-info option
******************************************
GLOBAL, read-only option.
When set, Query events are to be written in the extended binary
format, which contains tables_info. When not set, Query events
are to be written in the usual format (without any changes).
2. Query event extended binary format
*************************************
When --binlog-with-tables-info is set, Query events are written
to the binary log in the following (extended) format.
Query_log_event binary format
---------------------------------
Name Size (bytes)
---------------------------------
COMMON HEADER:
timestamp 4
type 1
server_id 4
total_size 4
master_position 4
flags 2
---------------------------------
POST HEADER:
slave_proxy_id 4
exec_time 4
db_len 1
error_code 2
status_vars_len 2
+ query_len 2 (see Note 1)
+ tables_info_len 2
---------------------------------
BODY:
status_vars status_vars_len
- db db_len + 1
+ db db_len (see Note 2)
query query_len
+ tables_info
tables_info binary format
---------------------------------
Name Size (bytes)
---------------------------------
db_len 1 (see Note 3)
db db_len
table_name_len 1
table_name table_name_len
...
db_len 1
db db_len
table_name_len 1
table_name table_name_len
NOTES
1. In the usual format, Query_log_event doesn't include 'query_len' because
it considers the query to extend to the end of the event.
2. For 'db' (current db) the trailing zero is redundant since the length
is already known.
3. In tables_info, db_len = 0 means that this is the current db.
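
A minimal sketch of the write side under this layout (a hypothetical helper,
not server code; the one-byte length fields follow the table above, and an
empty db encodes "current db" per Note 3):

#include <stdio.h>
#include <string>
#include <vector>

// Hypothetical helper: append one tables_info entry (db_len, db,
// table_name_len, table_name) to 'buf'.
static void append_tables_info_entry(std::vector<unsigned char> &buf,
                                     const std::string &db,
                                     const std::string &table_name)
{
  buf.push_back((unsigned char) db.size());          // db_len, 1 byte
  buf.insert(buf.end(), db.begin(), db.end());       // db, no trailing zero
  buf.push_back((unsigned char) table_name.size());  // table_name_len, 1 byte
  buf.insert(buf.end(), table_name.begin(), table_name.end());
}

int main()
{
  std::vector<unsigned char> buf;
  append_tables_info_entry(buf, "", "t1");           // t1 in the current db
  append_tables_info_entry(buf, "db2", "t2");        // db2.t2
  printf("%u bytes\n", (unsigned) buf.size());       // 4 + 7 = 11 bytes
}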
When reading a Query event from the binary log, we can recognize its format
by its post-header length: in the extended case the post-header includes 4
additional bytes.
#define QUERY_HEADER_LEN (QUERY_HEADER_MINIMAL_LEN + 4)
+ #define QUERY_HEADER_LEN_EXT (QUERY_HEADER_LEN + 4)
...
#define Q_STATUS_VARS_LEN_OFFSET 11
+ #define Q_QUERY_LEN_OFFSET Q_STATUS_VARS_LEN_OFFSET + 2
+ #define Q_QUERY_TABLES_INFO_LEN_OFFSET Q_QUERY_LEN_OFFSET + 2
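
As a hedged sketch of the read side (the value of QUERY_HEADER_MINIMAL_LEN and
the little-endian helper are assumptions for illustration; the offsets follow
the defines above):

#include <stdint.h>
#include <stdio.h>

#define QUERY_HEADER_MINIMAL_LEN 11 /* assumed value, for illustration only */
#define QUERY_HEADER_LEN     (QUERY_HEADER_MINIMAL_LEN + 4)
#define QUERY_HEADER_LEN_EXT (QUERY_HEADER_LEN + 4)
#define Q_STATUS_VARS_LEN_OFFSET 11
#define Q_QUERY_LEN_OFFSET (Q_STATUS_VARS_LEN_OFFSET + 2)
#define Q_QUERY_TABLES_INFO_LEN_OFFSET (Q_QUERY_LEN_OFFSET + 2)

/* Binlog integers are stored little-endian. */
static uint16_t read_u16(const unsigned char *p)
{
  return (uint16_t) (p[0] | (p[1] << 8));
}

/*
  Hypothetical sketch: given a Query event post-header and the post-header
  length announced by the Format description event, decide whether the event
  is in the extended format and, if so, read the two new length fields.
  Returns 1 for the extended format, 0 for the usual one.
*/
static int parse_query_post_header(const unsigned char *post_header,
                                   unsigned post_header_len,
                                   uint16_t *query_len,
                                   uint16_t *tables_info_len)
{
  if (post_header_len == QUERY_HEADER_LEN_EXT)
  {
    *query_len= read_u16(post_header + Q_QUERY_LEN_OFFSET);
    *tables_info_len= read_u16(post_header + Q_QUERY_TABLES_INFO_LEN_OFFSET);
    return 1;
  }
  *query_len= 0;           /* usual format: query extends to end of event */
  *tables_info_len= 0;
  return 0;
}

int main()
{
  unsigned char hdr[QUERY_HEADER_LEN_EXT]= {0};
  hdr[Q_QUERY_LEN_OFFSET]= 5;               /* query_len = 5 */
  hdr[Q_QUERY_TABLES_INFO_LEN_OFFSET]= 9;   /* tables_info_len = 9 */
  uint16_t qlen, tlen;
  int ext= parse_query_post_header(hdr, sizeof(hdr), &qlen, &tlen);
  printf("ext=%d query_len=%u tables_info_len=%u\n", ext, qlen, tlen);
}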
***********************************************************************
HELP NEEDED
***********************************************************************
The QUERY_HEADER_LEN is used in the definition of MAX_LOG_EVENT_HEADER:
log_event.h
~~~~~~~~~~~
#define MAX_LOG_EVENT_HEADER ( /* in order of Query_log_event::write */ \
LOG_EVENT_HEADER_LEN + /* write_header */ \
QUERY_HEADER_LEN + /* write_data */ \
EXECUTE_LOAD_QUERY_EXTRA_HEADER_LEN + /*write_post_header_for_derived */ \
MAX_SIZE_LOG_EVENT_STATUS + /* status */ \
NAME_LEN + 1)
which is used only for setting
thd->variables.max_allowed_packet
mysql->net.max_packet_size
It looks like (though I am not quite sure) QUERY_HEADER_LEN can simply be
substituted by QUERY_HEADER_LEN_EXT in this definition, without making any
other changes.
Below I list all places where MAX_LOG_EVENT_HEADER is used:
slave.cc
~~~~~~~~
static int init_slave_thread(...)
{ ...
/*
Adding MAX_LOG_EVENT_HEADER_LEN to the max_allowed_packet on all
slave threads, since a replication event can become this much larger
than the corresponding packet (query) sent from client to master.
*/
thd->variables.max_allowed_packet= global_system_variables.max_allowed_packet
+ MAX_LOG_EVENT_HEADER; /* note, incr over the global not session var */
...
}
pthread_handler_t handle_slave_io(...)
{ ...
/*
Adding MAX_LOG_EVENT_HEADER_LEN to the max_packet_size on the I/O
thread, since a replication event can become this much larger than
the corresponding packet (query) sent from client to master.
*/
mysql->net.max_packet_size= thd->net.max_packet_size+= MAX_LOG_EVENT_HEADER;
...
}
sql_repl.cc
~~~~~~~~~~~
void mysql_binlog_send(...)
{ ...
/*
Adding MAX_LOG_EVENT_HEADER_LEN, since a binlog event can become
this larger than the corresponding packet (query) sent
from client to master.
*/
thd->variables.max_allowed_packet+= MAX_LOG_EVENT_HEADER;
...
}
bool mysql_show_binlog_events(...)
{ ...
/*
to account binlog event header size
*/
thd->variables.max_allowed_packet+= MAX_LOG_EVENT_HEADER;
...
}
3. Changes in log events
************************
3.1. Format description event
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changes needed here concern setting the post-header length for Query events.
This is done in the Format description event constructor which creates
the event for writing to the binary log:
if (opt_binlog_with_tables_info)
post_header_len[QUERY_EVENT - 1] = QUERY_HEADER_LEN_EXT;
else
post_header_len[QUERY_EVENT - 1] = QUERY_HEADER_LEN;
This change is to be done only for the case binlog_ver = 4.
NOTE. The constructor referred to above may be invoked in a client
context for creating "artificial" Format description events in case of
MySQL < 5.0 (e.g. see the mysqlbinlog code). To avoid compilation problems
(because of 'opt_binlog_with_tables_info') and taking the
"MySQL < 5.0" restriction into account, we have to #ifdef out the above code
in the following manner:
switch (binlog_ver) {
+ #ifndef MYSQL_CLIENT
case 4: /* MySQL 5.0 and higher */
...
break;
+ #endif
case 1:
case 3:
...
}
3.2. Query event
~~~~~~~~~~~~~~~~
Changes needed here include adding tables_info and tables_info_len
members (a member for the query length already exists) and modifying the
following member functions:
Query_log_event(buf) constructor
--------------------------------
[Parses binary format written to the 'buf']
Using the post-header length from the Format description event (passed
to the constructor as an argument), determine whether buf contains an
extended or a usual Query event and parse the buf contents accordingly.
NOTE. Determining the Query event format here must take into
account that this constructor can also be called for a Query-derived
event with an event_type argument != QUERY_EVENT.
Query_log_event(thd) constructor
--------------------------------
[Creates the event for binlogging]
When opt_binlog_with_tables_info = TRUE, additionally set the query_len,
tables_info_len, and tables_info members (the constructor is to have
an additional 'tables_info' argument).
write() function
----------------
[Writes the event to binlog]
When opt_binlog_with_tables_info = TRUE, write the additional members
(query_len, tables_info_len, and tables_info) to the binary log. Also
write the corresponding whole-event length to the common header.
<To be continued>
4. Where to get tables info from?
*********************************
4.1. Case study: CREATE TABLE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*** CREATE TABLE table [SELECT ...]
bool mysql_create_table_no_lock(
THD *thd,
const char *db,
const char *table_name, ...)
{
...
// -------------------------------------
// WL40: To be included in tables_info:
// * db, table_name
// * thd->lex->query_tables (tables refered to in
// the select-part; empty if no select-part)
// -------------------------------------
write_bin_log(thd, TRUE, thd->query, thd->query_length);
}
*** CREATE TABLE table LIKE src-table
bool mysql_create_like_table(
...
TABLE_LIST *table,
TABLE_LIST *src_table,
...)
{
...
if (thd->current_stmt_binlog_row_based)
{ // RBR: In this case we don't replicate temp tables
if (!(create_info->options & HA_LEX_CREATE_TMP_TABLE))
{
if (src_table->table->s->tmp_table)
{ // CREATE normal-table LIKE temp-table:
// Generate new query without LIKE-part
store_create_info(thd, table, &query, create_info, FALSE);
// -------------------------------------
// WL40: To include to tables_info:
// * table (src_table is not included)
// -------------------------------------
write_bin_log(thd, TRUE, query.ptr(), query.length());
}
else
{ // CREATE normal-table LIKE normal-table
// -------------------------------------
// WL40: To include to log_tables_info:
// * table
// * src_table
// -------------------------------------
write_bin_log(thd, TRUE, thd->query, thd->query_length);
}
}
// CREATE temp-table LIKE ...
// This case is not replicated
}
else
{ // SBR:
// -------------------------------------
// WL40: To include to tables_info:
// * table
// * src_table
// -------------------------------------
write_bin_log(thd, TRUE, thd->query, thd->query_length);
}
}
<To be continued>
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Rev 2758: Subquery optimization backport: in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 15 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2758
revision-id: psergey(a)askmonty.org-20100215215306-hc0levm9ag1lv1b1
parent: psergey(a)askmonty.org-20100212181041-5rwekm1wpvwaikkx
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Tue 2010-02-16 00:53:06 +0300
message:
Subquery optimization backport:
- Factor out subquery code into sql/opt_subselect.{h,cc}
- Stop using the term "confluent" (was used due to misreading the dictionary)
Diff too large for email (8074 lines, the limit is 1000).
[Maria-developers] Rev 2734: Maria WL#61 in file:///Users/bell/maria/bzr/work-maria-5.2-engine/
by sanja@askmonty.org 15 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-engine/
------------------------------------------------------------
revno: 2734
revision-id: sanja(a)askmonty.org-20100215074703-tqcssnpbf43grygo
parent: psergey(a)askmonty.org-20091202142609-18bp41q8mejxl47t
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-engine
timestamp: Mon 2010-02-15 09:47:03 +0200
message:
Maria WL#61
Interface for Maria extensions.
Additional information about plugins (maturity and version string) exposed through the Maria extension interface.
=== modified file 'CMakeLists.txt'
--- a/CMakeLists.txt 2009-10-03 19:24:13 +0000
+++ b/CMakeLists.txt 2010-02-15 07:47:03 +0000
@@ -251,6 +251,7 @@
IF (ENGINE_BUILD_TYPE STREQUAL "STATIC")
SET (mysql_plugin_defs "${mysql_plugin_defs},builtin_${PLUGIN_NAME}_plugin")
+ SET (mariadb_extra_plugin_defs "${mariadb_extra_plugin_defs},builtin_mariadb_${PLUGIN_NAME}_plugin")
SET (MYSQLD_STATIC_ENGINE_LIBS ${MYSQLD_STATIC_ENGINE_LIBS} ${PLUGIN_NAME})
SET (STORAGE_ENGINE_DEFS "${STORAGE_ENGINE_DEFS} -DWITH_${ENGINE}_STORAGE_ENGINE")
SET (WITH_${ENGINE}_STORAGE_ENGINE TRUE)
@@ -269,6 +270,7 @@
IF(NOT WITHOUT_PARTITION_STORAGE_ENGINE)
SET (STORAGE_ENGINE_DEFS "${STORAGE_ENGINE_DEFS} -DWITH_PARTITION_STORAGE_ENGINE")
SET (mysql_plugin_defs "${mysql_plugin_defs},builtin_partition_plugin")
+ SET (mariadb_extra_plugin_defs "${mariadb_extra_plugin_defs},builtin_mariadb_partition_plugin")
ENDIF(NOT WITHOUT_PARTITION_STORAGE_ENGINE)
# Special handling for tmp tables with the maria engine
=== modified file 'config/ac-macros/plugins.m4'
--- a/config/ac-macros/plugins.m4 2009-04-25 10:05:32 +0000
+++ b/config/ac-macros/plugins.m4 2010-02-15 07:47:03 +0000
@@ -461,6 +461,7 @@
])
])
mysql_plugin_defs="$mysql_plugin_defs, [builtin_]$2[_plugin]"
+ mariadb_extra_plugin_defs="$mariadb_extra_plugin_defs, [builtin_mariadb_]$2[_plugin]"
[with_plugin_]$2=yes
AC_MSG_RESULT([yes])
m4_ifdef([$11],[
=== modified file 'configure.in'
--- a/configure.in 2009-11-12 04:31:28 +0000
+++ b/configure.in 2010-02-15 07:47:03 +0000
@@ -2842,6 +2842,7 @@
AC_SUBST(mysql_plugin_dirs)
AC_SUBST(mysql_plugin_libs)
AC_SUBST(mysql_plugin_defs)
+AC_SUBST(mariadb_extra_plugin_defs)
# Now that sql_client_dirs and sql_server_dirs are stable, determine the union.
=== modified file 'include/mysql/plugin.h'
--- a/include/mysql/plugin.h 2009-09-07 20:50:10 +0000
+++ b/include/mysql/plugin.h 2010-02-15 07:47:03 +0000
@@ -65,7 +65,10 @@
Plugin API. Common for all plugin types.
*/
+/* MySQL plugin interface version */
#define MYSQL_PLUGIN_INTERFACE_VERSION 0x0100
+/* MariaDB extension interface version */
+#define MARIAEXT_PLUGIN_INTERFACE_VERSION 0x0100
/*
The allowable types of plugins
@@ -86,6 +89,21 @@
#define PLUGIN_LICENSE_GPL_STRING "GPL"
#define PLUGIN_LICENSE_BSD_STRING "BSD"
+/* definitions of code maturity for plugins */
+#define PLUGIN_MATURITY_UNKNOWN 0
+#define PLUGIN_MATURITY_TEST 1
+#define PLUGIN_MATURITY_ALPHA 2
+#define PLUGIN_MATURITY_BETA 3
+#define PLUGIN_MATURITY_GAMMA 4
+#define PLUGIN_MATURITY_RELEASE 5
+
+#define PLUGIN_MATURITY_UNKNOWN_STR "Unknown"
+#define PLUGIN_MATURITY_TEST_STR "Test"
+#define PLUGIN_MATURITY_ALPHA_STR "Alpha"
+#define PLUGIN_MATURITY_BETA_STR "Beta"
+#define PLUGIN_MATURITY_GAMMA_STR "Gamma"
+#define PLUGIN_MATURITY_RELEASE_STR "Release"
+
/*
Macros for beginning and ending plugin declarations. Between
mysql_declare_plugin and mysql_declare_plugin_end there should
@@ -94,15 +112,29 @@
#ifndef MYSQL_DYNAMIC_PLUGIN
+
#define __MYSQL_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
int VERSION= MYSQL_PLUGIN_INTERFACE_VERSION; \
int PSIZE= sizeof(struct st_mysql_plugin); \
struct st_mysql_plugin DECLS[]= {
+
+#define __MARIAEXT_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
+int VERSION= MARIAEXT_PLUGIN_INTERFACE_VERSION; \
+int PSIZE= sizeof(struct st_mariaext_plugin); \
+struct st_mariaext_plugin DECLS[]= {
+
#else
+
#define __MYSQL_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
MYSQL_PLUGIN_EXPORT int _mysql_plugin_interface_version_= MYSQL_PLUGIN_INTERFACE_VERSION; \
MYSQL_PLUGIN_EXPORT int _mysql_sizeof_struct_st_plugin_= sizeof(struct st_mysql_plugin); \
MYSQL_PLUGIN_EXPORT struct st_mysql_plugin _mysql_plugin_declarations_[]= {
+
+#define __MARIAEXT_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
+MYSQL_PLUGIN_EXPORT int _mariaext_plugin_interface_version_= MARIAEXT_PLUGIN_INTERFACE_VERSION; \
+MYSQL_PLUGIN_EXPORT int _mariaext_sizeof_struct_st_plugin_= sizeof(struct st_mariaext_plugin); \
+MYSQL_PLUGIN_EXPORT struct st_mariaext_plugin _mariaext_plugin_declarations_[]= {
+
#endif
#define mysql_declare_plugin(NAME) \
@@ -111,7 +143,14 @@
builtin_ ## NAME ## _sizeof_struct_st_plugin, \
builtin_ ## NAME ## _plugin)
+#define mariaext_declare_plugin(NAME) \
+__MARIAEXT_DECLARE_PLUGIN(NAME, \
+ builtin_mariadb_ ## NAME ## _plugin_interface_version, \
+ builtin_mariadb_ ## NAME ## _sizeof_struct_st_plugin, \
+ builtin_mariadb_ ## NAME ## _plugin)
+
#define mysql_declare_plugin_end ,{0,0,0,0,0,0,0,0,0,0,0,0}}
+#define mariaext_declare_plugin_end ,{0,0}}
/*
declarations for SHOW STATUS support in plugins
@@ -407,6 +446,16 @@
void * __reserved1; /* reserved for dependency checking */
};
+/*
+ MariaDB extension for plugins declaration structure.
+*/
+
+struct st_mariaext_plugin
+{
+ const char *version_info; /* plugin version string */
+ int maturity; /* HA_PLUGIN_MATURITY_XXX */
+};
+
/*************************************************************************
API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
*/
=== modified file 'include/mysql/plugin.h.pp'
--- a/include/mysql/plugin.h.pp 2008-10-10 15:28:41 +0000
+++ b/include/mysql/plugin.h.pp 2010-02-15 07:47:03 +0000
@@ -46,6 +46,11 @@
struct st_mysql_sys_var **system_vars;
void * __reserved1;
};
+struct st_mariaext_plugin
+{
+ const char *version_info;
+ int maturity;
+};
enum enum_ftparser_mode
{
MYSQL_FTPARSER_SIMPLE_MODE= 0,
=== modified file 'mysql-test/r/information_schema.result'
--- a/mysql-test/r/information_schema.result 2009-10-19 17:14:48 +0000
+++ b/mysql-test/r/information_schema.result 2010-02-15 07:47:03 +0000
@@ -1175,7 +1175,7 @@
group by column_type order by num;
column_type group_concat(table_schema, '.', table_name) num
varchar(27) information_schema.COLUMNS 1
-varchar(7) information_schema.ROUTINES,information_schema.VIEWS 2
+varchar(7) information_schema.PLUGINS,information_schema.ROUTINES,information_schema.VIEWS 3
varchar(20) information_schema.FILES,information_schema.FILES,information_schema.PLUGINS,information_schema.PLUGINS,information_schema.PLUGINS,information_schema.PROFILING 6
create table t1(f1 char(1) not null, f2 char(9) not null)
default character set utf8;
=== modified file 'plugin/daemon_example/daemon_example.cc'
--- a/plugin/daemon_example/daemon_example.cc 2007-06-27 14:49:12 +0000
+++ b/plugin/daemon_example/daemon_example.cc 2010-02-15 07:47:03 +0000
@@ -200,3 +200,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(daemon_example)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_TEST /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'plugin/fulltext/plugin_example.c'
--- a/plugin/fulltext/plugin_example.c 2007-04-26 19:26:04 +0000
+++ b/plugin/fulltext/plugin_example.c 2010-02-15 07:47:03 +0000
@@ -270,4 +270,10 @@
NULL
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(ftexample)
+{
+ "0.01", /* string version */
+ PLUGIN_MATURITY_TEST /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'sql/ha_ndbcluster.cc'
--- a/sql/ha_ndbcluster.cc 2009-09-07 20:50:10 +0000
+++ b/sql/ha_ndbcluster.cc 2010-02-15 07:47:03 +0000
@@ -10561,5 +10561,11 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(ndbcluster)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_BETA /* maturity */
+}
+mariaext_declare_plugin_end;
#endif
=== modified file 'sql/ha_partition.cc'
--- a/sql/ha_partition.cc 2009-11-12 04:31:28 +0000
+++ b/sql/ha_partition.cc 2010-02-15 07:47:03 +0000
@@ -6510,5 +6510,11 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(partition)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
#endif
=== modified file 'sql/log.cc'
--- a/sql/log.cc 2009-11-12 04:31:28 +0000
+++ b/sql/log.cc 2010-02-15 07:47:03 +0000
@@ -5795,3 +5795,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(binlog)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'sql/sql_builtin.cc.in'
--- a/sql/sql_builtin.cc.in 2006-12-31 01:29:11 +0000
+++ b/sql/sql_builtin.cc.in 2010-02-15 07:47:03 +0000
@@ -16,6 +16,7 @@
#include <mysql/plugin.h>
typedef struct st_mysql_plugin builtin_plugin[];
+typedef struct st_mariaext_plugin builtin_mariadb_plugin[];
extern builtin_plugin
builtin_binlog_plugin@mysql_plugin_defs@;
@@ -25,3 +26,10 @@
builtin_binlog_plugin@mysql_plugin_defs@,(struct st_mysql_plugin *)0
};
+extern builtin_mariadb_plugin
+ builtin_mariadb_binlog_plugin@mariadb_extra_plugin_defs@;
+
+struct st_mariaext_plugin *mysqld_bltnmexts[]=
+{
+ builtin_mariadb_binlog_plugin@mariadb_extra_plugin_defs@,(struct st_mariaext_plugin *)0
+};
=== modified file 'sql/sql_plugin.cc'
--- a/sql/sql_plugin.cc 2009-11-12 04:31:28 +0000
+++ b/sql/sql_plugin.cc 2010-02-15 07:47:03 +0000
@@ -28,6 +28,9 @@
#endif
extern struct st_mysql_plugin *mysqld_builtins[];
+extern struct st_mariaext_plugin *mysqld_bltnmexts[];
+static st_mariaext_plugin no_mariaext[2]= {{"Unknown", 0}, {0, 0}};
+static st_mariaext_plugin *empty_mariaext= no_mariaext;
/**
@note The order of the enumeration is critical.
@@ -82,6 +85,14 @@
"_mysql_sizeof_struct_st_plugin_";
static const char *plugin_declarations_sym= "_mysql_plugin_declarations_";
static int min_plugin_interface_version= MYSQL_PLUGIN_INTERFACE_VERSION & ~0xFF;
+static const char *mariaext_plugin_interface_version_sym=
+ "_mariaext_plugin_interface_version_";
+static const char *mariaext_sizeof_st_plugin_sym=
+ "_mariaext_sizeof_struct_st_plugin_";
+static const char *mariaext_plugin_declarations_sym=
+ "_mariaext_plugin_declarations_";
+static int min_mariaext_plugin_interface_version=
+ MARIAEXT_PLUGIN_INTERFACE_VERSION & ~0xFF;
#endif
/* Note that 'int version' must be the first field of every plugin
@@ -352,6 +363,7 @@
char dlpath[FN_REFLEN];
uint plugin_dir_len, dummy_errors, dlpathlen;
struct st_plugin_dl *tmp, plugin_dl;
+ struct st_mariaext_plugin *mariaext= no_mariaext;
void *sym;
DBUG_ENTER("plugin_dl_add");
plugin_dir_len= strlen(opt_plugin_dir);
@@ -507,6 +519,15 @@
files_charset_info, dl->str, dl->length, system_charset_info,
&dummy_errors);
plugin_dl.dl.str[plugin_dl.dl.length]= 0;
+
+ if ((sym= dlsym(plugin_dl.handle, mariaext_plugin_interface_version_sym)) &&
+ (*(int *)sym == MARIAEXT_PLUGIN_INTERFACE_VERSION) &&
+ (sym= dlsym(plugin_dl.handle, mariaext_plugin_declarations_sym)))
+ {
+ mariaext= (struct st_mariaext_plugin *) sym;
+ }
+ plugin_dl.mariaext= mariaext;
+
/* Add this dll to array */
if (! (tmp= plugin_dl_insert_or_reuse(&plugin_dl)))
{
@@ -719,6 +740,7 @@
{
struct st_plugin_int tmp;
struct st_mysql_plugin *plugin;
+ struct st_mariaext_plugin *ext, *mariaext;
DBUG_ENTER("plugin_add");
if (plugin_find_internal(name, MYSQL_ANY_PLUGIN))
{
@@ -732,9 +754,22 @@
bzero((char*) &tmp, sizeof(tmp));
if (! (tmp.plugin_dl= plugin_dl_add(dl, report)))
DBUG_RETURN(TRUE);
+
/* Find plugin by name */
- for (plugin= tmp.plugin_dl->plugins; plugin->info; plugin++)
+ for (plugin= tmp.plugin_dl->plugins, ext= tmp.plugin_dl->mariaext;
+ plugin->info;
+ plugin++, ext++)
{
+ mariaext= ext;
+ if (!ext->version_info)
+ {
+ /*
+ Plugin didn't have any MariaDB extensions; use the default one and
+ reset the counter to do the same for the next internal plugin.
+ */
+ mariaext= empty_mariaext;
+ ext--;
+ }
uint name_len= strlen(plugin->name);
if (plugin->type >= 0 && plugin->type < MYSQL_MAX_PLUGIN_TYPE_NUM &&
! my_strnncoll(system_charset_info,
@@ -759,6 +794,7 @@
goto err;
}
tmp.plugin= plugin;
+ tmp.mariaext= mariaext;
tmp.name.str= (char *)plugin->name;
tmp.name.length= name_len;
tmp.ref_count= 0;
@@ -1121,7 +1157,9 @@
uint i;
bool is_myisam;
struct st_mysql_plugin **builtins;
+ struct st_mariaext_plugin **bltnmexts;
struct st_mysql_plugin *plugin;
+ struct st_mariaext_plugin *ext, *mariaext;
struct st_plugin_int tmp, *plugin_ptr, **reap;
MEM_ROOT tmp_root;
bool reaped_mandatory_plugin= FALSE;
@@ -1160,10 +1198,29 @@
/*
First we register builtin plugins
*/
- for (builtins= mysqld_builtins; *builtins; builtins++)
+ for (builtins= mysqld_builtins, bltnmexts= mysqld_bltnmexts;
+ *builtins;
+ builtins++, bltnmexts++)
{
- for (plugin= *builtins; plugin->info; plugin++)
+ /* there must be as many extension arrays as static plugin arrays */
+ DBUG_ASSERT(*bltnmexts);
+ for (plugin= *builtins, ext= *bltnmexts;
+ plugin->info;
+ plugin++, ext++)
{
+
+ /* in case the plugin describes fewer extensions than plugins */
+ mariaext= ext;
+ if (!ext->version_info)
+ {
+ /*
+ Plugin didn't have any MariaDB extensions; use the default one and
+ reset the counter to do the same for the next internal plugin.
+ */
+ mariaext= empty_mariaext;
+ ext--;
+ }
+
if (opt_ignore_builtin_innodb &&
!my_strnncoll(&my_charset_latin1, (const uchar*) plugin->name,
6, (const uchar*) "InnoDB", 6))
@@ -1186,6 +1243,7 @@
#endif
bzero(&tmp, sizeof(tmp));
tmp.plugin= plugin;
+ tmp.mariaext= mariaext;
tmp.name.str= (char *)plugin->name;
tmp.name.length= strlen(plugin->name);
tmp.state= 0;
=== modified file 'sql/sql_plugin.h'
--- a/sql/sql_plugin.h 2009-05-14 12:03:33 +0000
+++ b/sql/sql_plugin.h 2010-02-15 07:47:03 +0000
@@ -63,6 +63,7 @@
LEX_STRING dl;
void *handle;
struct st_mysql_plugin *plugins;
+ struct st_mariaext_plugin *mariaext;
int version;
uint ref_count; /* number of plugins loaded from the library */
};
@@ -74,6 +75,7 @@
LEX_STRING name;
struct st_mysql_plugin *plugin;
struct st_plugin_dl *plugin_dl;
+ struct st_mariaext_plugin *mariaext;
uint state;
uint ref_count; /* number of threads using the plugin */
void *data; /* plugin type specific, e.g. handlerton */
@@ -95,6 +97,7 @@
#define plugin_name(pi) (&((pi)->name))
#define plugin_state(pi) ((pi)->state)
#define plugin_equals(p1,p2) ((p1) == (p2))
+#define plugin_ext(pi) ((pi)->mariaext)
#else
typedef struct st_plugin_int **plugin_ref;
#define plugin_decl(pi) ((pi)[0]->plugin)
@@ -103,6 +106,8 @@
#define plugin_name(pi) (&((pi)[0]->name))
#define plugin_state(pi) ((pi)[0]->state)
#define plugin_equals(p1,p2) ((p1) && (p2) && (p1)[0] == (p2)[0])
+#define plugin_ext(pi) ((pi)[0]->mariaext)
+
#endif
typedef int (*plugin_type_init)(struct st_plugin_int *);
=== modified file 'sql/sql_show.cc'
--- a/sql/sql_show.cc 2009-11-12 04:31:28 +0000
+++ b/sql/sql_show.cc 2010-02-15 07:47:03 +0000
@@ -94,12 +94,21 @@
return my_snprintf(buf, buf_length, "%d.%d", version>>8,version&0xff);
}
+static const LEX_STRING maturity_name[]={
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_UNKNOWN_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_TEST_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_ALPHA_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_BETA_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_GAMMA_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_RELEASE_STR) }};
+
static my_bool show_plugins(THD *thd, plugin_ref plugin,
void *arg)
{
TABLE *table= (TABLE*) arg;
struct st_mysql_plugin *plug= plugin_decl(plugin);
struct st_plugin_dl *plugin_dl= plugin_dlib(plugin);
+ struct st_mariaext_plugin *mariaext= plugin_ext(plugin);
CHARSET_INFO *cs= system_charset_info;
char version_buf[20];
@@ -186,6 +195,26 @@
}
table->field[9]->set_notnull();
+ if ((uint) mariaext->maturity <= PLUGIN_MATURITY_RELEASE)
+ table->field[10]->store(maturity_name[mariaext->maturity].str,
+ maturity_name[mariaext->maturity].length,
+ cs);
+ else
+ {
+ DBUG_ASSERT(0);
+ table->field[10]->store("Unknown", 7, cs);
+ }
+ table->field[10]->set_notnull();
+
+ if (mariaext->version_info)
+ {
+ table->field[11]->store(mariaext->version_info,
+ strlen(mariaext->version_info), cs);
+ table->field[11]->set_notnull();
+ }
+ else
+ table->field[11]->set_null();
+
return schema_table_store_record(thd, table);
}
@@ -6990,6 +7019,8 @@
{"PLUGIN_AUTHOR", NAME_CHAR_LEN, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
{"PLUGIN_DESCRIPTION", 65535, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
{"PLUGIN_LICENSE", 80, MYSQL_TYPE_STRING, 0, 1, "License", SKIP_OPEN_TABLE},
+ {"PLUGIN_MATURITY", 7, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
+ {"PLUGIN_AUTH_VERSION", 80, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
{0, 0, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE}
};
=== modified file 'storage/archive/ha_archive.cc'
--- a/storage/archive/ha_archive.cc 2009-09-07 20:50:10 +0000
+++ b/storage/archive/ha_archive.cc 2010-02-15 07:47:03 +0000
@@ -1642,4 +1642,10 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(archive)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/blackhole/ha_blackhole.cc'
--- a/storage/blackhole/ha_blackhole.cc 2008-11-10 20:21:49 +0000
+++ b/storage/blackhole/ha_blackhole.cc 2010-02-15 07:47:03 +0000
@@ -369,3 +369,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(blackhole)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/csv/ha_tina.cc'
--- a/storage/csv/ha_tina.cc 2009-04-25 10:05:32 +0000
+++ b/storage/csv/ha_tina.cc 2010-02-15 07:47:03 +0000
@@ -1636,4 +1636,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
-
+mariaext_declare_plugin(csv)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/example/ha_example.cc'
--- a/storage/example/ha_example.cc 2008-02-24 13:12:17 +0000
+++ b/storage/example/ha_example.cc 2010-02-15 07:47:03 +0000
@@ -906,3 +906,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(example)
+{
+ "0.1", /* string version */
+ PLUGIN_MATURITY_TEST /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/federated/ha_federated.cc'
--- a/storage/federated/ha_federated.cc 2009-09-07 20:50:10 +0000
+++ b/storage/federated/ha_federated.cc 2010-02-15 07:47:03 +0000
@@ -3379,3 +3379,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(federated)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_BETA /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/federatedx/ha_federatedx.cc'
--- a/storage/federatedx/ha_federatedx.cc 2009-11-03 11:08:09 +0000
+++ b/storage/federatedx/ha_federatedx.cc 2010-02-15 07:47:03 +0000
@@ -3485,9 +3485,15 @@
PLUGIN_LICENSE_GPL,
federatedx_db_init, /* Plugin Init */
federatedx_done, /* Plugin Deinit */
- 0x0100 /* 1.0 */,
+ 0x0200 /* 2.0 */,
NULL, /* status variables */
NULL, /* system variables */
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(federated)
+{
+ "2.0", /* string version */
+ PLUGIN_MATURITY_BETA /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/heap/ha_heap.cc'
--- a/storage/heap/ha_heap.cc 2009-09-07 20:50:10 +0000
+++ b/storage/heap/ha_heap.cc 2010-02-15 07:47:03 +0000
@@ -767,3 +767,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(heap)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/ibmdb2i/ha_ibmdb2i.cc'
--- a/storage/ibmdb2i/ha_ibmdb2i.cc 2009-07-08 09:10:01 +0000
+++ b/storage/ibmdb2i/ha_ibmdb2i.cc 2010-02-15 07:47:03 +0000
@@ -3357,3 +3357,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(ibmdb2i)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_UNKNOWN /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/innobase/handler/ha_innodb.cc'
--- a/storage/innobase/handler/ha_innodb.cc 2009-10-16 22:57:48 +0000
+++ b/storage/innobase/handler/ha_innodb.cc 2010-02-15 07:47:03 +0000
@@ -8684,6 +8684,12 @@
NULL /* reserved */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(innobase)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
/** @brief Initialize the default value of innodb_commit_concurrency.
=== modified file 'storage/innodb_plugin/handler/ha_innodb.cc'
--- a/storage/innodb_plugin/handler/ha_innodb.cc 2009-08-04 08:02:48 +0000
+++ b/storage/innodb_plugin/handler/ha_innodb.cc 2010-02-15 07:47:03 +0000
@@ -10032,6 +10032,12 @@
i_s_innodb_cmpmem,
i_s_innodb_cmpmem_reset
mysql_declare_plugin_end;
+mariaext_declare_plugin(innodb_plugin)
+{
+ INNODB_VERSION_STR, /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
/** @brief Initialize the default value of innodb_commit_concurrency.
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-10-26 11:35:42 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-15 07:47:03 +0000
@@ -3346,9 +3346,15 @@
PLUGIN_LICENSE_GPL,
ha_maria_init, /* Plugin Init */
NULL, /* Plugin Deinit */
- 0x0100, /* 1.0 */
+ 0x0105, /* 1.5 */
status_variables, /* status variables */
system_variables, /* system variables */
NULL
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(maria)
+{
+ "1.5", /* string version */
+ PLUGIN_MATURITY_GAMMA /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/myisam/ha_myisam.cc'
--- a/storage/myisam/ha_myisam.cc 2009-10-17 19:12:28 +0000
+++ b/storage/myisam/ha_myisam.cc 2010-02-15 07:47:03 +0000
@@ -2183,6 +2183,12 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(myisam)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
#ifdef HAVE_QUERY_CACHE
=== modified file 'storage/myisammrg/ha_myisammrg.cc'
--- a/storage/myisammrg/ha_myisammrg.cc 2009-10-15 21:38:29 +0000
+++ b/storage/myisammrg/ha_myisammrg.cc 2010-02-15 07:47:03 +0000
@@ -1289,3 +1289,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(myisammrg)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/pbxt/src/ha_pbxt.cc'
--- a/storage/pbxt/src/ha_pbxt.cc 2009-09-03 06:15:03 +0000
+++ b/storage/pbxt/src/ha_pbxt.cc 2010-02-15 07:47:03 +0000
@@ -5507,6 +5507,18 @@
drizzle_declare_plugin_end;
#else
mysql_declare_plugin_end;
+#ifdef MARIADB_BASE_VERSION
+mariaext_declare_plugin(pbxt)
+{ /* PBXT */
+ "1.0.09g RC3", /* string version */
+ PLUGIN_MATURITY_GAMMA /* maturity */
+},
+{ /* PBXT_STATISTICS */
+ "1.0.09g RC3", /* string version */
+ PLUGIN_MATURITY_GAMMA /* maturity */
+}
+mariaext_declare_plugin_end;
+#endif
#endif
#if defined(XT_WIN) && defined(XT_COREDUMP)
=== modified file 'storage/xtradb/handler/ha_innodb.cc'
--- a/storage/xtradb/handler/ha_innodb.cc 2009-10-16 22:57:48 +0000
+++ b/storage/xtradb/handler/ha_innodb.cc 2010-02-15 07:47:03 +0000
@@ -10540,6 +10540,69 @@
i_s_innodb_index_stats,
i_s_innodb_patches
mysql_declare_plugin_end;
+mariaext_declare_plugin(innobase)
+{ /* InnoDB */
+ INNODB_VERSION_STR, /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+},
+{ /* INNODB_RSEG */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_BUFFER_POOL_PAGES */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_BUFFER_POOL_PAGES_INDEX */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_BUFFER_POOL_PAGES_BLOB */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_TRX */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_LOCKS */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_LOCK_WAITS */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMP */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMP_RESET */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMPMEM */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMPMEM_RESET */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_TABLE_STATS */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_INDEX_STATS */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* XTRADB_ENHANCEMENTS */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+}
+mariaext_declare_plugin_end;
+
/** @brief Initialize the default value of innodb_commit_concurrency.
Re: [Maria-developers] New (by Igor): Partitioned Key Cache for MyISAM (85)
by Arjen Lentz 15 Feb '10
Hi Igor
On 15/02/2010, at 10:55 AM, Igor Babaev wrote:
>> Great idea, but it needs accessible stats to make it
>> usable.
>>
>> Right now we can already have multiple key caches, but there's no
>> way to see their stats, which makes it very difficult to use in a
>> production environment - I prefer not to have to guess how well a
>> cache is doing ;-)
>>
>> Please
>> - FIRST figure out a way to make stats for multiple key cache visible
>> - implement that for the current multi keycaches
>> - then add the partitioned key cache feature
>
> Arjen,
>
> See the patch attached to the WL task.
Ok so we'd get an INFORMATION_SCHEMA.KEYCACHE
Or is it PERFORMANCE_SCHEMA?
Anyway that's good; will that also show the multiple keycaches
(current implementation)?
That would be great!
Regards,
Arjen.
--
Arjen Lentz, Exec.Director @ Open Query (http://openquery.com)
Exceptional Services for MySQL at a fixed budget.
Follow our blog at http://openquery.com/blog/
OurDelta: packages for MySQL and MariaDB @ http://ourdelta.org
[Maria-developers] New (by Igor): Partitioned Key Cache for MyISAM (85)
by worklog-noreply@askmonty.org 15 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:10
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-BackLog
TASK ID........: 85 (http://askmonty.org/worklog/?tid=85)
VERSION........: Benchmarks-3.0
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for a given page. The function that maps pages into partitions takes care
of even distribution of pages among partitions.
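For illustration, a minimal sketch of such a mapping function; the hash
below is one plausible choice, not the one from the contributed patch:
#include <cstdint>
/*
  A page is identified by its file number and its position in the file,
  so a given page always maps to the same partition.
*/
inline uint32_t key_cache_partition(uint32_t file, uint64_t pos,
                                    uint32_t block_length,
                                    uint32_t n_partitions)
{
  uint64_t page_no= pos / block_length;
  return (uint32_t) ((file + page_no) % n_partitions);
}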
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it even
when they have acquired shared locks on the file and the pages they want to
read are already in the key cache buffers. When working with a partitioned key
cache, any key cache interface function that needs only one page has to acquire
the key cache lock only for the partition the page belongs to. This reduces the
chance that threads compete for the same key cache lock.
The idea and the original implementation of the partitioned key cache were
provided by one of our external contributors.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Rev 2734: Maria WL#61 in file:///Users/bell/maria/bzr/work-maria-5.2-engine/
by sanja@askmonty.org 15 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-engine/
------------------------------------------------------------
revno: 2734
revision-id: sanja(a)askmonty.org-20100215001047-8cqnklgiv1pj3sa1
parent: psergey(a)askmonty.org-20091202142609-18bp41q8mejxl47t
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-engine
timestamp: Mon 2010-02-15 02:10:47 +0200
message:
Maria WL#61
Interface for Maria extensions.
Additional information about plugins (maturity and version string) exposed through the Maria extension interface.
=== modified file 'CMakeLists.txt'
--- a/CMakeLists.txt 2009-10-03 19:24:13 +0000
+++ b/CMakeLists.txt 2010-02-15 00:10:47 +0000
@@ -251,6 +251,7 @@
IF (ENGINE_BUILD_TYPE STREQUAL "STATIC")
SET (mysql_plugin_defs "${mysql_plugin_defs},builtin_${PLUGIN_NAME}_plugin")
+ SET (mariadb_extra_plugin_defs "${mariadb_extra_plugin_defs},builtin_mariadb_${PLUGIN_NAME}_plugin")
SET (MYSQLD_STATIC_ENGINE_LIBS ${MYSQLD_STATIC_ENGINE_LIBS} ${PLUGIN_NAME})
SET (STORAGE_ENGINE_DEFS "${STORAGE_ENGINE_DEFS} -DWITH_${ENGINE}_STORAGE_ENGINE")
SET (WITH_${ENGINE}_STORAGE_ENGINE TRUE)
@@ -269,6 +270,7 @@
IF(NOT WITHOUT_PARTITION_STORAGE_ENGINE)
SET (STORAGE_ENGINE_DEFS "${STORAGE_ENGINE_DEFS} -DWITH_PARTITION_STORAGE_ENGINE")
SET (mysql_plugin_defs "${mysql_plugin_defs},builtin_partition_plugin")
+ SET (mariadb_extra_plugin_defs "${mariadb_extra_plugin_defs},builtin_mariadb_partition_plugin")
ENDIF(NOT WITHOUT_PARTITION_STORAGE_ENGINE)
# Special handling for tmp tables with the maria engine
=== modified file 'config/ac-macros/plugins.m4'
--- a/config/ac-macros/plugins.m4 2009-04-25 10:05:32 +0000
+++ b/config/ac-macros/plugins.m4 2010-02-15 00:10:47 +0000
@@ -461,6 +461,7 @@
])
])
mysql_plugin_defs="$mysql_plugin_defs, [builtin_]$2[_plugin]"
+ mariadb_extra_plugin_defs="$mariadb_extra_plugin_defs, [builtin_mariadb_]$2[_plugin]"
[with_plugin_]$2=yes
AC_MSG_RESULT([yes])
m4_ifdef([$11],[
=== modified file 'configure.in'
--- a/configure.in 2009-11-12 04:31:28 +0000
+++ b/configure.in 2010-02-15 00:10:47 +0000
@@ -2842,6 +2842,7 @@
AC_SUBST(mysql_plugin_dirs)
AC_SUBST(mysql_plugin_libs)
AC_SUBST(mysql_plugin_defs)
+AC_SUBST(mariadb_extra_plugin_defs)
# Now that sql_client_dirs and sql_server_dirs are stable, determine the union.
=== modified file 'include/mysql/plugin.h'
--- a/include/mysql/plugin.h 2009-09-07 20:50:10 +0000
+++ b/include/mysql/plugin.h 2010-02-15 00:10:47 +0000
@@ -65,7 +65,10 @@
Plugin API. Common for all plugin types.
*/
+/* MySQL plugin interface version */
#define MYSQL_PLUGIN_INTERFACE_VERSION 0x0100
+/* MariaDB extension interface version */
+#define MARIAEXT_PLUGIN_INTERFACE_VERSION 0x0100
/*
The allowable types of plugins
@@ -86,6 +89,21 @@
#define PLUGIN_LICENSE_GPL_STRING "GPL"
#define PLUGIN_LICENSE_BSD_STRING "BSD"
+/* definitions of code maturity for plugins */
+#define PLUGIN_MATURITY_UNKNOWN 0
+#define PLUGIN_MATURITY_TEST 1
+#define PLUGIN_MATURITY_ALPHA 2
+#define PLUGIN_MATURITY_BETA 3
+#define PLUGIN_MATURITY_GAMMA 4
+#define PLUGIN_MATURITY_RELEASE 5
+
+#define PLUGIN_MATURITY_UNKNOWN_STR "Unknown"
+#define PLUGIN_MATURITY_TEST_STR "Test"
+#define PLUGIN_MATURITY_ALPHA_STR "Alpha"
+#define PLUGIN_MATURITY_BETA_STR "Beta"
+#define PLUGIN_MATURITY_GAMMA_STR "Gamma"
+#define PLUGIN_MATURITY_RELEASE_STR "Release"
+
/*
Macros for beginning and ending plugin declarations. Between
mysql_declare_plugin and mysql_declare_plugin_end there should
@@ -94,15 +112,29 @@
#ifndef MYSQL_DYNAMIC_PLUGIN
+
#define __MYSQL_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
int VERSION= MYSQL_PLUGIN_INTERFACE_VERSION; \
int PSIZE= sizeof(struct st_mysql_plugin); \
struct st_mysql_plugin DECLS[]= {
+
+#define __MARIAEXT_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
+int VERSION= MARIAEXT_PLUGIN_INTERFACE_VERSION; \
+int PSIZE= sizeof(struct st_mariaext_plugin); \
+struct st_mariaext_plugin DECLS[]= {
+
#else
+
#define __MYSQL_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
MYSQL_PLUGIN_EXPORT int _mysql_plugin_interface_version_= MYSQL_PLUGIN_INTERFACE_VERSION; \
MYSQL_PLUGIN_EXPORT int _mysql_sizeof_struct_st_plugin_= sizeof(struct st_mysql_plugin); \
MYSQL_PLUGIN_EXPORT struct st_mysql_plugin _mysql_plugin_declarations_[]= {
+
+#define __MARIAEXT_DECLARE_PLUGIN(NAME, VERSION, PSIZE, DECLS) \
+MYSQL_PLUGIN_EXPORT int _mariaext_plugin_interface_version_= MARIAEXT_PLUGIN_INTERFACE_VERSION; \
+MYSQL_PLUGIN_EXPORT int _mariaext_sizeof_struct_st_plugin_= sizeof(struct st_mariaext_plugin); \
+MYSQL_PLUGIN_EXPORT struct st_mariaext_plugin _mariaext_plugin_declarations_[]= {
+
#endif
#define mysql_declare_plugin(NAME) \
@@ -111,7 +143,14 @@
builtin_ ## NAME ## _sizeof_struct_st_plugin, \
builtin_ ## NAME ## _plugin)
+#define mariaext_declare_plugin(NAME) \
+__MARIAEXT_DECLARE_PLUGIN(NAME, \
+ builtin_mariadb_ ## NAME ## _plugin_interface_version, \
+ builtin_mariadb_ ## NAME ## _sizeof_struct_st_plugin, \
+ builtin_mariadb_ ## NAME ## _plugin)
+
#define mysql_declare_plugin_end ,{0,0,0,0,0,0,0,0,0,0,0,0}}
+#define mariaext_declare_plugin_end ,{0,0}}
/*
declarations for SHOW STATUS support in plugins
@@ -407,6 +446,16 @@
void * __reserved1; /* reserved for dependency checking */
};
+/*
+ MariaDB extension for plugins declaration structure.
+*/
+
+struct st_mariaext_plugin
+{
+ const char *version_info; /* plugin version string */
+ int maturity; /* HA_PLUGIN_MATURITY_XXX */
+};
+
/*************************************************************************
API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
*/
=== modified file 'include/mysql/plugin.h.pp'
--- a/include/mysql/plugin.h.pp 2008-10-10 15:28:41 +0000
+++ b/include/mysql/plugin.h.pp 2010-02-15 00:10:47 +0000
@@ -46,6 +46,11 @@
struct st_mysql_sys_var **system_vars;
void * __reserved1;
};
+struct st_mariaext_plugin
+{
+ const char *version_info;
+ int maturity;
+};
enum enum_ftparser_mode
{
MYSQL_FTPARSER_SIMPLE_MODE= 0,
=== modified file 'plugin/daemon_example/daemon_example.cc'
--- a/plugin/daemon_example/daemon_example.cc 2007-06-27 14:49:12 +0000
+++ b/plugin/daemon_example/daemon_example.cc 2010-02-15 00:10:47 +0000
@@ -200,3 +200,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(daemon_example)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_TEST /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'plugin/fulltext/plugin_example.c'
--- a/plugin/fulltext/plugin_example.c 2007-04-26 19:26:04 +0000
+++ b/plugin/fulltext/plugin_example.c 2010-02-15 00:10:47 +0000
@@ -270,4 +270,10 @@
NULL
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(ftexample)
+{
+ "0.01", /* string version */
+ PLUGIN_MATURITY_TEST /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'sql/ha_ndbcluster.cc'
--- a/sql/ha_ndbcluster.cc 2009-09-07 20:50:10 +0000
+++ b/sql/ha_ndbcluster.cc 2010-02-15 00:10:47 +0000
@@ -10561,5 +10561,11 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(ndbcluster)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_BETA /* maturity */
+}
+mariaext_declare_plugin_end;
#endif
=== modified file 'sql/ha_partition.cc'
--- a/sql/ha_partition.cc 2009-11-12 04:31:28 +0000
+++ b/sql/ha_partition.cc 2010-02-15 00:10:47 +0000
@@ -6510,5 +6510,11 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(partition)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
#endif
=== modified file 'sql/log.cc'
--- a/sql/log.cc 2009-11-12 04:31:28 +0000
+++ b/sql/log.cc 2010-02-15 00:10:47 +0000
@@ -5795,3 +5795,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(binlog)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'sql/sql_builtin.cc.in'
--- a/sql/sql_builtin.cc.in 2006-12-31 01:29:11 +0000
+++ b/sql/sql_builtin.cc.in 2010-02-15 00:10:47 +0000
@@ -16,6 +16,7 @@
#include <mysql/plugin.h>
typedef struct st_mysql_plugin builtin_plugin[];
+typedef struct st_mariaext_plugin builtin_mariadb_plugin[];
extern builtin_plugin
builtin_binlog_plugin@mysql_plugin_defs@;
@@ -25,3 +26,10 @@
builtin_binlog_plugin@mysql_plugin_defs@,(struct st_mysql_plugin *)0
};
+extern builtin_mariadb_plugin
+ builtin_mariadb_binlog_plugin@mariadb_extra_plugin_defs@;
+
+struct st_mariaext_plugin *mysqld_bltnmexts[]=
+{
+ builtin_mariadb_binlog_plugin@mariadb_extra_plugin_defs@,(struct st_mariaext_plugin *)0
+};
=== modified file 'sql/sql_plugin.cc'
--- a/sql/sql_plugin.cc 2009-11-12 04:31:28 +0000
+++ b/sql/sql_plugin.cc 2010-02-15 00:10:47 +0000
@@ -28,6 +28,9 @@
#endif
extern struct st_mysql_plugin *mysqld_builtins[];
+extern struct st_mariaext_plugin *mysqld_bltnmexts[];
+static st_mariaext_plugin no_mariaext[2]= {{"Unknown", 0}, {0, 0}};
+static st_mariaext_plugin *empty_mariaext= no_mariaext;
/**
@note The order of the enumeration is critical.
@@ -82,6 +85,14 @@
"_mysql_sizeof_struct_st_plugin_";
static const char *plugin_declarations_sym= "_mysql_plugin_declarations_";
static int min_plugin_interface_version= MYSQL_PLUGIN_INTERFACE_VERSION & ~0xFF;
+static const char *mariaext_plugin_interface_version_sym=
+ "_mariaext_plugin_interface_version_";
+static const char *mariaext_sizeof_st_plugin_sym=
+ "_mariaext_sizeof_struct_st_plugin_";
+static const char *mariaext_plugin_declarations_sym=
+ "_mariaext_plugin_declarations_";
+static int min_mariaext_plugin_interface_version=
+ MARIAEXT_PLUGIN_INTERFACE_VERSION & ~0xFF;
#endif
/* Note that 'int version' must be the first field of every plugin
@@ -352,6 +363,7 @@
char dlpath[FN_REFLEN];
uint plugin_dir_len, dummy_errors, dlpathlen;
struct st_plugin_dl *tmp, plugin_dl;
+ struct st_mariaext_plugin *mariaext= no_mariaext;
void *sym;
DBUG_ENTER("plugin_dl_add");
plugin_dir_len= strlen(opt_plugin_dir);
@@ -507,6 +519,15 @@
files_charset_info, dl->str, dl->length, system_charset_info,
&dummy_errors);
plugin_dl.dl.str[plugin_dl.dl.length]= 0;
+
+ if ((sym= dlsym(plugin_dl.handle, mariaext_plugin_interface_version_sym)) &&
+ (*(int *)sym == MARIAEXT_PLUGIN_INTERFACE_VERSION) &&
+ (sym= dlsym(plugin_dl.handle, mariaext_plugin_declarations_sym)))
+ {
+ mariaext= (struct st_mariaext_plugin *) sym;
+ }
+ plugin_dl.mariaext= mariaext;
+
/* Add this dll to array */
if (! (tmp= plugin_dl_insert_or_reuse(&plugin_dl)))
{
@@ -719,6 +740,7 @@
{
struct st_plugin_int tmp;
struct st_mysql_plugin *plugin;
+ struct st_mariaext_plugin *ext, *mariaext;
DBUG_ENTER("plugin_add");
if (plugin_find_internal(name, MYSQL_ANY_PLUGIN))
{
@@ -732,9 +754,22 @@
bzero((char*) &tmp, sizeof(tmp));
if (! (tmp.plugin_dl= plugin_dl_add(dl, report)))
DBUG_RETURN(TRUE);
+
/* Find plugin by name */
- for (plugin= tmp.plugin_dl->plugins; plugin->info; plugin++)
+ for (plugin= tmp.plugin_dl->plugins, ext= tmp.plugin_dl->mariaext;
+ plugin->info;
+ plugin++, ext++)
{
+ mariaext= ext;
+ if (!ext->version_info)
+ {
+ /*
+ Plugin didn't have any MariaDB extensions; use the default one and
+ reset the counter to do the same for the next internal plugin.
+ */
+ mariaext= empty_mariaext;
+ ext--;
+ }
uint name_len= strlen(plugin->name);
if (plugin->type >= 0 && plugin->type < MYSQL_MAX_PLUGIN_TYPE_NUM &&
! my_strnncoll(system_charset_info,
@@ -759,6 +794,7 @@
goto err;
}
tmp.plugin= plugin;
+ tmp.mariaext= mariaext;
tmp.name.str= (char *)plugin->name;
tmp.name.length= name_len;
tmp.ref_count= 0;
@@ -1121,7 +1157,9 @@
uint i;
bool is_myisam;
struct st_mysql_plugin **builtins;
+ struct st_mariaext_plugin **bltnmexts;
struct st_mysql_plugin *plugin;
+ struct st_mariaext_plugin *ext, *mariaext;
struct st_plugin_int tmp, *plugin_ptr, **reap;
MEM_ROOT tmp_root;
bool reaped_mandatory_plugin= FALSE;
@@ -1160,10 +1198,29 @@
/*
First we register builtin plugins
*/
- for (builtins= mysqld_builtins; *builtins; builtins++)
+ for (builtins= mysqld_builtins, bltnmexts= mysqld_bltnmexts;
+ *builtins;
+ builtins++, bltnmexts++)
{
- for (plugin= *builtins; plugin->info; plugin++)
+ /* there must be as many extension arrays as static plugin arrays */
+ DBUG_ASSERT(*bltnmexts);
+ for (plugin= *builtins, ext= *bltnmexts;
+ plugin->info;
+ plugin++, ext++)
{
+
+ /* in case the plugin describes fewer extensions than plugins */
+ mariaext= ext;
+ if (!ext->version_info)
+ {
+ /*
+ Plugin didn't have any MariaDB extensions; use the default one and
+ reset the counter to do the same for the next internal plugin.
+ */
+ mariaext= empty_mariaext;
+ ext--;
+ }
+
if (opt_ignore_builtin_innodb &&
!my_strnncoll(&my_charset_latin1, (const uchar*) plugin->name,
6, (const uchar*) "InnoDB", 6))
@@ -1186,6 +1243,7 @@
#endif
bzero(&tmp, sizeof(tmp));
tmp.plugin= plugin;
+ tmp.mariaext= mariaext;
tmp.name.str= (char *)plugin->name;
tmp.name.length= strlen(plugin->name);
tmp.state= 0;
=== modified file 'sql/sql_plugin.h'
--- a/sql/sql_plugin.h 2009-05-14 12:03:33 +0000
+++ b/sql/sql_plugin.h 2010-02-15 00:10:47 +0000
@@ -63,6 +63,7 @@
LEX_STRING dl;
void *handle;
struct st_mysql_plugin *plugins;
+ struct st_mariaext_plugin *mariaext;
int version;
uint ref_count; /* number of plugins loaded from the library */
};
@@ -74,6 +75,7 @@
LEX_STRING name;
struct st_mysql_plugin *plugin;
struct st_plugin_dl *plugin_dl;
+ struct st_mariaext_plugin *mariaext;
uint state;
uint ref_count; /* number of threads using the plugin */
void *data; /* plugin type specific, e.g. handlerton */
@@ -95,6 +97,7 @@
#define plugin_name(pi) (&((pi)->name))
#define plugin_state(pi) ((pi)->state)
#define plugin_equals(p1,p2) ((p1) == (p2))
+#define plugin_ext(pi) ((pi)->mariaext)
#else
typedef struct st_plugin_int **plugin_ref;
#define plugin_decl(pi) ((pi)[0]->plugin)
@@ -103,6 +106,8 @@
#define plugin_name(pi) (&((pi)[0]->name))
#define plugin_state(pi) ((pi)[0]->state)
#define plugin_equals(p1,p2) ((p1) && (p2) && (p1)[0] == (p2)[0])
+#define plugin_ext(pi) ((pi)[0]->mariaext)
+
#endif
typedef int (*plugin_type_init)(struct st_plugin_int *);
=== modified file 'sql/sql_show.cc'
--- a/sql/sql_show.cc 2009-11-12 04:31:28 +0000
+++ b/sql/sql_show.cc 2010-02-15 00:10:47 +0000
@@ -94,12 +94,21 @@
return my_snprintf(buf, buf_length, "%d.%d", version>>8,version&0xff);
}
+static const LEX_STRING maturity_name[]={
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_UNKNOWN_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_TEST_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_ALPHA_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_BETA_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_GAMMA_STR) },
+ { C_STRING_WITH_LEN(PLUGIN_MATURITY_RELEASE_STR) }};
+
static my_bool show_plugins(THD *thd, plugin_ref plugin,
void *arg)
{
TABLE *table= (TABLE*) arg;
struct st_mysql_plugin *plug= plugin_decl(plugin);
struct st_plugin_dl *plugin_dl= plugin_dlib(plugin);
+ struct st_mariaext_plugin *mariaext= plugin_ext(plugin);
CHARSET_INFO *cs= system_charset_info;
char version_buf[20];
@@ -186,6 +195,26 @@
}
table->field[9]->set_notnull();
+ if ((uint) mariaext->maturity <= PLUGIN_MATURITY_RELEASE)
+ table->field[10]->store(maturity_name[mariaext->maturity].str,
+ maturity_name[mariaext->maturity].length,
+ cs);
+ else
+ {
+ DBUG_ASSERT(0);
+ table->field[10]->store("Unknown", 7, cs);
+ }
+ table->field[10]->set_notnull();
+
+ if (mariaext->version_info)
+ {
+ table->field[11]->store(mariaext->version_info,
+ strlen(mariaext->version_info), cs);
+ table->field[11]->set_notnull();
+ }
+ else
+ table->field[11]->set_null();
+
return schema_table_store_record(thd, table);
}
@@ -6990,6 +7019,8 @@
{"PLUGIN_AUTHOR", NAME_CHAR_LEN, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
{"PLUGIN_DESCRIPTION", 65535, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
{"PLUGIN_LICENSE", 80, MYSQL_TYPE_STRING, 0, 1, "License", SKIP_OPEN_TABLE},
+ {"PLUGIN_MATURITY", 7, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
+ {"PLUGIN_AUTH_VERSION", 80, MYSQL_TYPE_STRING, 0, 1, 0, SKIP_OPEN_TABLE},
{0, 0, MYSQL_TYPE_STRING, 0, 0, 0, SKIP_OPEN_TABLE}
};
=== modified file 'storage/archive/ha_archive.cc'
--- a/storage/archive/ha_archive.cc 2009-09-07 20:50:10 +0000
+++ b/storage/archive/ha_archive.cc 2010-02-15 00:10:47 +0000
@@ -1642,4 +1642,10 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(archive)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/blackhole/ha_blackhole.cc'
--- a/storage/blackhole/ha_blackhole.cc 2008-11-10 20:21:49 +0000
+++ b/storage/blackhole/ha_blackhole.cc 2010-02-15 00:10:47 +0000
@@ -369,3 +369,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(blackhole)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/csv/ha_tina.cc'
--- a/storage/csv/ha_tina.cc 2009-04-25 10:05:32 +0000
+++ b/storage/csv/ha_tina.cc 2010-02-15 00:10:47 +0000
@@ -1636,4 +1636,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
-
+mariaext_declare_plugin(csv)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/example/ha_example.cc'
--- a/storage/example/ha_example.cc 2008-02-24 13:12:17 +0000
+++ b/storage/example/ha_example.cc 2010-02-15 00:10:47 +0000
@@ -906,3 +906,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(example)
+{
+ "0.1", /* string version */
+ PLUGIN_MATURITY_TEST /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/federated/ha_federated.cc'
--- a/storage/federated/ha_federated.cc 2009-09-07 20:50:10 +0000
+++ b/storage/federated/ha_federated.cc 2010-02-15 00:10:47 +0000
@@ -3379,3 +3379,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(federated)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_BETA /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/federatedx/ha_federatedx.cc'
--- a/storage/federatedx/ha_federatedx.cc 2009-11-03 11:08:09 +0000
+++ b/storage/federatedx/ha_federatedx.cc 2010-02-15 00:10:47 +0000
@@ -3485,9 +3485,15 @@
PLUGIN_LICENSE_GPL,
federatedx_db_init, /* Plugin Init */
federatedx_done, /* Plugin Deinit */
- 0x0100 /* 1.0 */,
+ 0x0200 /* 2.0 */,
NULL, /* status variables */
NULL, /* system variables */
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(federated)
+{
+ "2.0", /* string version */
+ PLUGIN_MATURITY_BETA /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/heap/ha_heap.cc'
--- a/storage/heap/ha_heap.cc 2009-09-07 20:50:10 +0000
+++ b/storage/heap/ha_heap.cc 2010-02-15 00:10:47 +0000
@@ -767,3 +767,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(heap)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/ibmdb2i/ha_ibmdb2i.cc'
--- a/storage/ibmdb2i/ha_ibmdb2i.cc 2009-07-08 09:10:01 +0000
+++ b/storage/ibmdb2i/ha_ibmdb2i.cc 2010-02-15 00:10:47 +0000
@@ -3357,3 +3357,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(ibmdb2i)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_UNKNOWN /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/innobase/handler/ha_innodb.cc'
--- a/storage/innobase/handler/ha_innodb.cc 2009-10-16 22:57:48 +0000
+++ b/storage/innobase/handler/ha_innodb.cc 2010-02-15 00:10:47 +0000
@@ -8684,6 +8684,12 @@
NULL /* reserved */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(innobase)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
/** @brief Initialize the default value of innodb_commit_concurrency.
=== modified file 'storage/innodb_plugin/handler/ha_innodb.cc'
--- a/storage/innodb_plugin/handler/ha_innodb.cc 2009-08-04 08:02:48 +0000
+++ b/storage/innodb_plugin/handler/ha_innodb.cc 2010-02-15 00:10:47 +0000
@@ -10032,6 +10032,12 @@
i_s_innodb_cmpmem,
i_s_innodb_cmpmem_reset
mysql_declare_plugin_end;
+mariaext_declare_plugin(innodb_plugin)
+{
+ INNODB_VERSION_STR, /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
/** @brief Initialize the default value of innodb_commit_concurrency.
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-10-26 11:35:42 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-15 00:10:47 +0000
@@ -3346,9 +3346,15 @@
PLUGIN_LICENSE_GPL,
ha_maria_init, /* Plugin Init */
NULL, /* Plugin Deinit */
- 0x0100, /* 1.0 */
+ 0x0105, /* 1.5 */
status_variables, /* status variables */
system_variables, /* system variables */
NULL
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(maria)
+{
+ "1.5", /* string version */
+ PLUGIN_MATURITY_GAMMA /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/myisam/ha_myisam.cc'
--- a/storage/myisam/ha_myisam.cc 2009-10-17 19:12:28 +0000
+++ b/storage/myisam/ha_myisam.cc 2010-02-15 00:10:47 +0000
@@ -2183,6 +2183,12 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(myisam)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
#ifdef HAVE_QUERY_CACHE
=== modified file 'storage/myisammrg/ha_myisammrg.cc'
--- a/storage/myisammrg/ha_myisammrg.cc 2009-10-15 21:38:29 +0000
+++ b/storage/myisammrg/ha_myisammrg.cc 2010-02-15 00:10:47 +0000
@@ -1289,3 +1289,9 @@
NULL /* config options */
}
mysql_declare_plugin_end;
+mariaext_declare_plugin(myisammrg)
+{
+ "1.0", /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+}
+mariaext_declare_plugin_end;
=== modified file 'storage/pbxt/src/ha_pbxt.cc'
--- a/storage/pbxt/src/ha_pbxt.cc 2009-09-03 06:15:03 +0000
+++ b/storage/pbxt/src/ha_pbxt.cc 2010-02-15 00:10:47 +0000
@@ -5507,6 +5507,18 @@
drizzle_declare_plugin_end;
#else
mysql_declare_plugin_end;
+#ifdef MARIADB_BASE_VERSION
+mariaext_declare_plugin(pbxt)
+{ /* PBXT */
+ "1.0.09g RC3", /* string version */
+ PLUGIN_MATURITY_GAMMA /* maturity */
+},
+{ /* PBXT_STATISTICS */
+ "1.0.09g RC3", /* string version */
+ PLUGIN_MATURITY_GAMMA /* maturity */
+}
+mariaext_declare_plugin_end;
+#endif
#endif
#if defined(XT_WIN) && defined(XT_COREDUMP)
=== modified file 'storage/xtradb/handler/ha_innodb.cc'
--- a/storage/xtradb/handler/ha_innodb.cc 2009-10-16 22:57:48 +0000
+++ b/storage/xtradb/handler/ha_innodb.cc 2010-02-15 00:10:47 +0000
@@ -10540,6 +10540,69 @@
i_s_innodb_index_stats,
i_s_innodb_patches
mysql_declare_plugin_end;
+mariaext_declare_plugin(innobase)
+{ /* InnoDB */
+ INNODB_VERSION_STR, /* string version */
+ PLUGIN_MATURITY_RELEASE /* maturity */
+},
+{ /* INNODB_RSEG */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_BUFFER_POOL_PAGES */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_BUFFER_POOL_PAGES_INDEX */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_BUFFER_POOL_PAGES_BLOB */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_TRX */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_LOCKS */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_LOCK_WAITS */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMP */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMP_RESET */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMPMEM */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_CMPMEM_RESET */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_TABLE_STATS */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* INNODB_INDEX_STATS */
+ "1.0",
+ PLUGIN_MATURITY_RELEASE
+},
+{ /* XTRADB_ENHANCEMENTS */
+ INNODB_VERSION_STR,
+ PLUGIN_MATURITY_RELEASE
+}
+mariaext_declare_plugin_end;
+
/** @brief Initialize the default value of innodb_commit_concurrency.
[Maria-developers] Updated (by Igor): Partitioned Key Cache for MyISAM (86)
by worklog-noreply@askmonty.org 13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:17
SUPERVISOR.....: Monty
IMPLEMENTOR....: Igor
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-Sprint
TASK ID........: 86 (http://askmonty.org/worklog/?tid=86)
VERSION........: Server-5.2
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
-=-=(Igor - Sun, 14 Feb 2010, 00:19)=-=-
Privacy level updated.
--- /tmp/wklog.86.old.10092 2010-02-13 22:19:03.000000000 +0000
+++ /tmp/wklog.86.new.10092 2010-02-13 22:19:03.000000000 +0000
@@ -1 +1 @@
-y
+n
-=-=(Igor - Sun, 14 Feb 2010, 00:19)=-=-
Category updated.
--- /tmp/wklog.86.old.10092 2010-02-13 22:19:03.000000000 +0000
+++ /tmp/wklog.86.new.10092 2010-02-13 22:19:03.000000000 +0000
@@ -1 +1 @@
-Server-BackLog
+Server-Sprint
-=-=(Igor - Sun, 14 Feb 2010, 00:18)=-=-
Version updated.
--- /tmp/wklog.86.old.10044 2010-02-14 00:18:31.000000000 +0200
+++ /tmp/wklog.86.new.10044 2010-02-14 00:18:31.000000000 +0200
@@ -1 +1 @@
-Benchmarks-3.0
+Server-5.2
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for a given page. The function that maps pages into partitions takes care
of even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it even
when they have acquired shared locks on the file and the pages they want to
read are already in the key cache buffers. When working with a partitioned key
cache, any key cache interface function that needs only one page has to acquire
the key cache lock only for the partition the page belongs to. This reduces the
chance that threads compete for the same key cache lock.
The idea and the original implementation of the partitioned key cache were
provided by one of our external contributors (see the attached file
segmented_keycache_v2.diff with the original patch from the contributor).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] Updated (by Igor): Partitioned Key Cache for MyISAM (86)
by worklog-noreply@askmonty.org 13 Feb '10
by worklog-noreply@askmonty.org 13 Feb '10
13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:17
SUPERVISOR.....: Monty
IMPLEMENTOR....: Igor
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-Sprint
TASK ID........: 86 (http://askmonty.org/worklog/?tid=86)
VERSION........: Server-5.2
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
-=-=(Igor - Sun, 14 Feb 2010, 00:19)=-=-
Privacy level updated.
--- /tmp/wklog.86.old.10092 2010-02-13 22:19:03.000000000 +0000
+++ /tmp/wklog.86.new.10092 2010-02-13 22:19:03.000000000 +0000
@@ -1 +1 @@
-y
+n
-=-=(Igor - Sun, 14 Feb 2010, 00:19)=-=-
Category updated.
--- /tmp/wklog.86.old.10092 2010-02-13 22:19:03.000000000 +0000
+++ /tmp/wklog.86.new.10092 2010-02-13 22:19:03.000000000 +0000
@@ -1 +1 @@
-Server-BackLog
+Server-Sprint
-=-=(Igor - Sun, 14 Feb 2010, 00:18)=-=-
Version updated.
--- /tmp/wklog.86.old.10044 2010-02-14 00:18:31.000000000 +0200
+++ /tmp/wklog.86.new.10044 2010-02-14 00:18:31.000000000 +0200
@@ -1 +1 @@
-Benchmarks-3.0
+Server-5.2
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for the page. The function that maps pages into partitions takes care of
even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it
even when they have acquired shared locks for the file and the pages they want
to read are already in the key cache buffers. With a partitioned key cache,
any key cache interface function that needs only one page has to acquire the
lock only for the partition the page is ascribed to, which reduces the chance
that threads compete for the same key cache lock.
The idea and the original patch for the partitioned key cache were provided by
one of our external contributors (see the attached file
segmented_keycache_v2.diff with the original patch from the contributor).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] Updated (by Igor): Partitioned Key Cache for MyISAM (86)
by worklog-noreply@askmonty.org 13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:17
SUPERVISOR.....: Monty
IMPLEMENTOR....: Igor
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-BackLog
TASK ID........: 86 (http://askmonty.org/worklog/?tid=86)
VERSION........: Server-5.2
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
-=-=(Igor - Sun, 14 Feb 2010, 00:18)=-=-
Version updated.
--- /tmp/wklog.86.old.10044 2010-02-14 00:18:31.000000000 +0200
+++ /tmp/wklog.86.new.10044 2010-02-14 00:18:31.000000000 +0200
@@ -1 +1 @@
-Benchmarks-3.0
+Server-5.2
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for the page. The function that maps pages into partitions takes care of
even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it
even when they have acquired shared locks for the file and the pages they want
to read are already in the key cache buffers. With a partitioned key cache,
any key cache interface function that needs only one page has to acquire the
lock only for the partition the page is ascribed to, which reduces the chance
that threads compete for the same key cache lock.
The idea and the original patch for the partitioned key cache were provided by
one of our external contributors (see the attached file
segmented_keycache_v2.diff with the original patch from the contributor).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] New (by Igor): Partitioned Key Cache for MyISAM (86)
by worklog-noreply@askmonty.org 13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:17
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-BackLog
TASK ID........: 86 (http://askmonty.org/worklog/?tid=86)
VERSION........: Benchmarks-3.0
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for the page. The function that maps pages into partitions takes care of
even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it
even when they have acquired shared locks for the file and the pages they want
to read are already in the key cache buffers. With a partitioned key cache,
any key cache interface function that needs only one page has to acquire the
lock only for the partition the page is ascribed to, which reduces the chance
that threads compete for the same key cache lock.
The idea and the original patch for the partitioned key cache were provided by
one of our external contributors (see the attached file
segmented_keycache_v2.diff with the original patch from the contributor).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] Updated (by Igor): Partitioned Key Cache for MyISAM (85)
by worklog-noreply@askmonty.org 13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:10
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-Sprint
TASK ID........: 85 (http://askmonty.org/worklog/?tid=85)
VERSION........: Server-5.2
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
-=-=(Igor - Sun, 14 Feb 2010, 00:15)=-=-
Category updated.
--- /tmp/wklog.85.old.9810 2010-02-13 22:15:43.000000000 +0000
+++ /tmp/wklog.85.new.9810 2010-02-13 22:15:43.000000000 +0000
@@ -1 +1 @@
-Server-BackLog
+Server-Sprint
-=-=(Igor - Sun, 14 Feb 2010, 00:15)=-=-
Version updated.
--- /tmp/wklog.85.old.9810 2010-02-13 22:15:43.000000000 +0000
+++ /tmp/wklog.85.new.9810 2010-02-13 22:15:43.000000000 +0000
@@ -1 +1 @@
-Benchmarks-3.0
+Server-5.2
-=-=(Igor - Sun, 14 Feb 2010, 00:12)=-=-
New attachment: 'segmented_keycache_v2.diff'
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for the page. The function that maps pages into partitions takes care of
even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it
even when they have acquired shared locks for the file and the pages they want
to read are already in the key cache buffers. With a partitioned key cache,
any key cache interface function that needs only one page has to acquire the
lock only for the partition the page is ascribed to, which reduces the chance
that threads compete for the same key cache lock.
The idea and the original patch for the partitioned key cache were provided by
one of our external contributors.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] Updated (by Igor): Partitioned Key Cache for MyISAM (85)
by worklog-noreply@askmonty.org 13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:10
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-BackLog
TASK ID........: 85 (http://askmonty.org/worklog/?tid=85)
VERSION........: Benchmarks-3.0
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
-=-=(Igor - Sun, 14 Feb 2010, 00:12)=-=-
New attachment: 'segmented_keycache_v2.diff'
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for the page. The function that maps pages into partitions takes care of
even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it
even when they have acquired shared locks for the file and the pages they want
to read are already in the key cache buffers. With a partitioned key cache,
any key cache interface function that needs only one page has to acquire the
lock only for the partition the page is ascribed to, which reduces the chance
that threads compete for the same key cache lock.
The idea and the original patch for the partitioned key cache were provided by
one of our external contributors.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] New (by Igor): Partitioned Key Cache for MyISAM (85)
by worklog-noreply@askmonty.org 13 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Partitioned Key Cache for MyISAM
CREATION DATE..: Sun, 14 Feb 2010, 00:10
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Igor, Monty, Sergei
CATEGORY.......: Server-BackLog
TASK ID........: 85 (http://askmonty.org/worklog/?tid=85)
VERSION........: Benchmarks-3.0
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 80 (hours remain)
ORIG. ESTIMATE.: 80
PROGRESS NOTES:
DESCRIPTION:
A partitioned key cache is a collection of structures for regular MyISAM key
caches, called key cache partitions. Any page from a file can be placed into a
buffer of only one partition. The number of the partition is calculated from
the file number and the position of the page in the file, and it is always the
same for the page. The function that maps pages into partitions takes care of
even distribution of pages among partitions.
A partitioned key cache mitigates one of the major problems of the simple key
cache: thread contention for the key cache lock (mutex). Every call of a key
cache interface function must acquire this lock, so threads compete for it
even when they have acquired shared locks for the file and the pages they want
to read are already in the key cache buffers. With a partitioned key cache,
any key cache interface function that needs only one page has to acquire the
lock only for the partition the page is ascribed to, which reduces the chance
that threads compete for the same key cache lock.
The idea and the original patch for the partitioned key cache were provided by
one of our external contributors.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] Rev 2757: Fix for previous cset in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 12 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2757
revision-id: psergey(a)askmonty.org-20100212181041-5rwekm1wpvwaikkx
parent: psergey(a)askmonty.org-20100211235958-p11o4e80dlrn2bsq
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 21:10:41 +0300
message:
Fix for previous cset
=== modified file 'sql/item_subselect.cc'
--- a/sql/item_subselect.cc 2010-02-11 23:59:58 +0000
+++ b/sql/item_subselect.cc 2010-02-12 18:10:41 +0000
@@ -1316,6 +1316,7 @@
(char *)in_left_expr_name);
master_unit->uncacheable|= UNCACHEABLE_DEPENDENT;
+ select_lex->uncacheable|= UNCACHEABLE_DEPENDENT;
}
if (!abort_on_null && left_expr->maybe_null && !pushed_cond_guards)
1
0

[Maria-developers] Rev 3770: MWL#68 Subquery optimization: Efficient NOT IN execution with NULLs in file:///home/tsk/mprog/src/mysql-6.0-mwl68/
by timour@askmonty.org 12 Feb '10
At file:///home/tsk/mprog/src/mysql-6.0-mwl68/
------------------------------------------------------------
revno: 3770
revision-id: timour(a)askmonty.org-20100212143343-l0pjascssuqedfk6
parent: timour(a)askmonty.org-20100201120948-mdt7gtwcz50q1dzp
committer: timour(a)askmonty.org
branch nick: mysql-6.0-mwl68
timestamp: Fri 2010-02-12 16:33:43 +0200
message:
MWL#68 Subquery optimization: Efficient NOT IN execution with NULLs
This patch implements working partial matching for materialized subqueries.
The code passes the full regression test, except differences in EXPLAIN.
There are no other known test failures.
Diff too large for email (1593 lines, the limit is 1000).
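As background on what partial matching must compute (a minimal C++ sketch
under assumed names and types, not the patch itself): with NULLs, IN follows
SQL three-valued logic, so a row that fails an exact match may still make the
result UNKNOWN rather than FALSE when a NULL could have matched. A direct,
unoptimized formulation:

#include <optional>
#include <vector>

using Row= std::vector<std::optional<int>>;  // std::nullopt models SQL NULL

// Evaluate "outer IN rows" with three-valued logic: true, false, or
// unknown (returned as an empty std::optional).
static std::optional<bool> in_with_nulls(const Row &outer,
                                         const std::vector<Row> &rows)
{
  bool possible_match= false;
  for (const Row &inner : rows)
  {
    bool exact= true, possible= true;
    for (size_t i= 0; i < outer.size(); i++)
    {
      if (!outer[i] || !inner[i])
        exact= false;               // NULL on either side: comparison UNKNOWN
      else if (*outer[i] != *inner[i])
      {
        exact= possible= false;     // definite mismatch on a non-NULL column
        break;
      }
    }
    if (exact)
      return true;                  // a full match makes IN TRUE
    possible_match|= possible;      // a partial match can only yield UNKNOWN
  }
  return possible_match ? std::optional<bool>() : std::optional<bool>(false);
}

The rowid-merge engine in this work gets the same answer without rescanning
the materialized table for every outer row: it keeps one ordered key per
nullable column and merges row-number sets, which is what the Ordered_key
machinery in the following commit's diff builds up.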
1
0

[Maria-developers] bzr commit into file:///home/tsk/mprog/src/mysql-6.0-mwl68/ branch (timour:3770)
by timour@askmonty.org 12 Feb '10
#At file:///home/tsk/mprog/src/mysql-6.0-mwl68/ based on revid:timour@askmonty.org-20100201120948-mdt7gtwcz50q1dzp
3770 timour(a)askmonty.org 2010-02-12
MWL#68 Subquery optimization: Efficient NOT IN execution with NULLs
This patch implements working partial matching for materialized subqueries.
The code passes the full regression test, except differences in EXPLAIN.
There are no other known test failures.
modified:
sql/item_subselect.cc
sql/item_subselect.h
sql/sql_class.cc
sql/sql_class.h
sql/sql_select.cc
=== modified file 'sql/item_subselect.cc'
--- a/sql/item_subselect.cc 2010-02-01 12:09:48 +0000
+++ b/sql/item_subselect.cc 2010-02-12 14:33:43 +0000
@@ -2436,6 +2436,17 @@ int subselect_uniquesubquery_engine::sca
for (;;)
{
error=table->file->rnd_next(table->record[0]);
+ /*
+ TODO: The below tests are wrong, Monty's proposal:
+ if (error) {
+ if (error == HA_ERR_RECORD_DELETED)
+ continue;
+ if (error == HA_ERR_END_OF_FILE)
+ break;
+ else
+ report error;
+ break;
+ */
if (error && error != HA_ERR_END_OF_FILE)
{
error= report_error(table, error);
@@ -2453,6 +2464,11 @@ int subselect_uniquesubquery_engine::sca
}
table->file->ha_rnd_end();
+ /*
+ TODO: it seems to be an error to return TRUE when the error was
+ HA_ERR_END_OF_FILE which is perfectly fine. HA_ERR_END_OF_FILE
+ only means we didn't find a match.
+ */
DBUG_RETURN(error != 0);
}
@@ -2517,6 +2533,10 @@ bool subselect_uniquesubquery_engine::co
See also the comment for the subselect_uniquesubquery_engine::exec()
function.
*/
+ /*
+ TODO: If not all outer cols are NULL, how we know the result is NULL,
+ and not FALSE? Even on top-level.
+ */
null_keypart= (*copy)->null_key;
if (null_keypart)
{
@@ -2556,6 +2576,59 @@ bool subselect_uniquesubquery_engine::co
/*
+ @retval 1 A NULL was found in the outer reference, index lookup is
+ not applicable, the outer ref is unusable as a lookup key,
+ use some other method to find a match.
+ @retval 0 The outer ref was copied into an index lookup key.
+ @retval -1 The outer ref cannot possibly match any row, IN is FALSE.
+*/
+
+int subselect_uniquesubquery_engine::copy_ref_key_simple()
+{
+ for (store_key **copy= tab->ref.key_copy ; *copy ; copy++)
+ {
+ enum store_key::store_key_result store_res;
+ store_res= (*copy)->copy();
+ tab->ref.key_err= store_res;
+
+ /*
+ When there is a NULL part in the key we don't need to make index
+ lookup for such key thus we don't need to copy whole key.
+ If we later should do a sequential scan return OK. Fail otherwise.
+
+ See also the comment for the subselect_uniquesubquery_engine::exec()
+ function.
+ */
+ /*
+ TODO: If not all outer cols are NULL, how we know the result is NULL,
+ and not FALSE? Even on top-level.
+ */
+ null_keypart= (*copy)->null_key;
+ if (null_keypart)
+ return 1;
+
+ /*
+ Check if the error is equal to STORE_KEY_FATAL. This is not expressed
+ using the store_key::store_key_result enum because ref.key_err is a
+ boolean and we want to detect both TRUE and STORE_KEY_FATAL from the
+ space of the union of the values of [TRUE, FALSE] and
+ store_key::store_key_result.
+ TODO: fix the variable an return types.
+ */
+ if (store_res == store_key::STORE_KEY_FATAL)
+ {
+ /*
+ Error converting the left IN operand to the column type of the right
+ IN operand.
+ */
+ return -1;
+ }
+ }
+ return 0;
+}
+
+
+/*
Execute subselect
SYNOPSIS
@@ -2595,7 +2668,10 @@ int subselect_uniquesubquery_engine::exe
/* TODO: change to use of 'full_scan' here? */
if (copy_ref_key())
+ {
+ /* TODO: copy_ref_key() == 1 means NULL result, not error, why return 1? */
DBUG_RETURN(1);
+ }
if (table->status)
{
/*
@@ -2637,6 +2713,52 @@ int subselect_uniquesubquery_engine::exe
/*
+ TODO: this needs more thinking, as exec() is a bit wrong IMO.
+ - we don't need empty_result_set, as it is == 1 <=> when
+ item->value == 0
+ - scan_table() returns >0 even when there was no actual error,
+ but we only found EOF while scanning.
+ - scan_table should not check table->status, but it should check
+ HA_ERR_END_OF_FILE
+*/
+
+int subselect_uniquesubquery_engine::index_lookup()
+{
+ DBUG_ENTER("subselect_uniquesubquery_engine::index_lookup");
+ int error;
+ TABLE *table= tab->table;
+ empty_result_set= TRUE;
+ table->status= 0;
+
+ if (!table->file->inited)
+ table->file->ha_index_init(tab->ref.key, 0);
+ error= table->file->index_read_map(table->record[0],
+ tab->ref.key_buff,
+ make_prev_keypart_map(tab->ref.key_parts),
+ HA_READ_KEY_EXACT);
+ DBUG_PRINT("info", ("lookup result: %i", error));
+ if (error &&
+ error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
+ error= report_error(table, error);
+ else
+ {
+ error= 0;
+ table->null_row= 0;
+ if (!table->status && (!cond || cond->val_int()))
+ {
+ ((Item_in_subselect *) item)->value= 1;
+ empty_result_set= FALSE;
+ }
+ else
+ ((Item_in_subselect *) item)->value= 0;
+ }
+
+ DBUG_RETURN(error);
+}
+
+
+
+/*
Index-lookup subselect 'engine' - run the subquery
SYNOPSIS
@@ -3136,6 +3258,7 @@ void subselect_hash_sj_engine::set_strat
Item_in_subselect *item_in= (Item_in_subselect *) item;
select_materialize_with_stats *result_sink=
(select_materialize_with_stats *) result;
+ Item *outer_col;
DBUG_ENTER("subselect_hash_sj_engine::set_strategy_using_data");
@@ -3146,13 +3269,20 @@ void subselect_hash_sj_engine::set_strat
{
if (!bitmap_is_set(&partial_match_key_parts, i))
continue;
-
- if (result_sink->get_null_count_of_col(i) == 0)
+ outer_col= item_in->left_expr->element_index(i);
+ /*
+ If column 'i' doesn't contain NULLs, and the corresponding outer reference
+ cannot have a NULL value, then 'i' is a non-nullable column.
+ */
+ if (result_sink->get_null_count_of_col(i) == 0 && !outer_col->maybe_null)
{
bitmap_clear_bit(&partial_match_key_parts, i);
bitmap_set_bit(&non_null_key_parts, i);
--count_partial_match_columns;
}
+ if (result_sink->get_null_count_of_col(i) ==
+ tmp_table->file->stats.records)
+ ++count_null_only_columns;
}
/* If no column contains NULLs use regular hash index lookups. */
@@ -3177,6 +3307,7 @@ bitmap_init_memroot(MY_BITMAP *map, uint
bitmap_buffer_size(n_bits))) ||
bitmap_init(map, bitmap_buf, n_bits, FALSE))
return TRUE;
+ bitmap_clear_all(map);
return FALSE;
}
@@ -3209,10 +3340,10 @@ bool subselect_hash_sj_engine::init_perm
DBUG_ENTER("subselect_hash_sj_engine::init_permanent");
- if (!(bitmap_init_memroot(&non_null_key_parts, tmp_columns->elements,
- thd->mem_root)) ||
- !(bitmap_init_memroot(&partial_match_key_parts, tmp_columns->elements,
- thd->mem_root)))
+ if (bitmap_init_memroot(&non_null_key_parts, tmp_columns->elements,
+ thd->mem_root) ||
+ bitmap_init_memroot(&partial_match_key_parts, tmp_columns->elements,
+ thd->mem_root))
DBUG_RETURN(TRUE);
set_strategy_using_schema();
@@ -3548,33 +3679,45 @@ int subselect_hash_sj_engine::exec()
if (strategy == PARTIAL_MATCH)
{
- subselect_rowid_merge_engine *new_lookup_engine;
+ subselect_rowid_merge_engine *rowid_merge_engine;
uint count_pm_keys;
MY_BITMAP *nn_key_parts;
+ bool has_covering_null_row;
+ select_materialize_with_stats *result_sink=
+ (select_materialize_with_stats *) result;
+
/* Total number of keys needed for partial matching. */
- if (count_partial_match_columns < tmp_table->s->fields)
- {
- count_pm_keys= count_partial_match_columns + 1;
- nn_key_parts= &non_null_key_parts;
- }
+ nn_key_parts= (count_partial_match_columns < tmp_table->s->fields) ?
+ &non_null_key_parts : NULL;
+
+ has_covering_null_row= (result_sink->get_max_nulls_in_row() ==
+ tmp_table->s->fields -
+ (nn_key_parts ? bitmap_bits_set(nn_key_parts) : 0));
+
+ if (has_covering_null_row)
+ count_pm_keys= nn_key_parts ? 1 : 0;
else
- {
- count_pm_keys= count_partial_match_columns;
- nn_key_parts= NULL;
- }
+ count_pm_keys= count_partial_match_columns - count_null_only_columns +
+ (nn_key_parts ? 1 : 0);
- if (!(new_lookup_engine=
- new subselect_rowid_merge_engine(lookup_engine,
+ if (!(rowid_merge_engine=
+ new subselect_rowid_merge_engine((subselect_uniquesubquery_engine*)
+ lookup_engine,
tmp_table,
count_pm_keys,
+ has_covering_null_row,
item, result)) ||
- new_lookup_engine->init(nn_key_parts, &partial_match_key_parts))
+ rowid_merge_engine->init(nn_key_parts, &partial_match_key_parts))
{
- delete new_lookup_engine;
strategy= PARTIAL_MATCH_SCAN;
+ delete rowid_merge_engine;
/* TODO: setup execution structures for partial match via scanning. */
}
- strategy= PARTIAL_MATCH_INDEX;
+ else
+ {
+ strategy= PARTIAL_MATCH_INDEX;
+ lookup_engine= rowid_merge_engine;
+ }
}
item_in->change_engine(lookup_engine);
@@ -3632,15 +3775,49 @@ Ordered_key::Ordered_key(uint key_idx_ar
ha_rows min_null_row_arg, ha_rows max_null_row_arg,
uchar *row_num_to_rowid_arg)
: key_idx(key_idx_arg), tbl(tbl_arg), search_key(search_key_arg),
- row_num_to_rowid(row_num_to_rowid_arg), null_count(null_count_arg),
- min_null_row(min_null_row_arg), max_null_row(max_null_row_arg)
+ row_num_to_rowid(row_num_to_rowid_arg), null_count(null_count_arg)
+{
+ DBUG_ASSERT(tbl->file->stats.records > null_count);
+ key_buff_elements= tbl->file->stats.records - null_count;
+ cur_key_idx= HA_POS_ERROR;
+
+ DBUG_ASSERT((null_count && min_null_row_arg && max_null_row_arg) ||
+ (!null_count && !min_null_row_arg && !max_null_row_arg));
+ if (null_count)
+ {
+ /* The counters are 1-based, for key access we need 0-based indexes. */
+ min_null_row= min_null_row_arg - 1;
+ max_null_row= max_null_row_arg - 1;
+ }
+ else
+ min_null_row= max_null_row= 0;
+}
+
+
+Ordered_key::~Ordered_key()
{
- key_column_count= search_key->cols();
- cur_row= HA_POS_ERROR;
+ /*
+ All data structures are allocated on thd->mem_root, thus we don't
+ free them here.
+ */
}
/*
+ Cleanup that needs to be done for each PS (re)execution.
+*/
+
+void Ordered_key::cleanup()
+{
+ /*
+ Currently these keys are recreated for each PS re-execution, thus
+ there is nothing to cleanup, the whole object goes away after execution
+ is over. All handler related initialization/deinitialization is done by
+ the parent subselect_rowid_merge_engine object.
+ */
+}
+
+/*
Initialize a multi-column index.
*/
@@ -3648,10 +3825,10 @@ bool Ordered_key::init(MY_BITMAP *column
{
THD *thd= tbl->in_use;
uint cur_key_col= 0;
+ Item_field *cur_tmp_field;
+ Item_func_lt *fn_less_than;
- DBUG_ENTER("Ordered_key::init");
-
- DBUG_ASSERT(key_column_count == bitmap_bits_set(columns_to_index));
+ key_column_count= bitmap_bits_set(columns_to_index);
// TODO: check for mem allocation err, revert to scan
@@ -3660,22 +3837,26 @@ bool Ordered_key::init(MY_BITMAP *column
compare_pred= (Item_func_lt**) thd->alloc(key_column_count *
sizeof(Item_func_lt*));
- for (uint i= 0; i < columns_to_index->n_bits; i++, cur_key_col++)
+ for (uint i= 0; i < columns_to_index->n_bits; i++)
{
if (!bitmap_is_set(columns_to_index, i))
continue;
- key_columns[cur_key_col]= new Item_field(tbl->field[i]);
+ cur_tmp_field= new Item_field(tbl->field[i]);
/* Create the predicate (tmp_column[i] < outer_ref[i]). */
- compare_pred[cur_key_col]= new Item_func_lt(key_columns[cur_key_col],
- search_key->element_index(i));
+ fn_less_than= new Item_func_lt(cur_tmp_field,
+ search_key->element_index(i));
+ fn_less_than->fix_fields(thd, (Item**) &fn_less_than);
+ key_columns[cur_key_col]= cur_tmp_field;
+ compare_pred[cur_key_col]= fn_less_than;
+ ++cur_key_col;
}
if (alloc_keys_buffers())
{
/* TODO revert to partial match via table scan. */
- DBUG_RETURN(TRUE);
+ return TRUE;
}
- DBUG_RETURN(FALSE);
+ return FALSE;
}
@@ -3687,9 +3868,7 @@ bool Ordered_key::init(int col_idx)
{
THD *thd= tbl->in_use;
- DBUG_ENTER("Ordered_key::init");
-
- DBUG_ASSERT(key_column_count == 1);
+ key_column_count= 1;
// TODO: check for mem allocation err, revert to scan
@@ -3700,23 +3879,25 @@ bool Ordered_key::init(int col_idx)
/* Create the predicate (tmp_column[i] < outer_ref[i]). */
compare_pred[0]= new Item_func_lt(key_columns[0],
search_key->element_index(col_idx));
+ compare_pred[0]->fix_fields(thd, (Item**)&compare_pred[0]);
if (alloc_keys_buffers())
{
/* TODO revert to partial match via table scan. */
- DBUG_RETURN(TRUE);
+ return TRUE;
}
- DBUG_RETURN(FALSE);
+ return FALSE;
}
bool Ordered_key::alloc_keys_buffers()
{
THD *thd= tbl->in_use;
- ha_rows row_count= tbl->file->stats.records;
- if (!(row_index= (ha_rows*) thd->alloc((row_count - null_count) *
- sizeof(ha_rows))))
+ DBUG_ASSERT(key_buff_elements > 0);
+
+ if (!(key_buff= (rownum_t*) thd->alloc(key_buff_elements *
+ sizeof(rownum_t))))
return TRUE;
/*
@@ -3724,10 +3905,14 @@ bool Ordered_key::alloc_keys_buffers()
(max_null_row - min_null_row), and then use min_null_row as
lookup offset.
*/
- if (!(bitmap_init_memroot(&null_key, max_null_row,
- thd->mem_root)))
+ if (bitmap_init_memroot(&null_key,
+ /* this is max array index, we need count, so +1. */
+ max_null_row + 1,
+ thd->mem_root))
return TRUE;
+ cur_key_idx= HA_POS_ERROR;
+
return FALSE;
}
@@ -3735,66 +3920,88 @@ bool Ordered_key::alloc_keys_buffers()
/*
Quick sort comparison function that compares two rows of the same table
identified with their row numbers.
+
+ @retval -1
+ @retval 0
+ @retval +1
*/
-int Ordered_key::cmp_rows_by_rownum(Ordered_key *key, ha_rows *a, ha_rows *b)
+int
+Ordered_key::cmp_keys_by_row_data(ha_rows a, ha_rows b)
{
uchar *rowid_a, *rowid_b;
int error, cmp_res;
- TABLE *tbl= key->tbl;
/* The length in bytes of the rowids (positions) of tmp_table. */
uint rowid_length= tbl->file->ref_length;
- DBUG_ENTER("Ordered_key::cmp_rows_by_rownum");
if (a == b)
- DBUG_RETURN(0);
+ return 0;
/* Get the corresponding rowids. */
- rowid_a= key->row_num_to_rowid + (*a) * rowid_length;
- rowid_b= key->row_num_to_rowid + (*b) * rowid_length;
+ rowid_a= row_num_to_rowid + a * rowid_length;
+ rowid_b= row_num_to_rowid + b * rowid_length;
/* Fetch the rows for comparison. */
error= tbl->file->rnd_pos(tbl->record[0], rowid_a);
DBUG_ASSERT(!error);
error= tbl->file->rnd_pos(tbl->record[1], rowid_b);
DBUG_ASSERT(!error);
- /* Compare the two rows. */
- for (Field **f_ptr= tbl->field; *f_ptr; f_ptr++)
+ /*
+ Compare the two rows by the corresponding values of the indexed
+ columns.
+ */
+ for (uint i= 0; i < key_column_count; i++)
{
- if ((cmp_res= (*f_ptr)->cmp_offset(tbl->s->rec_buff_length)))
- DBUG_RETURN(cmp_res);
+ Field *cur_field= key_columns[i]->field;
+ if ((cmp_res= cur_field->cmp_offset(tbl->s->rec_buff_length)))
+ return (cmp_res > 0 ? 1 : -1);
}
- DBUG_RETURN(0);
+ return 0;
+}
+
+
+int
+Ordered_key::cmp_keys_by_row_data_and_rownum(Ordered_key *key,
+ rownum_t* a, rownum_t* b)
+{
+ /* The result of comparing the two keys according to their row data. */
+ int cmp_row_res= key->cmp_keys_by_row_data(*a, *b);
+ if (cmp_row_res)
+ return cmp_row_res;
+ return (*a < *b) ? -1 : (*a > *b) ? 1 : 0;
}
void Ordered_key::sort_keys()
{
- my_qsort(row_index, tbl->file->stats.records, sizeof(ha_rows),
- (qsort_cmp) &cmp_rows_by_rownum);
+ my_qsort2(key_buff, key_buff_elements, sizeof(rownum_t),
+ (qsort2_cmp) &cmp_keys_by_row_data_and_rownum, (void*) this);
+ /* Invalidate the current row position. */
+ cur_key_idx= HA_POS_ERROR;
}
/*
Compare the value(s) of the current key in 'search_key' with the
- data of the current table record accessible via 'key_columns'.
+ data of the current table record.
@notes The comparison result follows from the way compare_pred
is created in Ordered_key::init. Currently compare_pred compares
a field in of the current row with the corresponding Item that
contains the search key.
+ @param row_num Number of the row (not index in the key_buff array)
+
@retval -1 if (current row < search_key)
@retval 0 if (current row == search_key)
@retval +1 if (current row > search_key)
*/
-int Ordered_key::compare_row_with_key(ha_rows row_num)
+int Ordered_key::cmp_key_with_search_key(rownum_t row_num)
{
/* The length in bytes of the rowids (positions) of tmp_table. */
uint rowid_length= tbl->file->ref_length;
uchar *cur_rowid= row_num_to_rowid + row_num * rowid_length;
int error, cmp_res;
- DBUG_ENTER("Ordered_key::compare");
error= tbl->file->rnd_pos(tbl->record[0], cur_rowid);
DBUG_ASSERT(!error);
@@ -3804,9 +4011,9 @@ int Ordered_key::compare_row_with_key(ha
/* Unlike Arg_comparator::compare_row() here there should be no NULLs. */
DBUG_ASSERT(!compare_pred[i]->null_value);
if (cmp_res)
- DBUG_RETURN(cmp_res);
+ return (cmp_res > 0 ? 1 : -1);
}
- DBUG_RETURN(0);
+ return 0;
}
@@ -3818,17 +4025,24 @@ int Ordered_key::compare_row_with_key(ha
bool Ordered_key::lookup()
{
- DBUG_ENTER("Ordered_key::lookup");
+ DBUG_ASSERT(key_buff_elements);
ha_rows lo= 0;
- ha_rows hi= tbl->file->stats.records - 1;
+ ha_rows hi= key_buff_elements - 1;
ha_rows mid;
int cmp_res;
while (lo <= hi)
{
mid= lo + (hi - lo) / 2;
- cmp_res= compare_row_with_key(mid);
+ cmp_res= cmp_key_with_search_key(key_buff[mid]);
+ /*
+ In order to find the minimum match, check if the pevious element is
+ equal or smaller than the found one. If equal, we need to search further
+ to the left.
+ */
+ if (!cmp_res && mid > 0)
+ cmp_res= !cmp_key_with_search_key(key_buff[mid - 1]) ? 1 : 0;
if (cmp_res == -1)
{
@@ -3838,17 +4052,48 @@ bool Ordered_key::lookup()
else if (cmp_res == 1)
{
/* row[mid] > search_key */
+ if (!mid)
+ goto not_found;
hi= mid - 1;
}
else
{
/* row[mid] == search_key */
- cur_row= mid;
- DBUG_RETURN(TRUE);
+ cur_key_idx= mid;
+ return TRUE;
}
}
+not_found:
+ cur_key_idx= HA_POS_ERROR;
+ return FALSE;
+}
- DBUG_RETURN(FALSE);
+
+/*
+ Move the current index pointer to the next key with the same column
+ values as the current key. Since the index is sorted, all such keys
+ are contiguous.
+*/
+
+bool Ordered_key::next_same()
+{
+ DBUG_ASSERT(key_buff_elements);
+
+ if (cur_key_idx < key_buff_elements - 1)
+ {
+ /*
+ TODO:
+ The below is quite inefficient, since as a result we will fetch every
+ row (except the last one) twice. There must be a more efficient way,
+ e.g. swapping record[0] and record[1], and reading only the new record.
+ */
+ if (!cmp_keys_by_row_data(key_buff[cur_key_idx], key_buff[cur_key_idx + 1]))
+ {
+ ++cur_key_idx;
+ return TRUE;
+ }
+ }
+ return FALSE;
}
@@ -3865,56 +4110,147 @@ subselect_rowid_merge_engine::init(MY_BI
/* The length in bytes of the rowids (positions) of tmp_table. */
uint rowid_length= tmp_table->file->ref_length;
ha_rows row_count= tmp_table->file->stats.records;
+ rownum_t cur_rownum= 0;
select_materialize_with_stats *result_sink=
(select_materialize_with_stats *) result;
uint cur_key= 0;
+ Item_in_subselect *item_in= (Item_in_subselect*) item;
+ int error;
- if (!(row_num_to_rowid= (uchar*) thd->alloc(row_count * rowid_length *
- sizeof(uchar))))
- return TRUE;
+ if (keys_count == 0)
+ {
+ /* There is nothing to initialize, we will only do regular lookups. */
+ return FALSE;
+ }
- if (!(bitmap_init_memroot(&matching_keys, keys_count, thd->mem_root)))
+ DBUG_ASSERT(!has_covering_null_row || (has_covering_null_row &&
+ keys_count == 1 &&
+ non_null_key_parts));
+
+ if (!(merge_keys= (Ordered_key**) thd->alloc(keys_count *
+ sizeof(Ordered_key*))) ||
+ !(row_num_to_rowid= (uchar*) thd->alloc(row_count * rowid_length *
+ sizeof(uchar))))
return TRUE;
- merge_keys= (Ordered_key**) thd->alloc(keys_count * sizeof(Ordered_key*));
/* Create the only non-NULL key if there is any. */
if (non_null_key_parts)
{
- non_null_key= new Ordered_key(cur_key, tmp_table, item, 0, 0, 0,
- row_num_to_rowid);
+ non_null_key= new Ordered_key(cur_key, tmp_table, item_in->left_expr,
+ 0, 0, 0, row_num_to_rowid);
if (non_null_key->init(non_null_key_parts))
{
// TODO: revert to partial matching via scanning
return TRUE;
}
merge_keys[cur_key]= non_null_key;
- non_null_key->sort_keys();
+ merge_keys[cur_key]->first();
++cur_key;
}
+
/*
- Create one single-column NULL-key for each column in
- partial_match_key_parts.
+ If there is a covering NULL row, the only key that is needed is the
+ only non-NULL key that is already created above.
*/
- for (uint i= 0; i < partial_match_key_parts->n_bits; i++, cur_key++)
+ if (!has_covering_null_row)
+ {
+ if (bitmap_init_memroot(&matching_keys, keys_count, thd->mem_root) ||
+ bitmap_init_memroot(&matching_outer_cols, keys_count, thd->mem_root) ||
+ bitmap_init_memroot(&null_only_columns, keys_count, thd->mem_root))
+ return TRUE;
+
+ /*
+ Create one single-column NULL-key for each column in
+ partial_match_key_parts.
+ */
+ for (uint i= 0; i < partial_match_key_parts->n_bits; i++)
+ {
+ if (!bitmap_is_set(partial_match_key_parts, i))
+ continue;
+
+ if (result_sink->get_null_count_of_col(i) == row_count)
+ bitmap_set_bit(&null_only_columns, cur_key);
+ else
+ {
+ merge_keys[cur_key]= new Ordered_key(cur_key, tmp_table,
+ item_in->left_expr->element_index(i),
+ result_sink->get_null_count_of_col(i),
+ result_sink->get_min_null_of_col(i),
+ result_sink->get_max_null_of_col(i),
+ row_num_to_rowid);
+ if (merge_keys[cur_key]->init(i))
+ {
+ // TODO: revert to partial matching via scanning
+ return TRUE;
+ }
+ merge_keys[cur_key]->first();
+ }
+ ++cur_key;
+ }
+ }
+
+ /* Populate the indexes with data from the temporary table. */
+ tmp_table->file->ha_rnd_init(1);
+ tmp_table->file->extra_opt(HA_EXTRA_CACHE,
+ current_thd->variables.read_buff_size);
+ tmp_table->null_row= 0;
+ while (TRUE)
{
- if (!bitmap_is_set(partial_match_key_parts, i))
+ error= tmp_table->file->rnd_next(tmp_table->record[0]);
+ if (error == HA_ERR_RECORD_DELETED)
+ {
+ /* We get this for duplicate records that should not be in tmp_table. */
continue;
+ }
+ /*
+    This is a temp table that we fully own; there should be no other
+    cause to stop the iteration than EOF.
+ */
+ DBUG_ASSERT(!error || error == HA_ERR_END_OF_FILE);
+ if (error == HA_ERR_END_OF_FILE)
+ {
+ DBUG_ASSERT(cur_rownum == tmp_table->file->stats.records);
+ break;
+ }
- merge_keys[cur_key]= new Ordered_key(cur_key, tmp_table, item,
- result_sink->get_null_count_of_col(i),
- result_sink->get_min_null_of_col(i),
- result_sink->get_max_null_of_col(i),
- row_num_to_rowid);
- if (merge_keys[cur_key]->init(i))
+ /*
+ Save the position of this record in the row_num -> rowid mapping.
+ */
+ tmp_table->file->position(tmp_table->record[0]);
+ memcpy(row_num_to_rowid + cur_rownum * rowid_length,
+ tmp_table->file->ref, rowid_length);
+
+ /* Add the current row number to the corresponding keys. */
+ if (non_null_key)
{
- // TODO: revert to partial matching via scanning
- return TRUE;
+ /* By definition there are no NULLs in the non-NULL key. */
+ non_null_key->add_key(cur_rownum);
}
- merge_keys[cur_key]->sort_keys();
+
+ for (uint i= (non_null_key ? 1 : 0); i < keys_count; i++)
+ {
+ /*
+      Check if the first and only indexed column contains NULL in the current
+ row, and add the row number to the corresponding key.
+ */
+ if (tmp_table->field[merge_keys[i]->get_field_idx(0)]->is_null())
+ merge_keys[i]->set_null(cur_rownum);
+ else
+ merge_keys[i]->add_key(cur_rownum);
+ }
+ ++cur_rownum;
}
+ tmp_table->file->ha_rnd_end();
+
+ /* Sort the keys in each of the indexes. */
+ for (uint i= 0; i < keys_count; i++)
+ merge_keys[i]->sort_keys();
+
+ // TODO: sort all the keys by NULL selectivity
+
if (init_queue(&pq, keys_count, 0, FALSE,
- subselect_rowid_merge_engine::cmp_key_by_cur_row, NULL))
+ subselect_rowid_merge_engine::cmp_keys_by_cur_rownum, NULL))
{
// TODO: revert to partial matching via scanning
return TRUE;
@@ -3924,9 +4260,19 @@ subselect_rowid_merge_engine::init(MY_BI
}
+subselect_rowid_merge_engine::~subselect_rowid_merge_engine()
+{
+ delete_queue(&pq);
+}
+
+
void subselect_rowid_merge_engine::cleanup()
{
- // TODO
+ lookup_engine->cleanup();
+ /* Tell handler we don't need the index anymore */
+ if (tmp_table->file->inited)
+ tmp_table->file->ha_rnd_end();
+ queue_remove_all(&pq);
}
@@ -3934,8 +4280,8 @@ void subselect_rowid_merge_engine::clean
*/
int
-subselect_rowid_merge_engine::cmp_key_by_null_selectivity(Ordered_key *a,
- Ordered_key *b)
+subselect_rowid_merge_engine::cmp_keys_by_null_selectivity(Ordered_key *a,
+ Ordered_key *b)
{
double a_sel= a->null_selectivity();
double b_sel= b->null_selectivity();
@@ -3951,37 +4297,26 @@ subselect_rowid_merge_engine::cmp_key_by
*/
int
-subselect_rowid_merge_engine::cmp_key_by_cur_row(void *arg,
- uchar *k1, uchar *k2)
+subselect_rowid_merge_engine::cmp_keys_by_cur_rownum(void *arg,
+ uchar *k1, uchar *k2)
{
- ha_rows row1= ((Ordered_key*) k1)->current();
- ha_rows row2= ((Ordered_key*) k2)->current();
+ rownum_t r1= ((Ordered_key*) k1)->current();
+ rownum_t r2= ((Ordered_key*) k2)->current();
- if (row1 > row2)
- return 1;
- if (row1 == row2)
- return 0;
- return -1;
+ return (r1 < r2) ? -1 : (r1 > r2) ? 1 : 0;
}
/*
- Check if certain table row contains a NULL in all columns in all columns for
- which there is no value match.
-
- @details Notice that if a column is not in the set 'keys', we assume that has
- been checked otherwise that there is a partial or complete match for this
- column. This allows to encode columns that consist of only NULLs as simply
- missing in the set 'keys', because such columns match any value in any row.
+  Check whether a certain table row contains a NULL in all columns for which
+  there is no match in the corresponding value index.
@retval TRUE if a NULL row exists
@retval FALSE otherwise
*/
-bool subselect_rowid_merge_engine::test_null_row(ha_rows row_num)
+bool subselect_rowid_merge_engine::test_null_row(rownum_t row_num)
{
- DBUG_ENTER("subselect_rowid_merge_engine::test_null_row");
-
for (uint i = 0; i < keys_count; i++)
{
if (bitmap_is_set(&matching_keys, i))
@@ -3993,9 +4328,9 @@ bool subselect_rowid_merge_engine::test_
continue;
}
if (!merge_keys[i]->is_null(row_num))
- DBUG_RETURN(FALSE);
+ return FALSE;
}
- DBUG_RETURN(TRUE);
+ return TRUE;
}
@@ -4007,88 +4342,120 @@ bool subselect_rowid_merge_engine::test_
bool subselect_rowid_merge_engine::partial_match()
{
Ordered_key *min_key; /* Key that contains the current minimum position. */
- ha_rows min_row; /* Current row number of min_key. */
+ rownum_t min_row_num; /* Current row number of min_key. */
Ordered_key *cur_key;
- ha_rows cur_row;
-
- DBUG_ENTER("subselect_rowid_merge_engine::partial_match");
+ rownum_t cur_row_num;
+ uint count_nulls_in_search_key= 0;
/* If there is a non-NULL key, it must be the first key in the keys array. */
- DBUG_ASSERT(non_null_key && merge_keys[0] == non_null_key);
+ DBUG_ASSERT(!non_null_key || (non_null_key && merge_keys[0] == non_null_key));
/* Check if there is a match for the columns of the only non-NULL key. */
if (non_null_key && !non_null_key->lookup())
- DBUG_RETURN(FALSE);
+ return FALSE;
+
+ /*
+ If there is a NULL (sub)row that covers all NULL-able columns,
+    then there is a guaranteed partial match, and we don't need to search
+ for the matching row.
+ */
+ if (has_covering_null_row)
+ return TRUE;
+
if (non_null_key)
queue_insert(&pq, (uchar *) non_null_key);
-
/*
- Add all non-empty value keys to the priority queue. Do not process the
- non_null_key, since it was already processed above.
+ Do not add the non_null_key, since it was already processed above.
*/
- uint i= non_null_key ? 1 : 0; /* Skip the non-NULL key, already processed. */
- for (; i < keys_count; i++)
+ bitmap_clear_all(&matching_outer_cols);
+ for (uint i= test(non_null_key); i < keys_count; i++)
{
- if (merge_keys[i]->lookup())
+ DBUG_ASSERT(merge_keys[i]->get_column_count() == 1);
+ if (merge_keys[i]->get_search_key(0)->is_null())
+ {
+ ++count_nulls_in_search_key;
+ bitmap_set_bit(&matching_outer_cols, merge_keys[i]->get_key_idx());
+ }
+ else if (merge_keys[i]->lookup())
queue_insert(&pq, (uchar *) merge_keys[i]);
}
+
/*
- Not all value keys are empty, thus we don't have only NULL keys. If we had,
- the only possible match is a NULL row, and we cheked there is no such row,
- therefore the result is known to be FALSE. In fact this algorithm makes
- sense for at least two non-NULL columns.
+ If the outer reference consists of only NULLs, or if it has NULLs in all
+ nullable columns, the result is UNKNOWN.
*/
- DBUG_ASSERT(pq.elements > 1);
+ if (count_nulls_in_search_key ==
+ ((Item_in_subselect *) item)->left_expr->cols() -
+ (non_null_key ? non_null_key->get_column_count() : 0))
+ return TRUE;
+
+ /*
+    If there is no NULL (sub)row that covers all NULL columns, and there is
+    not a single match for any of the NULL columns, the result is FALSE.
+ */
+ if (pq.elements - test(non_null_key) == 0)
+ return FALSE;
+
+ DBUG_ASSERT(pq.elements);
+
min_key= (Ordered_key*) queue_remove(&pq, 0);
- min_row= min_key->current();
- bitmap_clear_all(&matching_keys);
+ min_row_num= min_key->current();
+ bitmap_copy(&matching_keys, &null_only_columns);
bitmap_set_bit(&matching_keys, min_key->get_key_idx());
- min_key->next();
- if (!min_key->is_eof())
+ bitmap_union(&matching_keys, &matching_outer_cols);
+ if (min_key->next_same())
queue_insert(&pq, (uchar *) min_key);
+ if (pq.elements == 0)
+ {
+ /*
+ Check the only matching row of the only key min_key for NULL matches
+ in the other columns.
+ */
+ if (test_null_row(min_row_num))
+ return TRUE;
+ else
+ return FALSE;
+ }
+
while (TRUE)
{
cur_key= (Ordered_key*) queue_remove(&pq, 0);
- cur_row= min_key->current();
+ cur_row_num= cur_key->current();
- if (cur_row == min_row)
- {
+ if (cur_row_num == min_row_num)
bitmap_set_bit(&matching_keys, cur_key->get_key_idx());
- /* There cannot be a complete match, as we already checked for one. */
- DBUG_ASSERT(bitmap_bits_set(&matching_keys) < matching_keys.n_bits);
- }
else
{
/* Follows from the correct use of priority queue. */
- DBUG_ASSERT(cur_row > min_row);
- if (test_null_row(min_row))
- DBUG_RETURN(TRUE);
+ DBUG_ASSERT(cur_row_num > min_row_num);
+ if (test_null_row(min_row_num))
+ return TRUE;
else
{
min_key= cur_key;
- min_row= cur_row;
- bitmap_clear_all(&matching_keys);
+ min_row_num= cur_row_num;
+ bitmap_copy(&matching_keys, &null_only_columns);
bitmap_set_bit(&matching_keys, min_key->get_key_idx());
+ bitmap_union(&matching_keys, &matching_outer_cols);
}
}
- cur_key->next();
- if (!cur_key->is_eof())
+ if (cur_key->next_same())
queue_insert(&pq, (uchar *) cur_key);
if (pq.elements == 0)
{
/* Check the last row of the last column in PQ for NULL matches. */
- if (test_null_row(min_row))
- DBUG_RETURN(TRUE);
+ if (test_null_row(min_row_num))
+ return TRUE;
else
- DBUG_RETURN(FALSE);
+ return FALSE;
}
}
/* We should never get here. */
DBUG_ASSERT(FALSE);
- DBUG_RETURN(FALSE);
+ return FALSE;
}
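To make the control flow of partial_match() easier to follow, here is a
condensed sketch of the same rowid-merge idea (hypothetical, simplified code —
not the engine itself). Each key contributes a sorted list of row numbers that
match one column of the outer reference; a min-ordered priority queue visits
candidate rows in increasing order, and a candidate is accepted when every
column either matched it by value or is NULL in that row, which is the role
test_null_row() plays above. The sketch assumes each list already holds the
full run of equal keys that next_same() walks, and the callback is_null()
stands in for Ordered_key::is_null():

#include <functional>
#include <queue>
#include <utility>
#include <vector>

typedef std::vector<unsigned long> Rownums;  /* one sorted list per key */

bool rowid_merge(const std::vector<Rownums> &keys,
                 bool (*is_null)(size_t key_no, unsigned long row))
{
  typedef std::pair<unsigned long, size_t> Cur;   /* <row number, key no> */
  std::priority_queue<Cur, std::vector<Cur>, std::greater<Cur> > pq;
  std::vector<size_t> pos(keys.size(), 0);

  for (size_t i= 0; i < keys.size(); i++)
    if (!keys[i].empty())
      pq.push(Cur(keys[i][0], i));

  while (!pq.empty())
  {
    unsigned long row= pq.top().first;
    std::vector<bool> matched(keys.size(), false);
    /* Collect every key whose current position points at this row. */
    while (!pq.empty() && pq.top().first == row)
    {
      size_t k= pq.top().second;
      pq.pop();
      matched[k]= true;
      if (++pos[k] < keys[k].size())
        pq.push(Cur(keys[k][pos[k]], k));
    }
    /* Accept the row if all non-matching columns are NULL in it. */
    bool ok= true;
    for (size_t k= 0; k < keys.size(); k++)
      if (!matched[k] && !is_null(k, row))
      {
        ok= false;
        break;
      }
    if (ok)
      return true;           /* partial match => IN is UNKNOWN */
  }
  return false;              /* no match at all => IN is FALSE */
}

The real engine pops one key at a time and tests the minimum row only once a
larger row number surfaces, but the net effect is the same: each candidate
row is examined exactly once, in increasing order.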
@@ -4097,22 +4464,54 @@ int subselect_rowid_merge_engine::exec()
Item_in_subselect *item_in= (Item_in_subselect *) item;
int res;
- DBUG_ENTER("subselect_rowid_merge_engine::exec");
-
- if ((res= lookup_engine->exec()))
+ /* Try to find a matching row by index lookup. */
+ res= lookup_engine->copy_ref_key_simple();
+ if (res == -1)
+ {
+ /* The result is FALSE based on the outer reference. */
+ item_in->value= 0;
+ item_in->null_value= 0;
+ return 0;
+ }
+ else if (res == 0)
{
- /* An error occured during exec(). */
- DBUG_RETURN(res);
+ if ((res= lookup_engine->index_lookup()))
+ {
+      /* An error occurred during lookup(). */
+ item_in->value= 0;
+ item_in->null_value= 0;
+ return res;
+ }
+ else if (item_in->value)
+ {
+ /*
+ A complete match was found, the result of IN is TRUE.
+ Notice: (this->item == lookup_engine->item)
+ */
+ return 0;
+ }
}
- else if (item_in->value == 1)
+
+ if (has_covering_null_row && !keys_count)
{
/*
- A complete match was found, the result of IN is TRUE.
- Notice: (this->item == lookup_engine->item)
+      If there is a NULL-only row that covers all columns, the result of IN
+ is UNKNOWN.
*/
- DBUG_RETURN(0);
+ item_in->value= 0;
+ /*
+ TODO: which one is the right way to propagate an UNKNOWN result?
+ Should we also set empty_result_set= FALSE; ???
+ */
+ //item_in->was_null= 1;
+ item_in->null_value= 1;
+ return 0;
}
+ /* All data accesses during execution are via handler::rnd_pos() */
+ if (tmp_table->file->inited)
+ tmp_table->file->ha_index_end();
+ tmp_table->file->ha_rnd_init(0);
/*
There is no complete match. Look for a partial match (UNKNOWN result), or
no match (FALSE).
@@ -4121,18 +4520,25 @@ int subselect_rowid_merge_engine::exec()
{
/* The result of IN is UNKNOWN. */
item_in->value= 0;
- /* TODO: which one is the right way to propagate an UNKNOWN result? */
- item_in->was_null= 1;
+ /*
+ TODO: which one is the right way to propagate an UNKNOWN result?
+ Should we also set empty_result_set= FALSE; ???
+ */
+ //item_in->was_null= 1;
item_in->null_value= 1;
}
else
{
/* The result of IN is FALSE. */
item_in->value= 0;
- /* TODO: which one is the right way to propagate an UNKNOWN result? */
- item_in->was_null= 0;
+ /*
+ TODO: which one is the right way to propagate an UNKNOWN result?
+ Should we also set empty_result_set= FALSE; ???
+ */
+ //item_in->was_null= 0;
item_in->null_value= 0;
}
+ tmp_table->file->ha_rnd_end();
- DBUG_RETURN(0);
+ return 0;
}
=== modified file 'sql/item_subselect.h'
--- a/sql/item_subselect.h 2010-02-01 12:09:48 +0000
+++ b/sql/item_subselect.h 2010-02-12 14:33:43 +0000
@@ -610,8 +610,10 @@ public:
virtual void print (String *str, enum_query_type query_type);
bool change_result(Item_subselect *si, select_result_interceptor *result);
bool no_tables();
+ int index_lookup();
int scan_table();
bool copy_ref_key();
+ int copy_ref_key_simple();
bool no_rows() { return empty_result_set; }
virtual enum_engine_type engine_type() { return UNIQUESUBQUERY_ENGINE; }
};
@@ -678,6 +680,34 @@ inline bool Item_subselect::is_uncacheab
return engine->uncacheable();
}
+/*
+  Distinguish the type of (0-based) row numbers from the type of the index into
+ an array of row numbers.
+*/
+typedef ha_rows rownum_t;
+
+
+/*
+ An Ordered_key is an in-memory table index that allows O(log(N)) time
+ lookups of a multi-part key.
+
+ If the index is over a single column, then this column may contain NULLs, and
+  the NULLs are stored separately and tested in O(1) via is_null().
+ Multi-part indexes assume that the indexed columns do not contain NULLs.
+
+ TODO:
+  = Due to the unnatural asymmetry between single and multi-part indexes, it
+ makes sense to somehow refactor or extend the class.
+
+ = This class can be refactored into a base abstract interface, and two
+ subclasses:
+ - one to represent single-column indexes, and
+ - another to represent multi-column indexes.
+ Such separation would allow slightly more efficient implementation of
+ the single-column indexes.
+ = The current design requires such indexes to be fully recreated for each
+    PS (re)execution; however, most of the constituent objects can be reused.
+*/
class Ordered_key
{
@@ -701,11 +731,12 @@ protected:
/* Value index related members. */
/*
The actual value index, consists of a sorted sequence of row numbers.
- There are tbl->file->stats.records elements in this array.
*/
- ha_rows *row_index;
- /* Current element in 'row_index'. */
- ha_rows cur_row;
+ rownum_t *key_buff;
+ /* Number of elements in key_buff. */
+ ha_rows key_buff_elements;
+ /* Current element in 'key_buff'. */
+ ha_rows cur_key_idx;
/*
Mapping from row numbers to row ids. The element row_num_to_rowid[i]
contains a buffer with the rowid for the row numbered 'i'.
@@ -734,15 +765,21 @@ protected:
Quick sort comparison function that compares two rows of the same table
    identified with their row numbers.
*/
- static int cmp_rows_by_rownum(Ordered_key *key, ha_rows* a, ha_rows* b);
+ int cmp_keys_by_row_data(rownum_t a, rownum_t b);
+ static int cmp_keys_by_row_data_and_rownum(Ordered_key *key,
+ rownum_t* a, rownum_t* b);
- int compare_row_with_key(ha_rows row_num);
+ int cmp_key_with_search_key(rownum_t row_num);
public:
+ static void *operator new(size_t size) throw ()
+ { return sql_alloc(size); }
Ordered_key(uint key_idx_arg, TABLE *tbl_arg,
Item *search_key_arg, ha_rows null_count_arg,
ha_rows min_null_row_arg, ha_rows max_null_row_arg,
uchar *row_num_to_rowid_arg);
+ ~Ordered_key();
+ void cleanup();
/* Initialize a multi-column index. */
bool init(MY_BITMAP *columns_to_index);
/* Initialize a single-column index. */
@@ -750,10 +787,21 @@ public:
uint get_column_count() { return key_column_count; }
uint get_key_idx() { return key_idx; }
- void add_key(ha_rows row_num)
+ uint get_field_idx(uint i)
+ {
+ DBUG_ASSERT(i < key_column_count);
+ return key_columns[i]->field->field_index;
+ }
+ Item *get_search_key(uint i)
{
- row_index[cur_row]= row_num;
- ++cur_row;
+ return search_key->element_index(key_columns[i]->field->field_index);
+ }
+ void add_key(rownum_t row_num)
+ {
+ /* The caller must know how many elements to add. */
+ DBUG_ASSERT(key_buff_elements && cur_key_idx < key_buff_elements);
+ key_buff[cur_key_idx]= row_num;
+ ++cur_key_idx;
}
void sort_keys();
@@ -766,28 +814,38 @@ public:
this->search_key.
*/
bool lookup();
- /* Return the current index element. */
- ha_rows current() { return row_index[cur_row]; }
- /* Move the current index cursor at the next match. */
+ /* Move the current index cursor to the first key. */
+ void first()
+ {
+ DBUG_ASSERT(key_buff_elements);
+ cur_key_idx= 0;
+ }
+ /* TODO */
+ bool next_same();
+ /* Move the current index cursor to the next key. */
bool next()
{
- if (cur_row < tbl->file->stats.records)
+ DBUG_ASSERT(key_buff_elements);
+ if (cur_key_idx < key_buff_elements - 1)
{
- ++cur_row;
+ ++cur_key_idx;
return TRUE;
}
return FALSE;
};
- /* Return false if all matches are exhausted, true otherwise. */
- bool is_eof() { return cur_row == tbl->file->stats.records; }
+ /* Return the current index element. */
+ rownum_t current()
+ {
+ DBUG_ASSERT(key_buff_elements && cur_key_idx < key_buff_elements);
+ return key_buff[cur_key_idx];
+ }
- void set_null(ha_rows row_num)
+ void set_null(rownum_t row_num)
{
bitmap_set_bit(&null_key, row_num);
}
- bool is_null(ha_rows row_num)
+ bool is_null(rownum_t row_num)
{
- DBUG_ENTER("Ordered_key::is_null");
/*
Indexes consisting of only NULLs do not have a bitmap buffer at all.
Their only initialized member is 'n_bits', which is equal to the number
@@ -796,11 +854,11 @@ public:
if (null_count == tbl->file->stats.records)
{
DBUG_ASSERT(tbl->file->stats.records == null_key.n_bits);
- DBUG_RETURN(TRUE);
+ return TRUE;
}
if (row_num > max_null_row || row_num < min_null_row)
- DBUG_RETURN(FALSE);
- DBUG_RETURN(bitmap_is_set(&null_key, row_num));
+ return FALSE;
+ return bitmap_is_set(&null_key, row_num);
}
};
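The new is_null() body is a three-step test: an all-NULL column needs no
bitmap at all, rows outside [min_null_row, max_null_row] are rejected without
touching the bitmap, and only the remaining rows consult it. A standalone
sketch of that logic, assuming a plain byte-array bitmap (hypothetical code,
not from the patch):

bool is_null_sketch(const unsigned char *null_bitmap,
                    unsigned long row_num, unsigned long total_rows,
                    unsigned long null_count,
                    unsigned long min_null_row, unsigned long max_null_row)
{
  if (null_count == total_rows)
    return true;   /* the column consists of only NULLs, no bitmap needed */
  if (row_num < min_null_row || row_num > max_null_row)
    return false;  /* outside the only range that can contain NULLs */
  return (null_bitmap[row_num / 8] >> (row_num % 8)) & 1;
}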
@@ -815,18 +873,28 @@ protected:
TRUE, then subselect_rowid_merge_engine further distinguishes between
FALSE and UNKNOWN.
*/
- subselect_engine *lookup_engine;
+ subselect_uniquesubquery_engine *lookup_engine;
/*
- Mapping from row numbers to row ids. The element row_num_to_rowid[i]
- contains a buffer with the rowid for the row numbered 'i'.
+ Mapping from row numbers to row ids. The rowids are stored sequentially
+ in the array - rowid[i] is located in row_num_to_rowid + i * rowid_length.
*/
uchar *row_num_to_rowid;
/*
A subset of all the keys for which there is a match for the same row.
- Used during execution. Computed for each call to exec().
+    Used during execution. Computed for each outer reference.
*/
MY_BITMAP matching_keys;
/*
+ The columns of the outer reference that are NULL. Computed for each
+ outer reference.
+ */
+ MY_BITMAP matching_outer_cols;
+ /*
+ Columns that consist of only NULLs. Such columns match any value.
+ Computed once per query execution.
+ */
+ MY_BITMAP null_only_columns;
+ /*
Indexes of row numbers, sorted by <column_value, row_number>. If an
index may contain NULLs, the NULLs are stored efficiently in a bitmap.
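The flat row_num_to_rowid layout documented above amounts to fixed-stride
addressing: rowid i lives at offset i * rowid_length inside one contiguous
allocation. A hypothetical one-line accessor (not in the patch, using the
codebase's own uchar/uint types and the new rownum_t) makes this explicit:

static inline uchar *rowid_of(uchar *row_num_to_rowid,
                              rownum_t row_num, uint rowid_length)
{
  return row_num_to_rowid + row_num * rowid_length;
}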
@@ -849,44 +917,59 @@ protected:
This queue is used by the partial match algorithm in method exec().
*/
QUEUE pq;
+ /* True if there is a NULL (sub)row that covers all NULLable columns. */
+ bool has_covering_null_row;
protected:
/*
Comparison function to compare keys in order of increasing bitmap
selectivity.
*/
- static int cmp_key_by_null_selectivity(Ordered_key *a, Ordered_key *b);
+ static int cmp_keys_by_null_selectivity(Ordered_key *a, Ordered_key *b);
/*
Comparison function used by the priority queue pq, the 'smaller' key
is the one with the smaller current row number.
*/
- static int cmp_key_by_cur_row(void *arg, uchar *k1, uchar *k2);
+ static int cmp_keys_by_cur_rownum(void *arg, uchar *k1, uchar *k2);
- bool test_null_row(ha_rows row_num);
+ bool test_null_row(rownum_t row_num);
bool partial_match();
public:
- subselect_rowid_merge_engine(subselect_engine *lookup_engine_arg,
+ subselect_rowid_merge_engine(subselect_uniquesubquery_engine *engine_arg,
TABLE *tmp_table_arg, uint keys_count_arg,
+ uint has_covering_null_row_arg,
Item_subselect *item_arg,
select_result_interceptor *result_arg)
:subselect_engine(item_arg, result_arg),
- tmp_table(tmp_table_arg), lookup_engine(lookup_engine_arg),
- keys_count(keys_count_arg)
- {}
-
+ tmp_table(tmp_table_arg), lookup_engine(engine_arg),
+ keys_count(keys_count_arg), non_null_key(NULL),
+ has_covering_null_row(has_covering_null_row_arg)
+ {
+ thd= lookup_engine->get_thd();
+ }
+ ~subselect_rowid_merge_engine();
bool init(MY_BITMAP *non_null_key_parts, MY_BITMAP *partial_match_key_parts);
void cleanup();
int prepare() { return 0; }
void fix_length_and_dec(Item_cache**) {}
int exec();
- uint cols() { return 0; }
+ uint cols() { /* TODO: what is the correct value? */ return 1; }
uint8 uncacheable() { return UNCACHEABLE_DEPENDENT; }
void exclude() {}
table_map upper_select_const_tables() { return 0; }
void print(String*, enum_query_type) {}
bool change_result(Item_subselect*, select_result_interceptor*)
- { return false; }
+ { DBUG_ASSERT(FALSE); return false; }
bool no_tables() { return false; }
- bool no_rows() {return false; }
+ bool no_rows()
+ {
+ /*
+    TODO: It is completely unclear what the semantics of this
+ method. The current result is computed so that the call to no_rows()
+ from Item_in_optimizer::val_int() sets Item_in_optimizer::null_value
+ correctly.
+ */
+ return !(((Item_in_subselect *) item)->null_value);
+ }
};
@@ -933,6 +1016,7 @@ protected:
/* Keyparts of the single column indexes with NULL, one keypart per index. */
MY_BITMAP partial_match_key_parts;
uint count_partial_match_columns;
+ uint count_null_only_columns;
/*
A conjunction of all the equality condtions between all pairs of expressions
that are arguments of an IN predicate. We need these to post-filter some
@@ -962,7 +1046,7 @@ public:
:subselect_engine(in_predicate, NULL), tmp_table(NULL),
is_materialized(FALSE), materialize_engine(old_engine), lookup_engine(NULL),
materialize_join(NULL), count_partial_match_columns(0),
- semi_join_conds(NULL)
+ count_null_only_columns(0), semi_join_conds(NULL)
{
set_thd(thd);
}
=== modified file 'sql/sql_class.cc'
--- a/sql/sql_class.cc 2010-01-22 16:18:05 +0000
+++ b/sql/sql_class.cc 2010-02-12 14:33:43 +0000
@@ -2931,12 +2931,13 @@ create_result_table(THD *thd_arg, List<I
options, HA_POS_ERROR, (char*) table_alias)))
return TRUE;
- /* TODO: if/where/when to free this buffer? */
- col_stat= (Column_statistics*) table->in_use->calloc(table->s->fields *
- sizeof(Column_statistics));
+ col_stat= (Column_statistics*) table->in_use->alloc(table->s->fields *
+ sizeof(Column_statistics));
if (!stat)
return TRUE;
+ cleanup();
+
table->file->extra(HA_EXTRA_WRITE_CACHE);
table->file->extra(HA_EXTRA_IGNORE_DUP_KEY);
return FALSE;
@@ -2966,14 +2967,14 @@ bool select_materialize_with_stats::send
{
++cur_col_stat->null_count;
cur_col_stat->max_null_row= count_rows;
- if (cur_col_stat->min_null_row == 0)
+ if (!cur_col_stat->min_null_row)
cur_col_stat->min_null_row= count_rows;
++nulls_in_row;
}
++cur_col_stat;
}
- if (nulls_in_row == items.elements)
- ++null_record_count;
+ if (nulls_in_row > max_nulls_in_row)
+ max_nulls_in_row= nulls_in_row;
return select_union::send_data(items);
}
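The statistics that send_data() maintains above reduce to a few counters per
column plus the widest all-NULL sub-row seen so far. A toy sketch of the same
bookkeeping (hypothetical code, assuming 1-based row numbers so that 0 can
mean "no NULL seen yet in this column", as in the patch):

struct Col_stat
{
  unsigned long null_count, min_null_row, max_null_row;
};

void update_stats(Col_stat *stat, size_t cols, const bool *row_is_null,
                  unsigned long row_no, unsigned long *max_nulls_in_row)
{
  unsigned long nulls_in_row= 0;
  for (size_t i= 0; i < cols; i++)
  {
    if (row_is_null[i])
    {
      ++stat[i].null_count;
      stat[i].max_null_row= row_no;
      if (!stat[i].min_null_row)       /* first NULL seen in this column */
        stat[i].min_null_row= row_no;
      ++nulls_in_row;
    }
  }
  if (nulls_in_row > *max_nulls_in_row)
    *max_nulls_in_row= nulls_in_row;
}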
=== modified file 'sql/sql_class.h'
--- a/sql/sql_class.h 2010-02-01 12:09:48 +0000
+++ b/sql/sql_class.h 2010-02-12 14:33:43 +0000
@@ -3044,17 +3044,20 @@ protected:
public:
/* Count of NULLs per column. */
ha_rows null_count;
- /* The row number that contains the last NULL in a column. */
- ha_rows max_null_row;
/* The row number that contains the first NULL in a column. */
ha_rows min_null_row;
+ /* The row number that contains the last NULL in a column. */
+ ha_rows max_null_row;
};
/* Array of statistics data per column. */
Column_statistics* col_stat;
- /* The number of rows that consist only of NULL values. */
- ha_rows null_record_count;
+ /*
+ The number of columns in the biggest sub-row that consists of only
+ NULL values.
+ */
+ ha_rows max_nulls_in_row;
/*
    Count of rows written to the temp table. This is redundant as it is
already stored in handler::stats.records, however that one is relatively
@@ -3063,11 +3066,7 @@ protected:
ha_rows count_rows;
public:
- select_materialize_with_stats()
- {
- null_record_count= 0;
- count_rows= 0;
- }
+ select_materialize_with_stats() {}
virtual bool create_result_table(THD *thd, List<Item> *column_types,
bool is_distinct, ulonglong options,
const char *alias, bool bit_fields_as_long);
@@ -3075,9 +3074,9 @@ public:
bool send_data(List<Item> &items);
void cleanup()
{
- null_record_count= 0;
- count_rows= 0;
memset(col_stat, 0, table->s->fields * sizeof(Column_statistics));
+ max_nulls_in_row= 0;
+ count_rows= 0;
}
ha_rows get_null_count_of_col(uint idx)
{
@@ -3094,7 +3093,7 @@ public:
DBUG_ASSERT(idx < table->s->fields);
return col_stat[idx].min_null_row;
}
- ha_rows get_null_record_count() { return null_record_count; }
+ ha_rows get_max_nulls_in_row() { return max_nulls_in_row; }
};
=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc 2010-01-22 16:18:05 +0000
+++ b/sql/sql_select.cc 2010-02-12 14:33:43 +0000
@@ -707,7 +707,7 @@ JOIN::prepare(Item ***rref_pointer_array
subquery_types_allow_materialization(in_subs))
{
    // psergey-todo: duplicated_subselect_card_check: where is it done?
- if (in_subs->is_top_level_item() && // 4
+ if (//in_subs->is_top_level_item() && // 4
!in_subs->is_correlated && // 5
in_subs->exec_method == Item_in_subselect::NOT_TRANSFORMED) // 6
in_subs->exec_method= Item_in_subselect::MATERIALIZATION;
[Maria-developers] [Branch ~maria-captains/maria/5.1] Rev 2817: Fix for LPBUG#516148 Test maria.maria3 fails when --without-maria-tmp-tables is set
by noreply@launchpad.net 12 Feb '10
------------------------------------------------------------
revno: 2817
committer: Michael Widenius <monty(a)askmonty.org>
branch nick: maria-5.1
timestamp: Fri 2010-02-12 16:21:13 +0200
message:
Fix for LPBUG#516148 Test maria.maria3 fails when --without-maria-tmp-tables is set
modified:
mysql-test/suite/maria/r/maria3.result
mysql-test/suite/maria/t/maria3.test
storage/maria/ha_maria.cc
--
lp:maria
https://code.launchpad.net/~maria-captains/maria/5.1
Your team Maria developers is subscribed to branch lp:maria.
To unsubscribe from this branch go to https://code.launchpad.net/~maria-captains/maria/5.1/+edit-subscription.
[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2817)
by Michael Widenius 12 Feb '10
#At lp:maria based on revid:monty@askmonty.org-20100211191524-rbd8pfcchi9ewm4a
2817 Michael Widenius 2010-02-12
Fix for LPBUG#516148 Test maria.maria3 fails when --without-maria-tmp-tables is set
modified:
mysql-test/suite/maria/r/maria3.result
mysql-test/suite/maria/t/maria3.test
storage/maria/ha_maria.cc
per-file messages:
mysql-test/suite/maria/r/maria3.result
Updated test results
mysql-test/suite/maria/t/maria3.test
Don't show maria_used_for_temp_tables, as its value depends on configure options
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 14:21:13 +0000
@@ -301,7 +301,7 @@ check table t1 extended;
Table Op Msg_type Msg_text
test.t1 check status OK
drop table t1;
-show variables like 'maria%';
+select lower(variable_name) as Variable_name, Variable_value as Value from information_schema.session_variables where variable_name like "maria%" and variable_name not like "maria_used_for_temp_tables" order by 1;
Variable_name Value
maria_block_size 8192
maria_checkpoint_interval 30
@@ -309,16 +309,15 @@ maria_force_start_after_recovery_failure
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
-maria_page_checksum OFF
maria_pagecache_age_threshold 300
maria_pagecache_buffer_size 8384512
maria_pagecache_division_limit 100
+maria_page_checksum OFF
maria_recover OFF
maria_repair_threads 1
maria_sort_buffer_size 8388608
maria_stats_method nulls_unequal
maria_sync_log_dir NEWFILE
-maria_used_for_temp_tables ON
show status like 'maria%';
Variable_name Value
Maria_pagecache_blocks_not_flushed #
=== modified file 'mysql-test/suite/maria/t/maria3.test'
--- a/mysql-test/suite/maria/t/maria3.test 2009-06-02 09:58:27 +0000
+++ b/mysql-test/suite/maria/t/maria3.test 2010-02-12 14:21:13 +0000
@@ -259,7 +259,7 @@ drop table t1;
# Fix if we are using safemalloc
--replace_result 8388572 8388600
-show variables like 'maria%';
+select lower(variable_name) as Variable_name, Variable_value as Value from information_schema.session_variables where variable_name like "maria%" and variable_name not like "maria_used_for_temp_tables" order by 1;
--replace_column 2 #
show status like 'maria%';
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2010-02-10 19:06:24 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 14:21:13 +0000
@@ -3278,11 +3278,11 @@ static struct st_mysql_sys_var* system_v
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
- MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_purge_type),
MYSQL_SYSVAR(max_sort_file_size),
+ MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(pagecache_age_threshold),
MYSQL_SYSVAR(pagecache_buffer_size),
MYSQL_SYSVAR(pagecache_division_limit),
[Maria-developers] Rev 2741: Group commit for maria engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit2/
by sanja@askmonty.org 12 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit2/
------------------------------------------------------------
revno: 2741
revision-id: sanja(a)askmonty.org-20100212131228-bgxli0wfybhjkvg9
parent: sergii(a)pisem.net-20100212084731-b5jst7oxhzp251pg
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit2
timestamp: Fri 2010-02-12 15:12:28 +0200
message:
Group commit for maria engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-12 13:12:28 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 13:12:28 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-12 13:12:28 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== added directory 'randgen'
=== added directory 'randgen/conf'
=== added file 'randgen/conf/maria_group_commit.yy'
--- a/randgen/conf/maria_group_commit.yy 1970-01-01 00:00:00 +0000
+++ b/randgen/conf/maria_group_commit.yy 2010-02-12 13:12:28 +0000
@@ -0,0 +1,181 @@
+# test of group commit switching
+
+query:
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ change_group_commit | change_interval;
+
+
+select:
+ SELECT select_item FROM join where order_by limit;
+
+select_item:
+ * | X . _field ;
+
+join:
+ _table AS X |
+ _table AS X LEFT JOIN _table AS Y ON ( X . _field = Y . _field ) ;
+
+where:
+ |
+ WHERE X . _field < value |
+ WHERE X . _field > value |
+ WHERE X . _field = value ;
+
+where_delete:
+ |
+ WHERE _field < value |
+ WHERE _field > value |
+ WHERE _field = value ;
+
+order_by:
+ | ORDER BY X . _field ;
+
+limit:
+ | LIMIT _digit ;
+
+insert:
+ INSERT INTO _table ( _field , _field ) VALUES ( value , value ) ;
+
+update:
+ UPDATE _table AS X SET _field = value where order_by limit ;
+
+delete:
+ DELETE FROM _table where_delete LIMIT _digit ;
+
+value:
+ ' _letter ' | _digit | _date | _datetime | _time | _english ;
+
+change_group_commit:
+ SET GLOBAL MARIA_GROUP_COMMIT=none_soft_hard;
+
+none_soft_hard:
+ NONE | SOFT | HARD;
+
+change_interval:
+ set_interval | set_interval | set_interval | set_interval |
+ drop_interval;
+
+set_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=_tinyint_unsigned;
+
+drop_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=0;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2010-02-10 19:06:24 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 13:12:28 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3278,6 +3314,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3309,6 +3347,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3330,6 +3454,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-12 13:12:28 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-12 13:12:28 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+    If the buffer was forced to close, this holds the value of its
+    horizon; otherwise LSN_IMPOSSIBLE.
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+    When moving from one log buffer to another, we write the tail of the
+    previous buffer to file and then start using the new log buffer. In
+    the case of a partly filled last page, this page is not moved to the
+    start of the new buffer; instead we set the 'skipped_data' variable
+    to tell us how much data at the beginning of the buffer is not
+    relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ File numbers interval which have to be sync()
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+    We should not increase the counter in case of an error above, but
+    errors are so unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+ We read page part of which is not present in buffer,
+ so we should read absent part from file (page cache actually)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's ok to not lock the page because:
+ - The log handler has it's own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+    skipped_data= 0; /* The requested page is past the skipped data */
+ /*
+ Now we have correct data in buffer up to 'skipped_data'. The
+ following memcpy() will move the data from the internal buffer
+ that was not yet on disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can use copy then in translog_page_validator() because it
do not put it permanently somewhere.
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3483,7 +3600,10 @@
my_bool version_changed= 0;
DBUG_ENTER("translog_init_with_table");
+ translog_syncs= 0;
+ flush_start= 0;
id_to_share= NULL;
+
log_descriptor.directory_fd= -1;
log_descriptor.is_everything_flushed= 1;
log_descriptor.flush_in_progress= 0;
@@ -3511,6 +3631,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4033,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4115,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4194,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4284,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7013,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7484,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7515,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7639,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change if we add more data to the page, so
+ we need not copy it and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7701,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7713,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() a range of files (inclusive) and the directory (on request)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir whether the directory needs to be sync()ed
+
+ @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ @brief Flushes buffers containing LSNs less than or equal to address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed;
+ can be reset to the real last LSN address
+ @param sent_to_disk returns the 'sent to disk' position
+ @param flush_horizon returns the horizon of the flush
+
+ @note For terminology, see the comment for translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck information when will lock buffers one by
@@ -7656,15 +7809,15 @@
/*
if LSN up to which we have to flush bigger then maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7829,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7852,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits are made in passes. The thread that
+ started the flush first performs the actual flush; other threads set a
+ new goal (LSN) for the next pass (if theirs is the maximum) and wait
+ for the pass to end, or just wait for the pass to end.
+
+ - If hard group commit enabled and rate set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as new LSNs are added. The process cannot loop
+ forever because we have a limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit is switched ON and less than 'rate' microseconds
+ have passed since the last sync, then after the buffers have been sent to
+ disk, wait until 'rate' microseconds have passed since the last sync, do
+ the sync and return. This way, if we call sync infrequently, we don't wait.
+
+ - If soft group commit is enabled, everything works as with 'non group
+ commit', but the thread doesn't do any real sync(). If the rate is not
+ zero, the sync() will be performed by a service thread at the given rate
+ when needed (a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() releases log_flush_lock while it is
+ waiting, then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* The following function flushes buffers and calls translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check time here because pthread_mutex_lock rarely takes
+ a lot of time, so we can sacrifice a bit of precision for performance
+ (taking into account that my_micro_time() might be an expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other threads from continuing */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() files from previous flush till current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep the values for soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
@@ -8113,6 +8453,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8462,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
@@ -8356,6 +8712,159 @@
}
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("ma_translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function together with an additional variable because
+ we have to restart the service thread with the new value, which we cannot
+ do inside the variable-update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-12 13:12:28 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND
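
The new loghandler entry points declared above are driven from ha_maria's
sysvar update hooks (see the Rev 2740 message below). As a minimal sketch of
the intended call order when enabling soft group commit, assuming the translog
is already initialized; the wrapper function itself is hypothetical and not
part of the patch:

    #include "ma_loghandler.h"

    /*
      Hypothetical helper (not in the patch): switch the loghandler into
      "soft" group commit with the given service-thread interval in
      microseconds, mirroring the order used by update_maria_group_commit()
      and update_maria_group_commit_interval() in ha_maria.cc.
    */
    static int enable_soft_group_commit(uint32 interval_usec)
    {
      translog_sync();                      /* sync everything written so far */
      translog_soft_sync(TRUE);             /* callers stop doing real sync() */
      translog_set_group_commit_interval(interval_usec);
      if (interval_usec)
        return translog_soft_sync_start();  /* start the background sync thread */
      return 0;
    }

Disabling runs in reverse: translog_soft_sync_end() stops the service thread,
translog_soft_sync(FALSE) restores real sync() in callers, and a final
translog_sync() flushes what is pending, matching the shutdown order in the
ma_init.c hunk of the patch below.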
[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 12 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100212091325-sluwoeo04cvmjewk
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Fri 2010-02-12 11:13:25 +0200
message:
Group commit for maria storage engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-12 09:13:25 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 09:13:25 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-12 09:13:25 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== added directory 'randgen'
=== added directory 'randgen/conf'
=== added file 'randgen/conf/maria_group_commit.yy'
--- a/randgen/conf/maria_group_commit.yy 1970-01-01 00:00:00 +0000
+++ b/randgen/conf/maria_group_commit.yy 2010-02-12 09:13:25 +0000
@@ -0,0 +1,181 @@
+# test of group commit switching
+
+query:
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ change_group_commit | change_interval;
+
+
+select:
+ SELECT select_item FROM join where order_by limit;
+
+select_item:
+ * | X . _field ;
+
+join:
+ _table AS X |
+ _table AS X LEFT JOIN _table AS Y ON ( X . _field = Y . _field ) ;
+
+where:
+ |
+ WHERE X . _field < value |
+ WHERE X . _field > value |
+ WHERE X . _field = value ;
+
+where_delete:
+ |
+ WHERE _field < value |
+ WHERE _field > value |
+ WHERE _field = value ;
+
+order_by:
+ | ORDER BY X . _field ;
+
+limit:
+ | LIMIT _digit ;
+
+insert:
+ INSERT INTO _table ( _field , _field ) VALUES ( value , value ) ;
+
+update:
+ UPDATE _table AS X SET _field = value where order_by limit ;
+
+delete:
+ DELETE FROM _table where_delete LIMIT _digit ;
+
+value:
+ ' _letter ' | _digit | _date | _datetime | _time | _english ;
+
+change_group_commit:
+ SET GLOBAL MARIA_GROUP_COMMIT=none_soft_hard;
+
+none_soft_hard:
+ NONE | SOFT | HARD;
+
+change_interval:
+ set_interval | set_interval | set_interval | set_interval |
+ drop_interval;
+
+set_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=_tinyint_unsigned;
+
+drop_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=0;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 09:13:25 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3275,6 +3311,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3306,6 +3344,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3327,6 +3451,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-12 09:13:25 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-12 09:13:25 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief protects the soft sync state */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for killing the background soft sync thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for the soft sync background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+ If the buffer was forced to close, this saves the value of its horizon;
+ otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+ When moving from one log buffer to another, we write the rest of the
+ previous buffer to file and then start using the new log
+ buffer. In the case of a partially filled last page, this page is not
+ moved to the start of the new buffer; instead we set the 'skipped_data'
+ variable to tell us how much data at the beginning of the buffer is not
+ relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ Interval of file numbers which have to be sync()ed
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+ We should not increase the counter in case of an error above, but an
+ error is so unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+ We are reading a page, part of which is not present in the buffer,
+ so we should read the absent part from the file (actually from the page cache)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's OK not to lock the page because:
+ - The log handler has its own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+ skipped_data= 0; /* The read is past the skipped data; page is fully in the buffer */
+ /*
+ Now we have correct data in buffer up to 'skipped_data'. The
+ following memcpy() will copy, from the internal buffer, the data
+ that was not yet on disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can then use the copy in translog_page_validator() because it
does not put it anywhere permanently.
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3483,7 +3600,10 @@
my_bool version_changed= 0;
DBUG_ENTER("translog_init_with_table");
+ translog_syncs= 0;
+ flush_start= 0;
id_to_share= NULL;
+
log_descriptor.directory_fd= -1;
log_descriptor.is_everything_flushed= 1;
log_descriptor.flush_in_progress= 0;
@@ -3511,6 +3631,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4033,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4115,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4194,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4284,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7013,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7484,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7515,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7639,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change if we add more data to the page, so
+ we need not copy it and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7701,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7713,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() range of files (inclusive) and directory (by request)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir whether the log directory needs to be sync()ed
+
+ @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
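
The is_sync latch above gives "sync each file at most once until it is written again". A reduced sketch under assumed names (file_by_number() plays the role of get_logfile_by_number(), POSIX fsync() of my_sync()):

#include <unistd.h>
#include <stdint.h>

struct logfile { int fd; int is_sync; };
extern struct logfile *file_by_number(uint32_t no);  /* assumed lookup */

static int sync_file_range_once(uint32_t min, uint32_t max)
{
  uint32_t fn;
  for (fn= min; fn <= max; fn++)
  {
    struct logfile *f= file_by_number(fn);
    if (!f->is_sync)
    {
      if (fsync(f->fd))
        return 1;          /* caller stops writing, as in the patch */
      f->is_sync= 1;       /* don't fsync this file again until rewritten */
    }
  }
  return 0;
}
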
+
+
+/*
+ @brief Flushes buffers that contain LSNs less than or equal to
+ address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed,
+ can be reset to the real last LSN address
+ @param sent_to_disk returns the 'sent to disk' position
+ @param flush_horizon returns the horizon of the flush
+
+ @note For terminology, see the comment for translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck information when will lock buffers one by
@@ -7656,15 +7809,15 @@
/*
if LSN up to which we have to flush is bigger than maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7829,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7852,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits are made in passes. The thread that
+ starts a flush first performs the actual flush; other threads set a
+ new goal (LSN) for the next pass (if it is the maximum) and wait for
+ the pass to end, or just wait for the pass to end.
+
+ - If hard group commit is enabled and the rate is set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as new LSNs are added. The process cannot loop forever because
+ we have a limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit is switched ON and less than 'rate' microseconds
+ have passed since the last sync, then after the buffers have been sent
+ to disk we wait until 'rate' microseconds have passed since the last
+ sync, then sync and return.
+ This ensures that if we call sync infrequently we don't do any waits.
+
+ - If soft group commit is enabled, everything works as with 'non group
+ commit', but the thread doesn't do any real sync(). If the rate is not
+ zero, the sync() will be performed by a service thread at the given
+ rate when needed (when a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() releases log_flush_lock while it is
+ waiting, then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check the time here because pthread_mutex_lock rarely takes
+ a lot of time, so we can sacrifice a bit of precision for performance
+ (taking into account that my_micro_time() might be an expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other threads from continuing */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() the files from the previous flush up to the current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep the values used by soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
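
The retest loop above is the classic rate-limited condition wait. A minimal sketch under assumed helper names: now_us() stands in for my_micro_time(), us_to_abstime() converts a remaining wait into the absolute timespec that pthread_cond_timedwait() expects, and goal plays the role of next_pass_max_lsn (0 standing in for LSN_IMPOSSIBLE):

#include <pthread.h>
#include <time.h>
#include <stdint.h>

static pthread_mutex_t lock= PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t new_goal= PTHREAD_COND_INITIALIZER;
static uint64_t goal= 0;

extern uint64_t now_us(void);                            /* assumed */
extern void us_to_abstime(uint64_t us, struct timespec *ts); /* assumed */

/* Wait until either a new goal arrives or interval_us has elapsed since
   last_sync_us; returns the goal to flush next, or 0 to sync and finish. */
static uint64_t wait_for_goal_or_deadline(uint64_t interval_us,
                                          uint64_t last_sync_us)
{
  uint64_t taken;
  pthread_mutex_lock(&lock);
  while (goal == 0 && now_us() - last_sync_us < interval_us)
  {
    struct timespec abstime;
    us_to_abstime(interval_us - (now_us() - last_sync_us), &abstime);
    pthread_cond_timedwait(&new_goal, &lock, &abstime);
  }
  taken= goal;
  goal= 0;                            /* take the goal, as the patch does */
  pthread_mutex_unlock(&lock);
  return taken;
}
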
@@ -8113,6 +8453,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8462,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
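
The clamp added to translog_purge() above reduces to one comparison. A sketch with hypothetical names (soft_on, min_unsynced and requested mirror soft_sync, soft_sync_min and LSN_FILE_NO(low)):

#include <stdint.h>

/* With soft sync, files at or above min_unsynced may not be on stable
   storage yet, so purging must stop short of them. */
static uint32_t last_purgeable_file(uint32_t requested,
                                    uint32_t min_unsynced,
                                    int soft_on)
{
  if (soft_on && min_unsynced < requested)
    return min_unsynced;
  return requested;
}
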
@@ -8356,6 +8712,159 @@
}
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function with an additional variable because we have
+ to restart the service thread with the new value, which we cannot do
+ inside the variable-update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
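
The sleep computation at the end of the service loop above is plain fixed-rate accounting; a sketch, again assuming a now_us() microsecond clock:

#include <stdint.h>

extern uint64_t now_us(void);                  /* assumed clock helper */

/* Sleep for period_us minus the time the loop body took, so the sync
   service keeps an approximately fixed rate even when sync() is slow. */
static uint64_t remaining_sleep(uint64_t period_us, uint64_t loop_start_us)
{
  uint64_t spent= now_us() - loop_start_us;
  return spent >= period_us ? 0 : period_us - spent;
}
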
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
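
Taken together, the new entry points are meant to be used roughly as below; a sketch of one plausible ordering (it mirrors what ha_maria.cc and ma_init.c do elsewhere in this patch, error handling omitted):

/* Enable soft sync with a 100000 microsecond service rate, run, then
   shut it down so that no LSN stays unsynced. */
static void example_soft_sync_lifecycle(void)
{
  translog_set_group_commit_interval(100000);
  translog_soft_sync(TRUE);          /* translog_flush() stops real sync() */
  (void) translog_soft_sync_start(); /* spawns ma_soft_sync_background() */

  /* ... normal operation: the service thread sync()s periodically ... */

  translog_soft_sync(FALSE);         /* back to synchronous behaviour */
  translog_soft_sync_end();          /* stop the service thread */
  translog_sync();                   /* final forced sync */
}
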
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-12 09:13:25 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND

[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 12 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100212065247-vnhehxm6snm32c1j
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Fri 2010-02-12 08:52:47 +0200
message:
Group commit for maria storage engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-12 06:52:47 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 06:52:47 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-12 06:52:47 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 06:52:47 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting for the actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commits in microseconds (1/1000000 sec)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode, and no"
+ " sync()/commit at all in \"soft\" mode. The option only has an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3275,6 +3311,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3306,6 +3344,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
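
The update handler above follows a "quiesce, sync, switch" shape; reduced to a sketch with hypothetical helpers (stop_service_thread(), start_service_thread() and force_sync() are not the patch's real names):

enum mode { M_NONE, M_HARD, M_SOFT };           /* hypothetical stand-ins */

extern void stop_service_thread(void);          /* assumed helpers */
extern void start_service_thread(void);
extern void force_sync(void);

/* Turn the old mode's machinery off, sync everything written under the
   old rules, and only then enable the new mode. */
static void switch_mode(enum mode *cur, enum mode next)
{
  if (*cur == M_SOFT)
    stop_service_thread();
  force_sync();
  *cur= next;
  if (next == M_SOFT)
    start_service_thread();
}
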
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3327,6 +3451,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-12 06:52:47 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-12 06:52:47 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief protects the soft sync state below */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for waking/killing the background soft sync thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for the soft sync background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+ If the buffer was forced to close, this saves the value of its horizon;
+ otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+ When moving from one log buffer to another, we write the rest of the
+ previous buffer to file and then start using the new log buffer. In
+ the case of a partially filled last page, this page is not moved to
+ the start of the new buffer; instead we set the 'skipped_data'
+ variable to tell us how much data at the beginning of the buffer is
+ not relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
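
To make the 'skipped_data' comment above concrete: when the head of a buffer repeats a page head that is already on disk in final form, the write shifts both the source pointer and the file offset past it. A sketch with POSIX pwrite() standing in for my_pwrite():

#include <unistd.h>
#include <stdint.h>

/* Write a buffer whose first 'skipped' bytes are already on disk in
   final form: start both the copy and the file offset past them. */
static ssize_t write_skipping_head(int fd, const unsigned char *buf,
                                   size_t size, off_t file_off,
                                   uint32_t skipped)
{
  return pwrite(fd, buf + skipped, size - skipped, file_off + skipped);
}
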
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ File numbers interval which have to be sync()
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+ We should not increase the counter in case of an error above, but that
+ is so unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+ We read page part of which is not present in buffer,
+ so we should read absent part from file (page cache actually)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's ok not to lock the page because:
+ - The log handler has its own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+ skipped_data= 0; /* Read after skipped in buffer data */
+ /*
+ Now we have correct data in the buffer up to 'skipped_data'. The
+ following memcpy() will copy in the data from the internal buffer
+ that was not yet on disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can use the copy in translog_page_validator() because it
does not put it anywhere permanently.
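
The read path above has a simple counterpart: fetch the on-disk head, then overlay the in-memory tail. A sketch, with read_page_head() as an assumed stand-in for the pagecache_read() call:

#include <string.h>
#include <stdint.h>

#define PAGE_SIZE 8192  /* stands in for TRANSLOG_PAGE_SIZE */

extern int read_page_head(unsigned char *dst, uint32_t bytes); /* assumed */

/* dst ends up with the head from disk and the tail from memory. */
static int reconstruct_page(unsigned char *dst, const unsigned char *mem,
                            uint32_t skipped)
{
  if (skipped && read_page_head(dst, skipped))
    return 1;                               /* disk part (head) */
  memcpy(dst + skipped, mem + skipped,
         PAGE_SIZE - skipped);              /* memory part (tail) */
  return 0;
}
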
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3483,7 +3600,10 @@
my_bool version_changed= 0;
DBUG_ENTER("translog_init_with_table");
+ translog_syncs= 0;
+ flush_start= 0;
id_to_share= NULL;
+
log_descriptor.directory_fd= -1;
log_descriptor.is_everything_flushed= 1;
log_descriptor.flush_in_progress= 0;
@@ -3511,6 +3631,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4033,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4115,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4194,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4284,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7013,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7484,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7515,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7639,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change when we add more data to the page, so
+ we do not copy it now and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7701,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7713,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() range of files (inclusive) and directory (by request)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir whether the log directory needs to be sync()ed
+
+ @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ @brief Flushes buffers that contain LSNs less than or equal to
+ address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed,
+ can be reset to the real last LSN address
+ @param sent_to_disk returns the 'sent to disk' position
+ @param flush_horizon returns the horizon of the flush
+
+ @note For terminology, see the comment for translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck information when will lock buffers one by
@@ -7656,15 +7809,15 @@
/*
if LSN up to which we have to flush is bigger than maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7829,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7852,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits are made in passes. The thread that
+ starts a flush first performs the actual flush; other threads set a
+ new goal (LSN) for the next pass (if it is the maximum) and wait for
+ the pass to end, or just wait for the pass to end.
+
+ - If hard group commit is enabled and the rate is set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as new LSNs are added. The process cannot loop forever because
+ we have a limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit is switched ON and less than 'rate' microseconds
+ have passed since the last sync, then after the buffers have been sent
+ to disk we wait until 'rate' microseconds have passed since the last
+ sync, then sync and return.
+ This ensures that if we call sync infrequently we don't do any waits.
+
+ - If soft group commit is enabled, everything works as with 'non group
+ commit', but the thread doesn't do any real sync(). If the rate is not
+ zero, the sync() will be performed by a service thread at the given
+ rate when needed (when a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() releases log_flush_lock while it is
+ waiting, then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check the time here because pthread_mutex_lock rarely takes
+ a lot of time, so we can sacrifice a bit of precision for performance
+ (taking into account that my_micro_time() might be an expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+    /* prevent other threads from continuing */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+    sync() files from the previous flush up to the current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+    /* keep values for soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
@@ -8113,6 +8453,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8462,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
@@ -8356,6 +8712,159 @@
}
+
+/**
+ Sets soft sync mode
+
+  @param mode TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+  @param mode TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("ma_translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+  @note We use this function together with an additional variable because we
+  have to restart the service thread with the new value, which we cannot do
+  inside the variable update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-12 06:52:47 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND
[Maria-developers] Rev 2756: BUG#31480: Incorrect result for nested subquery when executed via semi join in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 12 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2756
revision-id: psergey(a)askmonty.org-20100211235958-p11o4e80dlrn2bsq
parent: psergey(a)askmonty.org-20100211223118-5fzuidow1pkubpzl
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 02:59:58 +0300
message:
BUG#31480: Incorrect result for nested subquery when executed via semi join
- Variant #3 of the fix. It also
= Unifies code with table elimination's
= is able to handle FROM-subquery pullout.
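For reference, a FROM-subquery pullout case of the kind mentioned involves
queries of roughly this shape (a sketch with hypothetical tables, not a test
case from this patch):

create table t1 (a int);
create table t2 (a int, b int);
create table t3 (e int);
# The IN-subquery sits inside a derived table; once the derived table
# is merged into the parent, the semi-join must be pulled out as well:
select t1.a
from t1, (select a, b from t2 where b in (select e from t3)) as dt
where t1.a = dt.a;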
=== modified file 'mysql-test/r/subselect_sj.result'
--- a/mysql-test/r/subselect_sj.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect_sj.result 2010-02-11 23:59:58 +0000
@@ -779,3 +779,48 @@
1 PRIMARY it2 ALL NULL NULL NULL NULL 20 Using where; End temporary
DROP TABLE ot1, it1, it2;
# End of BUG#38075
+#
+# BUG#31480: Incorrect result for nested subquery when executed via semi join
+#
+create table t1 (a int not null, b int not null);
+create table t2 (c int not null, d int not null);
+create table t3 (e int not null);
+insert into t1 values (1,10);
+insert into t1 values (2,10);
+insert into t1 values (1,20);
+insert into t1 values (2,20);
+insert into t1 values (3,20);
+insert into t1 values (2,30);
+insert into t1 values (4,40);
+insert into t2 values (2,10);
+insert into t2 values (2,20);
+insert into t2 values (4,10);
+insert into t2 values (5,10);
+insert into t2 values (3,20);
+insert into t2 values (2,40);
+insert into t3 values (10);
+insert into t3 values (30);
+insert into t3 values (10);
+insert into t3 values (20);
+explain extended
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY t2 ALL NULL NULL NULL NULL 6 100.00 Start temporary
+1 PRIMARY t1 ALL NULL NULL NULL NULL 7 100.00 Using where; End temporary; Using join buffer
+3 DEPENDENT SUBQUERY t3 ALL NULL NULL NULL NULL 4 100.00 Using where
+Warnings:
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+show warnings;
+Level Code Message
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+a
+2
+2
+3
+2
+drop table t1, t2, t3;
=== modified file 'mysql-test/r/subselect_sj_jcl6.result'
--- a/mysql-test/r/subselect_sj_jcl6.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect_sj_jcl6.result 2010-02-11 23:59:58 +0000
@@ -783,6 +783,51 @@
1 PRIMARY it2 ALL NULL NULL NULL NULL 20 Using where; End temporary; Using join buffer
DROP TABLE ot1, it1, it2;
# End of BUG#38075
+#
+# BUG#31480: Incorrect result for nested subquery when executed via semi join
+#
+create table t1 (a int not null, b int not null);
+create table t2 (c int not null, d int not null);
+create table t3 (e int not null);
+insert into t1 values (1,10);
+insert into t1 values (2,10);
+insert into t1 values (1,20);
+insert into t1 values (2,20);
+insert into t1 values (3,20);
+insert into t1 values (2,30);
+insert into t1 values (4,40);
+insert into t2 values (2,10);
+insert into t2 values (2,20);
+insert into t2 values (4,10);
+insert into t2 values (5,10);
+insert into t2 values (3,20);
+insert into t2 values (2,40);
+insert into t3 values (10);
+insert into t3 values (30);
+insert into t3 values (10);
+insert into t3 values (20);
+explain extended
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY t2 ALL NULL NULL NULL NULL 6 100.00 Start temporary
+1 PRIMARY t1 ALL NULL NULL NULL NULL 7 100.00 Using where; End temporary; Using join buffer
+3 DEPENDENT SUBQUERY t3 ALL NULL NULL NULL NULL 4 100.00 Using where
+Warnings:
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+show warnings;
+Level Code Message
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+a
+2
+2
+3
+2
+drop table t1, t2, t3;
set join_cache_level=default;
show variables like 'join_cache_level';
Variable_name Value
=== modified file 'mysql-test/t/subselect_sj.test'
--- a/mysql-test/t/subselect_sj.test 2010-01-17 14:51:10 +0000
+++ b/mysql-test/t/subselect_sj.test 2010-02-11 23:59:58 +0000
@@ -681,3 +681,41 @@
DROP TABLE ot1, it1, it2;
--echo # End of BUG#38075
+
+--echo #
+--echo # BUG#31480: Incorrect result for nested subquery when executed via semi join
+--echo #
+create table t1 (a int not null, b int not null);
+create table t2 (c int not null, d int not null);
+create table t3 (e int not null);
+
+insert into t1 values (1,10);
+insert into t1 values (2,10);
+insert into t1 values (1,20);
+insert into t1 values (2,20);
+insert into t1 values (3,20);
+insert into t1 values (2,30);
+insert into t1 values (4,40);
+
+insert into t2 values (2,10);
+insert into t2 values (2,20);
+insert into t2 values (4,10);
+insert into t2 values (5,10);
+insert into t2 values (3,20);
+insert into t2 values (2,40);
+
+insert into t3 values (10);
+insert into t3 values (30);
+insert into t3 values (10);
+insert into t3 values (20);
+
+explain extended
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+show warnings;
+
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+
+drop table t1, t2, t3;
+
=== modified file 'sql/item.cc'
--- a/sql/item.cc 2010-02-11 22:00:36 +0000
+++ b/sql/item.cc 2010-02-11 23:59:58 +0000
@@ -3647,7 +3647,7 @@
substitution)
*/
-static void mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
+static bool mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
Item_ident *resolved_item,
Item_ident *mark_item)
{
@@ -3658,7 +3658,9 @@
/* store pointer on SELECT_LEX from which item is dependent */
if (mark_item)
mark_item->depended_from= last;
- current->mark_as_dependent(last, resolved_item);
+ if (current->mark_as_dependent(thd, last, /** resolved_item psergey-thu
+ **/mark_item))
+ return TRUE;
if (thd->lex->describe & DESCRIBE_EXTENDED)
{
push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
@@ -3668,6 +3670,7 @@
resolved_item->field_name,
current->select_number, last->select_number);
}
+ return FALSE;
}
@@ -4119,6 +4122,7 @@
((ref_type == REF_ITEM || ref_type == FIELD_ITEM) ?
(Item_ident*) (*reference) :
0));
+
/*
A reference to a view field had been found and we
substituted it instead of this Item (find_field_in_tables
@@ -4218,7 +4222,7 @@
return -1;
mark_as_dependent(thd, last_checked_context->select_lex,
- context->select_lex, this,
+ context->select_lex, rf,
rf);
return 0;
}
@@ -5998,7 +6002,7 @@
goto error;
thd->change_item_tree(reference, fld);
mark_as_dependent(thd, last_checked_context->select_lex,
- thd->lex->current_select, this, fld);
+ thd->lex->current_select, fld, fld);
/*
A reference is resolved to a nest level that's outer or the same as
the nest level of the enclosing set function : adjust the value of
@@ -6438,7 +6442,7 @@
if (depended_from == new_parent)
{
*ref= outer_ref;
- outer_ref->fix_after_pullout(new_parent, ref);
+ (*ref)->fix_after_pullout(new_parent, ref);
}
}
=== modified file 'sql/item.h'
--- a/sql/item.h 2010-02-11 22:00:36 +0000
+++ b/sql/item.h 2010-02-11 23:59:58 +0000
@@ -1115,7 +1115,9 @@
/*
- Class to be used to enumerate all field references in an item tree.
+ Class to be used to enumerate all field references in an item tree. This
+ includes references to outside but not fields of the tables within a
+ subquery.
Suggested usage:
class My_enumerator : public Field_enumerator
@@ -2377,6 +2379,8 @@
}
bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
{ return (*ref)->walk(processor, walk_subquery, arg); }
+ bool enumerate_field_refs_processor(uchar *arg)
+ { return (*ref)->enumerate_field_refs_processor(arg); }
virtual void print(String *str, enum_query_type query_type);
bool result_as_longlong()
{
=== modified file 'sql/item_subselect.cc'
--- a/sql/item_subselect.cc 2010-01-28 13:48:33 +0000
+++ b/sql/item_subselect.cc 2010-02-11 23:59:58 +0000
@@ -39,8 +39,8 @@
Item_subselect::Item_subselect():
Item_result_field(), value_assigned(0), thd(0), substitution(0),
engine(0), old_engine(0), used_tables_cache(0), have_to_be_excluded(0),
- const_item_cache(1), in_fix_fields(0), engine_changed(0), changed(0),
- is_correlated(FALSE)
+ const_item_cache(1), inside_first_fix_fields(0), done_first_fix_fields(FALSE),
+ engine_changed(0), changed(0), is_correlated(FALSE)
{
with_subselect= 1;
reset();
@@ -167,18 +167,23 @@
DBUG_ASSERT(fixed == 0);
engine->set_thd((thd= thd_param));
- if (!in_fix_fields)
- refers_to.empty();
+ if (!done_first_fix_fields)
+ {
+ done_first_fix_fields= TRUE;
+ inside_first_fix_fields= TRUE;
+ }
+
eliminated= FALSE;
+ parent_select= thd_param->lex->current_select;
if (check_stack_overrun(thd, STACK_MIN_SIZE, (uchar*)&res))
return TRUE;
- in_fix_fields++;
res= engine->prepare();
// all transformation is done (used by prepared statements)
changed= 1;
+ inside_first_fix_fields= FALSE;
if (!res)
{
@@ -210,14 +215,12 @@
if (!(*ref)->fixed)
ret= (*ref)->fix_fields(thd, ref);
thd->where= save_where;
- in_fix_fields--;
return ret;
}
// Is it one field subselect?
if (engine->cols() > max_columns)
{
my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
- in_fix_fields--;
return TRUE;
}
fix_length_and_dec();
@@ -234,7 +237,6 @@
fixed= 1;
err:
- in_fix_fields--;
thd->where= save_where;
return res;
}
@@ -242,11 +244,12 @@
bool Item_subselect::enumerate_field_refs_processor(uchar *arg)
{
- List_iterator<Item> it(refers_to);
- Item *item;
- while ((item= it++))
+ List_iterator<Ref_to_outside> it(upper_refs);
+ Ref_to_outside *upper;
+
+ while ((upper= it++))
{
- if (item->walk(&Item::enumerate_field_refs_processor, FALSE, arg))
+ if (upper->item->walk(&Item::enumerate_field_refs_processor, FALSE, arg))
return TRUE;
}
return FALSE;
@@ -258,6 +261,142 @@
return FALSE;
}
+
+bool Item_subselect::mark_as_dependent(THD *thd, st_select_lex *select,
+ Item *item)
+{
+ if (inside_first_fix_fields)
+ {
+ is_correlated= TRUE;
+ Ref_to_outside *upper;
+ if (!(upper= new (thd->stmt_arena->mem_root) Ref_to_outside()))
+ return TRUE;
+ upper->select= select;
+ upper->item= item;
+ if (upper_refs.push_back(upper, thd->stmt_arena->mem_root))
+ return TRUE;
+ }
+ return FALSE;
+}
+
+/*
+ Adjust attributes after our parent select has been merged into grandparent
+
+ DESCRIPTION
+    A subquery is a composite object which may be correlated, that is, it may
+ have
+ 1. references to tables of the parent select (i.e. one that has the clause
+ with the subquery predicate)
+ 2. references to tables of the grandparent select
+ 3. references to tables of further ancestors.
+
+ Before the pullout, this item indicates:
+ - #1 with table bits in used_tables()
+ - #2 and #3 with OUTER_REF_TABLE_BIT.
+
+ After parent has been merged with grandparent:
+ - references to parent and grandparent tables should be indicated with
+ table bits.
+ - references to greatgrandparent and further ancestors - with
+ OUTER_REF_TABLE_BIT.
+*/
+
+void Item_subselect::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+ recalc_used_tables(new_parent, TRUE);
+ parent_select= new_parent;
+}
+
+class Field_fixer: public Field_enumerator
+{
+public:
+ table_map used_tables; /* Collect used_tables here */
+ st_select_lex *new_parent; /* Select we're in */
+ virtual void visit_field(Field *field)
+ {
+ //for (TABLE_LIST *tbl= new_parent->leaf_tables; tbl; tbl= tbl->next_local)
+ //{
+ // if (tbl->table == field->table)
+ // {
+ used_tables|= field->table->map;
+ // return;
+ // }
+ //}
+ //used_tables |= OUTER_REF_TABLE_BIT;
+ }
+};
+
+
+/*
+ Recalculate used_tables_cache
+*/
+
+void Item_subselect::recalc_used_tables(st_select_lex *new_parent,
+ bool after_pullout)
+{
+ List_iterator<Ref_to_outside> it(upper_refs);
+ Ref_to_outside *upper;
+
+ used_tables_cache= 0;
+ while ((upper= it++))
+ {
+ bool found= FALSE;
+ /*
+ Check if
+ 1. the upper reference refers to the new immediate parent select, or
+ 2. one of the further ancestors.
+
+ We rely on the fact that the tree of selects is modified by some kind of
+ 'flattening', i.e. a process where child selects are merged into their
+ parents.
+ The merged selects are removed from the select tree but keep pointers to
+ their parents.
+ */
+ for (st_select_lex *sel= upper->select; sel; sel= sel->outer_select())
+ {
+ /*
+ If we've reached the new parent select by walking upwards from
+        the reference's original select, this means that the reference is now
+ referring to the direct parent:
+ */
+ if (sel == new_parent)
+ {
+ found= TRUE;
+ /*
+ upper->item may be NULL when we've referred to a grouping function,
+          in which case we don't care what its table_map really is,
+ because item->with_sum_func==1 will ensure correct placement of the
+ item.
+ */
+ if (upper->item)
+ {
+ // Now, iterate over fields and collect used_tables() attribute:
+ Field_fixer fixer;
+ fixer.used_tables= 0;
+ fixer.new_parent= new_parent;
+ upper->item->walk(&Item::enumerate_field_refs_processor, FALSE,
+ (uchar*)&fixer);
+ used_tables_cache |= fixer.used_tables;
+ /*
+ if (after_pullout)
+ upper->item->fix_after_pullout(new_parent, &(upper->item));
+ upper->item->update_used_tables();
+ used_tables_cache |= upper->item->used_tables();
+ */
+ }
+ }
+ }
+ if (!found)
+ used_tables_cache|= OUTER_REF_TABLE_BIT;
+ }
+ /*
+ Don't update const_tables_cache yet as we don't yet know which of the
+    parent's tables are constant. The parent will call update_used_tables() after
+    it has done const table detection, and that will be our chance to update
+ const_tables_cache.
+ */
+}
+
bool Item_subselect::walk(Item_processor processor, bool walk_subquery,
uchar *argument)
{
@@ -397,6 +536,7 @@
void Item_subselect::update_used_tables()
{
+ recalc_used_tables(parent_select, FALSE);
if (!engine->uncacheable())
{
// did all used tables become static?
@@ -1843,6 +1983,18 @@
return result || Item_subselect::fix_fields(thd_arg, ref);
}
+void Item_in_subselect::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+ left_expr->fix_after_pullout(new_parent, &left_expr);
+ Item_subselect::fix_after_pullout(new_parent, ref);
+}
+
+void Item_in_subselect::update_used_tables()
+{
+ Item_subselect::update_used_tables();
+ left_expr->update_used_tables();
+ used_tables_cache |= left_expr->used_tables();
+}
/**
Try to create an engine to compute the subselect via materialization,
=== modified file 'sql/item_subselect.h'
--- a/sql/item_subselect.h 2010-01-28 13:48:33 +0000
+++ b/sql/item_subselect.h 2010-02-11 23:59:58 +0000
@@ -67,14 +67,32 @@
bool have_to_be_excluded;
/* cache of constant state */
bool const_item_cache;
-
+
+ bool inside_first_fix_fields;
+ bool done_first_fix_fields;
public:
- /*
- References from inside the subquery to the select that this predicate is
- in. References to parent selects not included.
+ /* A reference from inside subquery predicate to somewhere outside of it */
+ class Ref_to_outside : public Sql_alloc
+ {
+ public:
+ st_select_lex *select; /* Select where the reference is pointing to */
+ /*
+      What is being referred to. This may be NULL when we're referring to an
+ aggregate function.
+ */
+ Item *item;
+ };
+ /*
+ References from within this subquery to somewhere outside of it (i.e. to
+ parent select, grandparent select, etc)
*/
- List<Item> refers_to;
- int in_fix_fields;
+ List<Ref_to_outside> upper_refs;
+ st_select_lex *parent_select;
+
+ /*
+ TRUE<=>Table Elimination has made it redundant to evaluate this select
+ (and so it is not part of QEP, etc)
+ */
bool eliminated;
/* changed engine indicator */
@@ -117,6 +135,9 @@
return null_value;
}
bool fix_fields(THD *thd, Item **ref);
+ bool mark_as_dependent(THD *thd, st_select_lex *select, Item *item);
+ void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+ void recalc_used_tables(st_select_lex *new_parent, bool after_pullout);
virtual bool exec();
virtual void fix_length_and_dec();
table_map used_tables() const;
@@ -396,6 +417,8 @@
bool test_limit(st_select_lex_unit *unit);
virtual void print(String *str, enum_query_type query_type);
bool fix_fields(THD *thd, Item **ref);
+ void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+ void update_used_tables();
bool setup_engine();
bool init_left_expr_cache();
bool is_expensive_processor(uchar *arg);
=== modified file 'sql/item_sum.cc'
--- a/sql/item_sum.cc 2009-10-15 21:38:29 +0000
+++ b/sql/item_sum.cc 2010-02-11 23:59:58 +0000
@@ -350,7 +350,7 @@
sl= sl->master_unit()->outer_select() )
sl->master_unit()->item->with_sum_func= 1;
}
- thd->lex->current_select->mark_as_dependent(aggr_sel, NULL);
+ thd->lex->current_select->mark_as_dependent(thd, aggr_sel, NULL);
return FALSE;
}
=== modified file 'sql/sql_lex.cc'
--- a/sql/sql_lex.cc 2010-01-28 13:48:33 +0000
+++ b/sql/sql_lex.cc 2010-02-11 23:59:58 +0000
@@ -1841,9 +1841,8 @@
'last' should be reachable from this st_select_lex_node
*/
-void st_select_lex::mark_as_dependent(st_select_lex *last, Item *dependency)
+bool st_select_lex::mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency)
{
- SELECT_LEX *next_to_last;
/*
Mark all selects from resolved to 1 before select where was
found table as depended (of select where was found table)
@@ -1867,12 +1866,15 @@
sl->uncacheable|= UNCACHEABLE_UNITED;
}
}
- next_to_last= s;
+
+ Item_subselect *subquery_expr= s->master_unit()->item;
+ if (subquery_expr && subquery_expr->mark_as_dependent(thd, last,
+ dependency))
+ return TRUE;
}
is_correlated= TRUE;
this->master_unit()->item->is_correlated= TRUE;
- if (dependency)
- next_to_last->master_unit()->item->refers_to.push_back(dependency);
+ return FALSE;
}
bool st_select_lex_node::set_braces(bool value) { return 1; }
=== modified file 'sql/sql_lex.h'
--- a/sql/sql_lex.h 2010-01-28 13:48:33 +0000
+++ b/sql/sql_lex.h 2010-02-11 23:59:58 +0000
@@ -747,7 +747,7 @@
return master_unit()->return_after_parsing();
}
- void mark_as_dependent(st_select_lex *last, Item *dependency);
+ bool mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency);
bool set_braces(bool value);
bool inc_in_sum_expr();
[Maria-developers] Rev 2755: Subquery optimizations: backport: enable disabled subquery code in BKA in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2755
revision-id: psergey(a)askmonty.org-20100211223118-5fzuidow1pkubpzl
parent: psergey(a)askmonty.org-20100211220036-qh3iw743tbgwpzax
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 01:31:18 +0300
message:
Subquery optimizations: backport: enable disabled subquery code in BKA
=== modified file 'sql/sql_select.h'
--- a/sql/sql_select.h 2010-01-28 13:48:33 +0000
+++ b/sql/sql_select.h 2010-02-11 22:31:18 +0000
@@ -282,13 +282,11 @@
}
bool check_rowid_field()
{
-/* !!!NB igor: enable the code in this comment after backporting the SJ code
if (keep_current_rowid && !used_rowid_fields)
{
used_rowid_fields= 1;
used_fieldlength+= table->file->ref_length;
}
-*/
return test(used_rowid_fields);
}
bool is_inner_table_of_semi_join_with_first_match()
[Maria-developers] Rev 2754: Subquery optimizations: backport in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2754
revision-id: psergey(a)askmonty.org-20100211220036-qh3iw743tbgwpzax
parent: psergey(a)askmonty.org-20100211215932-qi36vl0i3zkl86bv
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 01:00:36 +0300
message:
Subquery optimizations: backport
- Fix valgrind failure: do initialize Item::is_expensive_cache.
=== modified file 'sql/item.cc'
--- a/sql/item.cc 2010-01-17 14:55:08 +0000
+++ b/sql/item.cc 2010-02-11 22:00:36 +0000
@@ -373,8 +373,8 @@
Item::Item():
- rsize(0), name(0), orig_name(0), name_length(0), fixed(0),
- is_autogenerated_name(TRUE),
+ is_expensive_cache(-1), rsize(0), name(0), orig_name(0), name_length(0),
+ fixed(0), is_autogenerated_name(TRUE),
collation(&my_charset_bin, DERIVATION_COERCIBLE)
{
marker= 0;
@@ -410,6 +410,7 @@
tables.
*/
Item::Item(THD *thd, Item *item):
+ is_expensive_cache(-1),
rsize(0),
str_value(item->str_value),
name(item->name),
=== modified file 'sql/item.h'
--- a/sql/item.h 2010-01-28 13:48:33 +0000
+++ b/sql/item.h 2010-02-11 22:00:36 +0000
@@ -513,6 +513,9 @@
enum traverse_order { POSTFIX, PREFIX };
+ /* Cache of the result of is_expensive(). */
+ int8 is_expensive_cache;
+
/* Reuse size, only used by SP local variable assignment, otherwize 0 */
uint rsize;
@@ -878,9 +881,6 @@
static CHARSET_INFO *default_charset();
virtual CHARSET_INFO *compare_collation() { return NULL; }
- /* Cache of the result of is_expensive(). */
- int8 is_expensive_cache;
-
virtual bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
{
return (this->*processor)(arg);
[Maria-developers] Rev 2753: Subquery optimizations backport: Update test results (checked) in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2753
revision-id: psergey(a)askmonty.org-20100211215932-qi36vl0i3zkl86bv
parent: psergey(a)askmonty.org-20100211215823-63ikirl70ztmlk05
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:59:32 +0300
message:
Subquery optimizations backport: Update test results (checked)
=== modified file 'mysql-test/r/join_cache.result'
--- a/mysql-test/r/join_cache.result 2009-12-21 02:26:15 +0000
+++ b/mysql-test/r/join_cache.result 2010-02-11 21:59:32 +0000
@@ -1028,8 +1028,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1343,8 +1343,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1658,8 +1658,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1973,8 +1973,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2292,8 +2292,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2514,8 +2514,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2736,8 +2736,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2958,8 +2958,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
=== modified file 'mysql-test/r/type_datetime.result'
--- a/mysql-test/r/type_datetime.result 2009-02-13 18:07:03 +0000
+++ b/mysql-test/r/type_datetime.result 2010-02-11 21:59:32 +0000
@@ -514,10 +514,9 @@
where id in (select id from t1 as x1 where (t1.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
Warnings:
Note 1276 Field or reference 'test.t1.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select '1' AS `id`,'2007-04-25 18:30:22' AS `cur_date` from `test`.`t1` where <in_optimizer>('1',<exists>(select 1 AS `Not_used` from `test`.`t1` `x1` where 0))
+Note 1003 select '1' AS `id`,'2007-04-25 18:30:22' AS `cur_date` from `test`.`t1` `x1` join `test`.`t1` where (('2007-04-25 18:30:22' = 0))
select * from t1
where id in (select id from t1 as x1 where (t1.cur_date is null));
id cur_date
@@ -526,10 +525,9 @@
where id in (select id from t2 as x1 where (t2.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
Warnings:
Note 1276 Field or reference 'test.t2.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select '1' AS `id`,'2007-04-25' AS `cur_date` from `test`.`t2` where <in_optimizer>('1',<exists>(select 1 AS `Not_used` from `test`.`t2` `x1` where 0))
+Note 1003 select '1' AS `id`,'2007-04-25' AS `cur_date` from `test`.`t2` `x1` join `test`.`t2` where (('2007-04-25' = 0))
select * from t2
where id in (select id from t2 as x1 where (t2.cur_date is null));
id cur_date
@@ -540,10 +538,10 @@
where id in (select id from t1 as x1 where (t1.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 2 100.00 Using where
-2 DEPENDENT SUBQUERY x1 ALL NULL NULL NULL NULL 2 100.00 Using where
+1 PRIMARY x1 ALL NULL NULL NULL NULL 2 100.00 Using where; FirstMatch(t1)
Warnings:
Note 1276 Field or reference 'test.t1.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`id` AS `id`,`test`.`t1`.`cur_date` AS `cur_date` from `test`.`t1` where <in_optimizer>(`test`.`t1`.`id`,<exists>(select 1 AS `Not_used` from `test`.`t1` `x1` where ((`test`.`t1`.`cur_date` = 0) and (<cache>(`test`.`t1`.`id`) = `test`.`x1`.`id`))))
+Note 1003 select `test`.`t1`.`id` AS `id`,`test`.`t1`.`cur_date` AS `cur_date` from `test`.`t1` semi join (`test`.`t1` `x1`) where ((`test`.`x1`.`id` = `test`.`t1`.`id`) and (`test`.`t1`.`cur_date` = 0))
select * from t1
where id in (select id from t1 as x1 where (t1.cur_date is null));
id cur_date
@@ -552,10 +550,10 @@
where id in (select id from t2 as x1 where (t2.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 ALL NULL NULL NULL NULL 2 100.00 Using where
-2 DEPENDENT SUBQUERY x1 ALL NULL NULL NULL NULL 2 100.00 Using where
+1 PRIMARY x1 ALL NULL NULL NULL NULL 2 100.00 Using where; FirstMatch(t2)
Warnings:
Note 1276 Field or reference 'test.t2.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t2`.`id` AS `id`,`test`.`t2`.`cur_date` AS `cur_date` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`id`,<exists>(select 1 AS `Not_used` from `test`.`t2` `x1` where ((`test`.`t2`.`cur_date` = 0) and (<cache>(`test`.`t2`.`id`) = `test`.`x1`.`id`))))
+Note 1003 select `test`.`t2`.`id` AS `id`,`test`.`t2`.`cur_date` AS `cur_date` from `test`.`t2` semi join (`test`.`t2` `x1`) where ((`test`.`x1`.`id` = `test`.`t2`.`id`) and (`test`.`t2`.`cur_date` = 0))
select * from t2
where id in (select id from t2 as x1 where (t2.cur_date is null));
id cur_date
[Maria-developers] Rev 2752: Apply Jorgen Loland's fix: Bug#45221: Query "SELECT pk FROM C WHERE pk IN (SELECT int_key)" failing in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2752
revision-id: psergey(a)askmonty.org-20100211215823-63ikirl70ztmlk05
parent: psergey(a)askmonty.org-20100211215602-irdyu314ddwew1xd
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:58:23 +0300
message:
Apply Jorgen Loland's fix: Bug#45221: Query "SELECT pk FROM C WHERE pk IN (SELECT int_key)" failing
XOR conditions are not optimized, and Item_cond_xor therefore
acts like a FUNC_ITEM even though it inherits from Item_cond.
A subtle difference between Item_func and Item_cond is that
you can get the children Items from the former by calling
arguments(), and from the latter by calling argument_list().
However, since Item_cond_xor inherits from Item_cond,
arguments() did not return any Items.
The fact that Item_cond_xor::arguments() did not return its
child Items led to a problem for make_cond_for_index();
the method accepted XOR items on unindexed columns being
pushed using ICP. ICP evaluation of non-indexed columns
does not (and should not) work.
The fix for this bug is to make Item_cond_xor return its
child Items when the arguments() method is used. This makes
Item_cond_xor behave more like Item_func and in turn allows
make_cond_for_index() to discover any conflicting children
Items.
This is a temporary fix and should be removed when Item_cond_xor
is optimized.
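For illustration, a query of the shape this bug affected might look as
follows (a sketch with hypothetical tables, not the bug's actual test case):

create table t1 (pk int primary key, a int, b int);
create table t2 (int_key int, key(int_key));
# a and b are not indexed, so the XOR condition below must not be
# pushed down to the index as an index condition (ICP):
select pk from t1 where pk in (select int_key from t2) and (a xor b);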
=== modified file 'mysql-test/r/group_by.result'
--- a/mysql-test/r/group_by.result 2009-02-26 17:17:06 +0000
+++ b/mysql-test/r/group_by.result 2010-02-11 21:58:23 +0000
@@ -1542,8 +1542,8 @@
EXPLAIN SELECT 1 FROM t1 WHERE a IN
(SELECT a FROM t1 USE INDEX (i2) IGNORE INDEX (i2));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 index NULL PRIMARY 4 NULL 144 Using where; Using index
-2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 144 Using where
+1 PRIMARY t1 index PRIMARY,i2 PRIMARY 4 NULL 144 Using index
+1 PRIMARY t1 ALL NULL NULL NULL NULL 144 Using where; FirstMatch(t1)
CREATE TABLE t2 (a INT, b INT, KEY(a));
INSERT INTO t2 VALUES (1, 1), (2, 2), (3,3), (4,4);
EXPLAIN SELECT a, SUM(b) FROM t2 GROUP BY a LIMIT 2;
@@ -1555,8 +1555,8 @@
EXPLAIN SELECT 1 FROM t2 WHERE a IN
(SELECT a FROM t1 USE INDEX (i2) IGNORE INDEX (i2));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t2 index NULL a 5 NULL 4 Using where; Using index
-2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 144 Using where
+1 PRIMARY t2 index a a 5 NULL 4 Using index
+1 PRIMARY t1 ALL NULL NULL NULL NULL 144 Using where; FirstMatch(t2)
SHOW VARIABLES LIKE 'old';
Variable_name Value
old OFF
=== modified file 'sql/item_cmpfunc.h'
--- a/sql/item_cmpfunc.h 2010-01-17 14:55:08 +0000
+++ b/sql/item_cmpfunc.h 2010-02-11 21:58:23 +0000
@@ -1715,14 +1715,34 @@
class Item_cond_xor :public Item_cond
{
public:
- Item_cond_xor() :Item_cond() {}
- Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2) {}
+ Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2)
+ {
+ /*
+ Items must be stored in args[] as well because this Item_cond is
+ treated as a FUNC_ITEM (see type()). I.e., users of it will get
+    its children by calling arguments(), not argument_list(). This
+ is a temporary solution until XOR is optimized and treated like
+ a full Item_cond citizen.
+ */
+ arg_count= 2;
+ args= tmp_arg;
+ args[0]= i1;
+ args[1]= i2;
+ }
enum Functype functype() const { return COND_XOR_FUNC; }
/* TODO: remove the next line when implementing XOR optimization */
enum Type type() const { return FUNC_ITEM; }
longlong val_int();
const char *func_name() const { return "xor"; }
void top_level_item() {}
+ /* Since child Items are stored in args[], Items cannot be added.
+ However, since Item_cond_xor is treated as a FUNC_ITEM (see
+ type()), the methods below should never be called.
+ */
+ bool add(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+ bool add_at_head(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+ bool add_at_head(List<Item> *nlist) { DBUG_ASSERT(FALSE); return FALSE; }
+ void copy_andor_arguments(THD *thd, Item_cond *item) { DBUG_ASSERT(FALSE); }
};
[Maria-developers] Rev 2751: Subquery backport: Update test results (checked) in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2751
revision-id: psergey(a)askmonty.org-20100211215602-irdyu314ddwew1xd
parent: psergey(a)askmonty.org-20100211215456-u85owf67gwqkkss5
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:56:02 +0300
message:
Subquery backport: Update test results (checked)
=== modified file 'mysql-test/r/explain.result'
--- a/mysql-test/r/explain.result 2009-12-15 07:16:46 +0000
+++ b/mysql-test/r/explain.result 2010-02-11 21:56:02 +0000
@@ -171,7 +171,7 @@
EXPLAIN SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN (SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.dt IS NULL );
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY OUTR ALL NULL NULL NULL NULL 2 Using where
-2 DEPENDENT SUBQUERY INNR ALL NULL NULL NULL NULL 2 Using where
+1 PRIMARY INNR ALL NULL NULL NULL NULL 2 Using where; FirstMatch(OUTR)
flush tables;
SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN (SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.dt IS NULL );
dt
@@ -179,7 +179,7 @@
EXPLAIN SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN ( SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.t < '2005-11-13 7:41:31' );
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY OUTR ALL NULL NULL NULL NULL 2 Using where
-2 DEPENDENT SUBQUERY INNR ALL NULL NULL NULL NULL 2 Using where
+1 PRIMARY INNR ALL NULL NULL NULL NULL 2 Using where; FirstMatch(OUTR)
flush tables;
SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN ( SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.t < '2005-11-13 7:41:31' );
dt
=== modified file 'mysql-test/r/group_min_max.result'
--- a/mysql-test/r/group_min_max.result 2009-08-30 07:03:37 +0000
+++ b/mysql-test/r/group_min_max.result 2010-02-11 21:56:02 +0000
@@ -2256,7 +2256,7 @@
a IN (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1_outer index NULL a 10 NULL 15 Using where; Using index
-2 DEPENDENT SUBQUERY t1 index NULL a 10 NULL 1 Using index
+2 SUBQUERY t1 range NULL a 5 NULL 8 Using index for group-by
EXPLAIN SELECT 1 FROM t1 AS t1_outer GROUP BY a HAVING
a > (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
=== modified file 'mysql-test/r/subselect3_jcl6.result'
--- a/mysql-test/r/subselect3_jcl6.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect3_jcl6.result 2010-02-11 21:56:02 +0000
@@ -1140,7 +1140,7 @@
flush status;
select count(*) from t0 A, t0 B, t0 C, t0 D where D.a in (select a from t1 E);
count(*)
-4999
+5000
show status like 'Created_tmp_disk_tables';
Variable_name Value
Created_tmp_disk_tables 1
[Maria-developers] Rev 2750: Subquery optimization backport: Duplicate Elimination: in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2750
revision-id: psergey(a)askmonty.org-20100211215456-u85owf67gwqkkss5
parent: psergey(a)askmonty.org-20100128134833-9000udjp5wa3tsff
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:54:56 +0300
message:
Subquery optimization backport: Duplicate Elimination:
process temporary table overflow correctly.
=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc 2010-01-28 13:48:33 +0000
+++ b/sql/sql_select.cc 2010-02-11 21:54:56 +0000
@@ -16250,12 +16250,12 @@
if (error)
{
/* create_internal_tmp_table_from_heap will generate error if needed */
- if (sjtbl->tmp_table->file->is_fatal_error(error, HA_CHECK_DUP) &&
- create_internal_tmp_table_from_heap(thd, sjtbl->tmp_table,
+ if (!sjtbl->tmp_table->file->is_fatal_error(error, HA_CHECK_DUP))
+ DBUG_RETURN(1); /* Duplicate */
+ if (create_internal_tmp_table_from_heap(thd, sjtbl->tmp_table,
sjtbl->start_recinfo,
&sjtbl->recinfo, error, 1))
DBUG_RETURN(-1);
- DBUG_RETURN(1);
}
DBUG_RETURN(0);
}
[Maria-developers] [Branch ~maria-captains/maria/5.1] Rev 2816: Fix for LPBug#520243: usability bug of thread pool configuration
by noreply@launchpad.net 11 Feb '10
------------------------------------------------------------
revno: 2816
committer: Michael Widenius <monty(a)askmonty.org>
branch nick: maria-5.1
timestamp: Thu 2010-02-11 21:15:24 +0200
message:
Fix for LPBug#520243: usability bug of thread pool configuration
Now mysqld --help --verbose shows the value for thread-handling
Fixed also that mysqld --one-thread works as expected.
modified:
sql/mysqld.cc
--
lp:maria
https://code.launchpad.net/~maria-captains/maria/5.1
Your team Maria developers is subscribed to branch lp:maria.
To unsubscribe from this branch go to https://code.launchpad.net/~maria-captains/maria/5.1/+edit-subscription.
[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2816)
by Michael Widenius 11 Feb '10
#At lp:maria based on revid:monty@askmonty.org-20100210212606-xj84sp2fhbrf5epc
2816 Michael Widenius 2010-02-11
Fix for LPBug#520243: usability bug of thread pool configuration
Now mysqld --help --verbose shows the value for thread-handling
Fixed also that mysqld --one-thread works as expected.
modified:
sql/mysqld.cc
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc 2010-02-10 19:06:24 +0000
+++ b/sql/mysqld.cc 2010-02-11 19:15:24 +0000
@@ -598,6 +598,7 @@ char *mysqld_unix_port, *opt_mysql_tmpdi
const char **errmesg; /**< Error messages */
const char *myisam_recover_options_str="OFF";
const char *myisam_stats_method_str="nulls_unequal";
+const char *opt_thread_handling= thread_handling_typelib.type_names[0];
/** name of reference on left espression in rewritten IN subquery */
const char *in_left_expr_name= "<left expr>";
@@ -7290,7 +7291,8 @@ The minimum value for this variable is 4
1024, 0},
{"thread_handling", OPT_THREAD_HANDLING,
"Define threads usage for handling queries: "
- "one-thread-per-connection or no-threads", 0, 0,
+ "one-thread-per-connection or no-threads",
+ (uchar**) &opt_thread_handling, (uchar**) &opt_thread_handling,
0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
{"updatable_views_with_limit", OPT_UPDATABLE_VIEWS_WITH_LIMIT,
"1 = YES = Don't issue an error message (warning only) if a VIEW without presence of a key of the underlying table is used in queries with a LIMIT clause for updating. 0 = NO = Prohibit update of a VIEW, which does not contain a key of the underlying table and the query uses a LIMIT clause (usually get from GUI tools).",
@@ -8721,14 +8723,15 @@ mysqld_get_one_option(int optid,
break;
}
case OPT_ONE_THREAD:
- global_system_variables.thread_handling=
- SCHEDULER_ONE_THREAD_PER_CONNECTION;
+ global_system_variables.thread_handling= SCHEDULER_NO_THREADS;
+ opt_thread_handling= thread_handling_typelib.type_names[global_system_variables.thread_handling];
break;
case OPT_THREAD_HANDLING:
{
int id;
if (!find_opt_type(argument, &thread_handling_typelib, opt->name, &id))
global_system_variables.thread_handling= id - 1;
+ opt_thread_handling= thread_handling_typelib.type_names[global_system_variables.thread_handling];
break;
}
case OPT_FT_BOOLEAN_SYNTAX:
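As a quick runtime check of the new behaviour (a sketch; thread_handling is a
read-only server variable, so this only shows what the server was started with):

SHOW VARIABLES LIKE 'thread_handling';
# Variable_name    Value
# thread_handling  one-thread-per-connection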
[Maria-developers] Updated (by Monty): Add support for dynamic columns (via google protocol buffers) (34)
by worklog-noreply(a)askmonty.org 11 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Add support for dynamic columns (via google protocol buffers)
CREATION DATE..: Tue, 21 Jul 2009, 21:11
SUPERVISOR.....: Monty
IMPLEMENTOR....: Knielsen
COPIES TO......:
CATEGORY.......: Server-Sprint
TASK ID........: 34 (http://askmonty.org/worklog/?tid=34)
VERSION........: Server-5.3
STATUS.........: Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 0 (hours remain)
ORIG. ESTIMATE.: 0
PROGRESS NOTES:
-=-=(Monty - Thu, 11 Feb 2010, 20:04)=-=-
Version updated.
--- /tmp/wklog.34.old.18409 2010-02-11 18:04:35.000000000 +0000
+++ /tmp/wklog.34.new.18409 2010-02-11 18:04:35.000000000 +0000
@@ -1 +1 @@
-WorkLog-3.4
+Server-5.3
-=-=(Monty - Thu, 11 Feb 2010, 20:04)=-=-
Status updated.
--- /tmp/wklog.34.old.18409 2010-02-11 18:04:35.000000000 +0000
+++ /tmp/wklog.34.new.18409 2010-02-11 18:04:35.000000000 +0000
@@ -1 +1 @@
-Un-Assigned
+Assigned
-=-=(Monty - Thu, 11 Feb 2010, 20:04)=-=-
Title modified.
--- /tmp/wklog.34.old.18409 2010-02-11 18:04:35.000000000 +0000
+++ /tmp/wklog.34.new.18409 2010-02-11 18:04:35.000000000 +0000
@@ -1 +1 @@
-Add support for google protocol buffers
+Add support for dynamic columns (via google protocol buffers)
-=-=(Monty - Thu, 11 Feb 2010, 20:03)=-=-
High Level Description modified.
--- /tmp/wklog.34.old.18329 2010-02-11 18:03:42.000000000 +0000
+++ /tmp/wklog.34.new.18329 2010-02-11 18:03:42.000000000 +0000
@@ -19,3 +19,14 @@
Any support for indexing GPB data is outside of scope of this WL entry.
+Example usage:
+
+SELECT proto_get(blob, 1, varchar) from table_with_proto;
+
+UPDATE table_with_proto SET blob=proto_add(blob, 2, "hello") where id=1;
+
+UPDATE table_with_proto SET blob=proto_del(blob,4) where id=5;
+
+Note that 'proto_add()' will replace any old value with the given proto_id.
+
+
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.17962 2010-02-11 19:59:45.000000000 +0200
+++ /tmp/wklog.34.new.17962 2010-02-11 19:59:45.000000000 +0200
@@ -1,13 +1,8 @@
-
-<contents>
1. GPB Encoding overview
2. GPB in an SQL database
-2.1 Informing server about GPB field names and types
-2.2 Addressing GPB fields
-2.2.1 Option1: SQL Function
-2.2.2 Option2: SQL columns
-</contents>
-
+3. Encoding to use for dynamic columns
+4. How to store and access data in a protocol buffer from SQL
+5. Extensions for the future
1. GPB Encoding overview
========================
@@ -37,42 +32,50 @@
traffic right away, and will open path to getting the best possible
performance.
-2.1 Informing server about GPB field names and types
-----------------------------------------------------
-User-friendly/meaningful access to GPB fields requires knowledge of GPB field
-names and types, which are not available from GPB message itself (see "GPB
-encoding overview" section).
-
-So the first issue to be addressed is to get the server to know the definition
-of stored messages. We intend to assume that all records have GPB messages
-that conform to a certain single definition, which gives one definition per
-GPB field.
+3. Encoding to use for dynamic columns
+======================================
-DecisionToMake: How to pass the server the GPB definition?
-First idea: add a CREATE TABLE parameter which will specify either the
-definition itself or path to .proto file with the definition.
+The data should be coded into the proto buffer in the following format:
+
+<field_number><value_type><value>[<field_number><value_type><value>...]
+
+Where <field_number> is a number between 0-65536 that identifies the field,
+<value_type> is an enum of type 'Item_result', and
+<value> is the value coded in proto format.
+
+In other words, we should have no nested or complex structure.
+
+4. How to store and access data in a protocol buffer from SQL
+============================================================
+
+User-friendly/meaningful access to GPB fields requires knowledge of
+GPB field names and types, which are not available from GPB message
+itself (see "GPB encoding overview" section).
+
+To make things easy for the user, we will at first stage provide SQL
+functions to manipulate a string that is actually in proto format.
-2.2 Addressing GPB fields
--------------------------
-We'll need to provide a way to access GPB fields. This can be complicated as
-structures that are encoded in GPB message can be nested and recursive.
-
-2.2.1 Option1: SQL Function
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Introduce an SQL function GPB_FIELD(path) which will return contents of the
-field.
-- Return type of the function will be determined from GPB message definition.
-- For path, we can use XPath selector (a subset of XPath) syntax.
-
-(TODO ^ the above needs to be specified in more detail. is the selector as
-simple as filesystem path or we allow quantifiers (with predicates?)?)
-
-2.2.2 Option2: SQL columns
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-Make GPB columns to be accessible as SQL columns.
-This approach has problems:
-- It might be hard to implement code-wise
- - (TODO will Virtual columns patch help??)
-- It is not clear how to access fields from nested structures. Should we allow
- quoted names like `foo/bar[2]/baz' ?
+The functions we should provde are:
+proto_get(gpb, field_number, type)
+
+This return the field tagged with 'field_number' from the 'gpb' buffer.
+
+Example: proto_get(blob, 1, varchar) -> Returns field number 1 as varchar
+
+proto_put(gpb, field_number, value)
+
+This returns a new gbp buffer with the new value appended.
+
+Example: proto_put(proto_put(blob, 1, 1), 2, "hello")
+
+5. Extension for future
+=======================
+
+In the future we may want to access data based on name and get MariaDB to
+automaticly know the correct type. To do this we need to be able to
+store a definition for the content of the proto buffer somewhere.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify the
+definition itself.
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High Level Description modified.
--- /tmp/wklog.34.old.17915 2010-02-11 17:59:17.000000000 +0000
+++ /tmp/wklog.34.new.17915 2010-02-11 17:59:17.000000000 +0000
@@ -1,5 +1,21 @@
-Add support for Google Protocol Buffers (further GPB). It should be possible
-to have columns that store GPB-encoded data, as well as use SQL constructs to
+Add support for dynamic columns:
+
+- A column that can hold information from many columns
+- One can instantly add or remove column data
+
+This is a useful feature for any store type of application, where you want to
+store different type of information for different kind of items.
+
+For example, for shoes you want to store: material, size, colour, maker
+For a computer you want to store ram, hard disk size etc...
+
+In a normal 'relational' system you would need to a table for each type.
+With dynamic columns you have all common items as fixed fields (like
+product_code, manufacturer, price) and the rest stored in a dynamic column.
+
+The proposed idea is to store the dynamic information in a blob in
+Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of GPB data for use in select list, for filtering, and so forth.
+
Any support for indexing GPB data is outside of scope of this WL entry.
-=-=(Knielsen - Fri, 22 Jan 2010, 11:38)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.29965 2010-01-22 11:38:57.000000000 +0200
+++ /tmp/wklog.34.new.29965 2010-01-22 11:38:57.000000000 +0200
@@ -2,3 +2,12 @@
and a parser for text form of .proto file which then exposes the parsed
file via standard GPB message navigation API.
+* We should have both server-side support and client-side support (client side
+ means functions in libmysqlclient so that user can select the full BLOB and
+ extract fields in the application).
+
+* Add some kind of header to the GPB blob to support versioning and future
+ extensibility.
+
+* Add complete syntax description (update, add, drop, exists, ...).
+
-=-=(Psergey - Tue, 21 Jul 2009, 21:13)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.6462 2009-07-21 21:13:13.000000000 +0300
+++ /tmp/wklog.34.new.6462 2009-07-21 21:13:13.000000000 +0300
@@ -1 +1,4 @@
+* GPB tarball contains a protocol definition for .proto file structure itself
+ and a parser for text form of .proto file which then exposes the parsed
+ file via standard GPB message navigation API.
-=-=(Psergey - Tue, 21 Jul 2009, 21:12)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.6399 2009-07-21 21:12:23.000000000 +0300
+++ /tmp/wklog.34.new.6399 2009-07-21 21:12:23.000000000 +0300
@@ -1 +1,78 @@
+<contents>
+1. GPB Encoding overview
+2. GPB in an SQL database
+2.1 Informing server about GPB field names and types
+2.2 Addressing GPB fields
+2.2.1 Option1: SQL Function
+2.2.2 Option2: SQL columns
+</contents>
+
+
+1. GPB Encoding overview
+========================
+
+GBB is a compact encoding for structured and typed data. A unit of GPB data
+(it is called message) is only partially self-describing: it's possible to
+iterate over its parts, but, quoting the spec
+
+http://code.google.com/apis/protocolbuffers/docs/encoding.html:
+ " the name and declared type for each field can only be determined on the
+ decoding end by referencing the message type's definition (i.e. the .proto
+ file). "
+
+2. GPB in an SQL database
+=========================
+
+It is possible to store GPB data in MariaDB today - one can declare a binary
+blob column and use it to store GPB messages. Storing and retrieving entire
+messages will be the only available operations, though, as the server has no
+idea about the GPB format.
+It is apparent that ability to peek inside GPB data from SQL layer would be of
+great advantage: one would be able to
+- select only certain fields or parts of GPB messages
+- filter records based on the values of GPB fields
+- etc
+performing such operations at SQL layer will allow to reduce client<->server
+traffic right away, and will open path to getting the best possible
+performance.
+
+2.1 Informing server about GPB field names and types
+----------------------------------------------------
+User-friendly/meaningful access to GPB fields requires knowledge of GPB field
+names and types, which are not available from GPB message itself (see "GPB
+encoding overview" section).
+
+So the first issue to be addressed is to get the server to know the definition
+of stored messages. We intend to assume that all records have GPB messages
+that conform to a certain single definition, which gives one definition per
+GPB field.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify either the
+definition itself or path to .proto file with the definition.
+
+2.2 Addressing GPB fields
+-------------------------
+We'll need to provide a way to access GPB fields. This can be complicated as
+structures that are encoded in GPB message can be nested and recursive.
+
+2.2.1 Option1: SQL Function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Introduce an SQL function GPB_FIELD(path) which will return contents of the
+field.
+- Return type of the function will be determined from GPB message definition.
+- For path, we can use XPath selector (a subset of XPath) syntax.
+
+(TODO ^ the above needs to be specified in more detail. is the selector as
+simple as filesystem path or we allow quantifiers (with predicates?)?)
+
+2.2.2 Option2: SQL columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Make GPB columns to be accessible as SQL columns.
+This approach has problems:
+- It might be hard to implement code-wise
+ - (TODO will Virtual columns patch help??)
+- It is not clear how to access fields from nested structures. Should we allow
+ quoted names like `foo/bar[2]/baz' ?
+
DESCRIPTION:
Add support for dynamic columns:
- A column that can hold information from many columns
- One can instantly add or remove column data
This is a useful feature for any store-type application where you want to
store different types of information for different kinds of items.
For example, for shoes you want to store material, size, colour and maker;
for a computer you want to store RAM, hard disk size, etc.
In a normal 'relational' system you would need a table for each type.
With dynamic columns you have all common items as fixed fields (like
product_code, manufacturer, price) and the rest stored in a dynamic column.
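
To make the model concrete, here is a small Python sketch (illustrative
only; all names are made up): the common attributes are ordinary fixed
fields, and everything item-specific goes into one open-ended mapping that
would be serialized into the dynamic-column blob.

# Data-model illustration: fixed fields plus one open-ended mapping that
# stands in for the dynamic column. All names are made up.
from dataclasses import dataclass, field

@dataclass
class Product:
    product_code: str
    manufacturer: str
    price: float
    dynamic: dict = field(default_factory=dict)   # the "dynamic column"

shoe = Product("S-1", "Acme", 49.90,
               {"material": "leather", "size": 42, "colour": "brown"})
computer = Product("C-7", "Initech", 899.00,
                   {"ram_gb": 16, "hard_disk_gb": 512})
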
The proposed idea is to store the dynamic information in a blob in
Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of the GPB data for use in the select list, for filtering,
and so forth.
Any support for indexing GPB data is outside the scope of this WL entry.
Example usage:

SELECT proto_get(blob, 1, varchar) FROM table_with_proto;

UPDATE table_with_proto SET blob=proto_add(blob, 2, "hello") WHERE id=1;

UPDATE table_with_proto SET blob=proto_del(blob, 4) WHERE id=5;

Note that 'proto_add()' will replace any old value with the given proto_id.
HIGH-LEVEL SPECIFICATION:
1. GPB Encoding overview
2. GPB in an SQL database
3. Encoding to use for dynamic columns
4. How to store and access data in a protocol buffer from SQL
5. Extensions for the future
1. GPB Encoding overview
========================
GPB is a compact encoding for structured and typed data. A unit of GPB data
(called a message) is only partially self-describing: it is possible to
iterate over its parts, but, quoting the spec
(http://code.google.com/apis/protocolbuffers/docs/encoding.html):
" the name and declared type for each field can only be determined on the
decoding end by referencing the message type's definition (i.e. the .proto
file). "
2. GPB in an SQL database
=========================
It is possible to store GPB data in MariaDB today - one can declare a binary
blob column and use it to store GPB messages. Storing and retrieving entire
messages will be the only available operations, though, as the server has no
idea about the GPB format.
It is apparent that the ability to peek inside GPB data from the SQL layer
would be of great advantage: one would be able to
- select only certain fields or parts of GPB messages
- filter records based on the values of GPB fields
- etc.
Performing such operations at the SQL layer will reduce client<->server
traffic right away, and will open the path to the best possible
performance.
3. Encoding to use for dynamic columns
======================================
The data should be coded into the proto buffer in the following format:

<field_number><value_type><value>[<field_number><value_type><value>...]

where <field_number> is a number between 0 and 65536 that identifies the
field, <value_type> is an enum of type 'Item_result', and <value> is the
value coded in proto format.
In other words, there should be no nested or complex structure.
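
A minimal Python sketch of this flat layout follows. The concrete byte
widths, the length prefix and the numeric type codes are assumptions made
for illustration; the real format and the actual 'Item_result' values may
differ.

# Pack/unpack the flat <field_number><value_type><value> layout.
# Byte widths and type codes are assumptions, for illustration only.
import struct

INT_RESULT = 1      # hypothetical stand-ins for the 'Item_result' codes
STRING_RESULT = 2

def pack_triplets(triplets):
    """triplets: iterable of (field_number, value_type, value_bytes)."""
    out = bytearray()
    for field_number, value_type, value in triplets:
        # 2-byte field number, 1-byte type code, 4-byte length, raw value
        out += struct.pack('<HBI', field_number, value_type, len(value))
        out += value
    return bytes(out)

def unpack_triplets(blob):
    pos, result = 0, []
    while pos < len(blob):
        field_number, value_type, length = struct.unpack_from('<HBI', blob, pos)
        pos += 7
        result.append((field_number, value_type, blob[pos:pos + length]))
        pos += length
    return result

blob = pack_triplets([(1, INT_RESULT, b'1'), (2, STRING_RESULT, b'hello')])
print(unpack_triplets(blob))    # -> [(1, 1, b'1'), (2, 2, b'hello')]
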
4. How to store and access data in a protocol buffer from SQL
============================================================
User-friendly/meaningful access to GPB fields requires knowledge of
GPB field names and types, which are not available from the GPB message
itself (see the "GPB encoding overview" section).

To make things easy for the user, we will at the first stage provide SQL
functions to manipulate a string that is actually in proto format.
The functions we should provide are:

proto_get(gpb, field_number, type)

This returns the field tagged with 'field_number' from the 'gpb' buffer.

Example: proto_get(blob, 1, varchar) -> returns field number 1 as varchar

proto_put(gpb, field_number, value)

This returns a new gpb buffer with the new value appended.

Example: proto_put(proto_put(blob, 1, 1), 2, "hello")
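
The following Python sketch models the intended semantics only (the buffer
is a plain list of (field_number, value) pairs rather than real bytes).
Note that, per the note in the description, storing a value under an
existing field_number replaces the old value rather than duplicating it.

# Toy model of proto_get()/proto_put() semantics; a put with an existing
# field_number replaces the old value instead of duplicating it.
def proto_get(gpb, field_number):
    for number, value in gpb:
        if number == field_number:
            return value
    return None                                  # field absent

def proto_put(gpb, field_number, value):
    kept = [(n, v) for n, v in gpb if n != field_number]
    return kept + [(field_number, value)]

blob = proto_put(proto_put([], 1, 1), 2, "hello")
print(proto_get(blob, 2))                        # -> hello
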
5. Extensions for the future
============================
In the future we may want to access data based on name and have MariaDB
automatically know the correct type. To do this we need to be able to
store a definition for the content of the proto buffer somewhere.

DecisionToMake: How to pass the server the GPB definition?
First idea: add a CREATE TABLE parameter which will specify the
definition itself.
LOW-LEVEL DESIGN:
* The GPB tarball contains a protocol definition for the .proto file
structure itself and a parser for the text form of .proto files, which then
exposes the parsed file via the standard GPB message navigation API.
* We should have both server-side support and client-side support (client-side
means functions in libmysqlclient so that the user can select the full BLOB
and extract fields in the application; see the sketch after this list).
* Add some kind of header to the GPB blob to support versioning and future
extensibility.
* Add complete syntax description (update, add, drop, exists, ...).
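
As a rough sketch of the client-side half of the first bullet: select the
full BLOB and decode it in the application. MySQL Connector/Python is
assumed, the table and column names are invented, and the decoding loop
reuses the flat triplet layout assumed in section 3 of the high-level
specification.

# Client-side sketch: fetch the whole BLOB, decode fields in the client.
import struct
import mysql.connector   # MySQL Connector/Python, assumed available

conn = mysql.connector.connect(user='app', password='secret', database='shop')
cur = conn.cursor()
cur.execute("SELECT dynamic_cols FROM products WHERE id = 1")
(blob,) = cur.fetchone()
conn.close()

pos = 0
while pos < len(blob):   # same assumed <HBI> triplet layout as above
    field_number, value_type, length = struct.unpack_from('<HBI', blob, pos)
    pos += 7
    print(field_number, value_type, blob[pos:pos + length])
    pos += length
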
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Monty): Add support for google protocol buffers (34)
by worklog-noreply@askmonty.org 11 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Add support for google protocol buffers
CREATION DATE..: Tue, 21 Jul 2009, 21:11
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-Sprint
TASK ID........: 34 (http://askmonty.org/worklog/?tid=34)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 0 (hours remain)
ORIG. ESTIMATE.: 0
PROGRESS NOTES:
-=-=(Monty - Thu, 11 Feb 2010, 20:03)=-=-
High Level Description modified.
--- /tmp/wklog.34.old.18329 2010-02-11 18:03:42.000000000 +0000
+++ /tmp/wklog.34.new.18329 2010-02-11 18:03:42.000000000 +0000
@@ -19,3 +19,14 @@
Any support for indexing GPB data is outside of scope of this WL entry.
+Example usage:
+
+SELECT proto_get(blob, 1, varchar) from table_with_proto;
+
+UPDATE table_with_proto SET blob=proto_add(blob, 2, "hello") where id=1;
+
+UPDATE table_with_proto SET blob=proto_del(blob,4) where id=5;
+
+Note that 'proto_add()' will replace any old value with the given proto_id.
+
+
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.17962 2010-02-11 19:59:45.000000000 +0200
+++ /tmp/wklog.34.new.17962 2010-02-11 19:59:45.000000000 +0200
@@ -1,13 +1,8 @@
-
-<contents>
1. GPB Encoding overview
2. GPB in an SQL database
-2.1 Informing server about GPB field names and types
-2.2 Addressing GPB fields
-2.2.1 Option1: SQL Function
-2.2.2 Option2: SQL columns
-</contents>
-
+3. Encoding to use for dynamic columns
+4. How to store and access data in a protocol buffer from SQL
+5. Extensions for the future
1. GPB Encoding overview
========================
@@ -37,42 +32,50 @@
traffic right away, and will open path to getting the best possible
performance.
-2.1 Informing server about GPB field names and types
-----------------------------------------------------
-User-friendly/meaningful access to GPB fields requires knowledge of GPB field
-names and types, which are not available from GPB message itself (see "GPB
-encoding overview" section).
-
-So the first issue to be addressed is to get the server to know the definition
-of stored messages. We intend to assume that all records have GPB messages
-that conform to a certain single definition, which gives one definition per
-GPB field.
+3. Encoding to use for dynamic columns
+======================================
-DecisionToMake: How to pass the server the GPB definition?
-First idea: add a CREATE TABLE parameter which will specify either the
-definition itself or path to .proto file with the definition.
+The data should be coded into the proto buffer in the following format:
+
+<field_number><value_type><value>[<field_number><value_type><value>...]
+
+Where field_number is a number between 0-65536 that identifes the field
+<value_type> is a enum of type 'Item_result'
+<value> is the value coded in proto format.
+
+In other words, we should have no nested or complex structure.
+
+4. How to store and access data in a protocol buffer from SQL
+============================================================
+
+User-friendly/meaningful access to GPB fields requires knowledge of
+GPB field names and types, which are not available from GPB message
+itself (see "GPB encoding overview" section).
+
+To make things easy for the user, we will at first stage provide SQL
+functions to manipulate a string that is actually in proto format.
-2.2 Addressing GPB fields
--------------------------
-We'll need to provide a way to access GPB fields. This can be complicated as
-structures that are encoded in GPB message can be nested and recursive.
-
-2.2.1 Option1: SQL Function
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Introduce an SQL function GPB_FIELD(path) which will return contents of the
-field.
-- Return type of the function will be determined from GPB message definition.
-- For path, we can use XPath selector (a subset of XPath) syntax.
-
-(TODO ^ the above needs to be specified in more detail. is the selector as
-simple as filesystem path or we allow quantifiers (with predicates?)?)
-
-2.2.2 Option2: SQL columns
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-Make GPB columns to be accessible as SQL columns.
-This approach has problems:
-- It might be hard to implement code-wise
- - (TODO will Virtual columns patch help??)
-- It is not clear how to access fields from nested structures. Should we allow
- quoted names like `foo/bar[2]/baz' ?
+The functions we should provde are:
+proto_get(gpb, field_number, type)
+
+This return the field tagged with 'field_number' from the 'gpb' buffer.
+
+Example: proto_get(blob, 1, varchar) -> Returns field number 1 as varchar
+
+proto_put(gpb, field_number, value)
+
+This returns a new gbp buffer with the new value appended.
+
+Example: proto_put(proto_put(blob, 1, 1), 2, "hello")
+
+5. Extension for future
+=======================
+
+In the future we may want to access data based on name and get MariaDB to
+automaticly know the correct type. To do this we need to be able to
+store a definition for the content of the proto buffer somewhere.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify the
+definition itself.
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High Level Description modified.
--- /tmp/wklog.34.old.17915 2010-02-11 17:59:17.000000000 +0000
+++ /tmp/wklog.34.new.17915 2010-02-11 17:59:17.000000000 +0000
@@ -1,5 +1,21 @@
-Add support for Google Protocol Buffers (further GPB). It should be possible
-to have columns that store GPB-encoded data, as well as use SQL constructs to
+Add support for dynamic columns:
+
+- A column that can hold information from many columns
+- One can instantly add or remove column data
+
+This is a useful feature for any store type of application, where you want to
+store different type of information for different kind of items.
+
+For example, for shoes you want to store: material, size, colour, maker
+For a computer you want to store ram, hard disk size etc...
+
+In a normal 'relational' system you would need to a table for each type.
+With dynamic columns you have all common items as fixed fields (like
+product_code, manufacturer, price) and the rest stored in a dynamic column.
+
+The proposed idea is to store the dynamic information in a blob in
+Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of GPB data for use in select list, for filtering, and so forth.
+
Any support for indexing GPB data is outside of scope of this WL entry.
-=-=(Knielsen - Fri, 22 Jan 2010, 11:38)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.29965 2010-01-22 11:38:57.000000000 +0200
+++ /tmp/wklog.34.new.29965 2010-01-22 11:38:57.000000000 +0200
@@ -2,3 +2,12 @@
and a parser for text form of .proto file which then exposes the parsed
file via standard GPB message navigation API.
+* We should have both server-side support and client-side support (client side
+ means functions in libmysqlclient so that user can select the full BLOB and
+ extract fields in the application).
+
+* Add some kind of header to the GPB blob to support versioning and future
+ extensibility.
+
+* Add complete syntax description (update, add, drop, exists, ...).
+
-=-=(Psergey - Tue, 21 Jul 2009, 21:13)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.6462 2009-07-21 21:13:13.000000000 +0300
+++ /tmp/wklog.34.new.6462 2009-07-21 21:13:13.000000000 +0300
@@ -1 +1,4 @@
+* GPB tarball contains a protocol definition for .proto file structure itself
+ and a parser for text form of .proto file which then exposes the parsed
+ file via standard GPB message navigation API.
-=-=(Psergey - Tue, 21 Jul 2009, 21:12)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.6399 2009-07-21 21:12:23.000000000 +0300
+++ /tmp/wklog.34.new.6399 2009-07-21 21:12:23.000000000 +0300
@@ -1 +1,78 @@
+<contents>
+1. GPB Encoding overview
+2. GPB in an SQL database
+2.1 Informing server about GPB field names and types
+2.2 Addressing GPB fields
+2.2.1 Option1: SQL Function
+2.2.2 Option2: SQL columns
+</contents>
+
+
+1. GPB Encoding overview
+========================
+
+GBB is a compact encoding for structured and typed data. A unit of GPB data
+(it is called message) is only partially self-describing: it's possible to
+iterate over its parts, but, quoting the spec
+
+http://code.google.com/apis/protocolbuffers/docs/encoding.html:
+ " the name and declared type for each field can only be determined on the
+ decoding end by referencing the message type's definition (i.e. the .proto
+ file). "
+
+2. GPB in an SQL database
+=========================
+
+It is possible to store GPB data in MariaDB today - one can declare a binary
+blob column and use it to store GPB messages. Storing and retrieving entire
+messages will be the only available operations, though, as the server has no
+idea about the GPB format.
+It is apparent that ability to peek inside GPB data from SQL layer would be of
+great advantage: one would be able to
+- select only certain fields or parts of GPB messages
+- filter records based on the values of GPB fields
+- etc
+performing such operations at SQL layer will allow to reduce client<->server
+traffic right away, and will open path to getting the best possible
+performance.
+
+2.1 Informing server about GPB field names and types
+----------------------------------------------------
+User-friendly/meaningful access to GPB fields requires knowledge of GPB field
+names and types, which are not available from GPB message itself (see "GPB
+encoding overview" section).
+
+So the first issue to be addressed is to get the server to know the definition
+of stored messages. We intend to assume that all records have GPB messages
+that conform to a certain single definition, which gives one definition per
+GPB field.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify either the
+definition itself or path to .proto file with the definition.
+
+2.2 Addressing GPB fields
+-------------------------
+We'll need to provide a way to access GPB fields. This can be complicated as
+structures that are encoded in GPB message can be nested and recursive.
+
+2.2.1 Option1: SQL Function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Introduce an SQL function GPB_FIELD(path) which will return contents of the
+field.
+- Return type of the function will be determined from GPB message definition.
+- For path, we can use XPath selector (a subset of XPath) syntax.
+
+(TODO ^ the above needs to be specified in more detail. is the selector as
+simple as filesystem path or we allow quantifiers (with predicates?)?)
+
+2.2.2 Option2: SQL columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Make GPB columns to be accessible as SQL columns.
+This approach has problems:
+- It might be hard to implement code-wise
+ - (TODO will Virtual columns patch help??)
+- It is not clear how to access fields from nested structures. Should we allow
+ quoted names like `foo/bar[2]/baz' ?
+
DESCRIPTION:

Add support for dynamic columns:
- A column that can hold information from many columns
- One can instantly add or remove column data

This is a useful feature for any store-type application where you want to
store different types of information for different kinds of items.

For example, for shoes you want to store: material, size, colour, maker.
For a computer you want to store: RAM, hard disk size, etc.

In a normal relational system you would need a table for each type.
With dynamic columns you keep all common items as fixed fields (like
product_code, manufacturer, price) and store the rest in a dynamic column.

The proposed idea is to store the dynamic information in a blob in
Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of the GPB data for use in the select list, for filtering,
and so forth.

Any support for indexing GPB data is outside the scope of this WL entry.

Example usage (see also the sketch below):

SELECT proto_get(blob, 1, varchar) FROM table_with_proto;
UPDATE table_with_proto SET blob=proto_add(blob, 2, "hello") WHERE id=1;
UPDATE table_with_proto SET blob=proto_del(blob, 4) WHERE id=5;

Note that 'proto_add()' will replace any old value with the given proto_id.
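
To make the intended workflow concrete, here is a sketch of how the pieces
could fit together (hedged: the proto_* functions are only proposed at this
point, and the table, column names and field numbers below are invented for
illustration):

CREATE TABLE products (
  id           INT NOT NULL PRIMARY KEY,
  product_code VARCHAR(32),
  price        DECIMAL(10,2),
  dyncols      BLOB              -- GPB-encoded dynamic columns
);
-- Store colour as dynamic field 1 and size as dynamic field 2:
UPDATE products
  SET dyncols = proto_add(proto_add(dyncols, 1, "black"), 2, "43")
  WHERE id = 1;
-- Filter on a dynamic column:
SELECT product_code FROM products
WHERE proto_get(dyncols, 1, varchar) = "black";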
HIGH-LEVEL SPECIFICATION:

1. GPB Encoding overview
2. GPB in an SQL database
3. Encoding to use for dynamic columns
4. How to store and access data in a protocol buffer from SQL
5. Extensions for the future

1. GPB Encoding overview
========================

GPB is a compact encoding for structured and typed data. A unit of GPB data
(called a message) is only partially self-describing: it is possible to
iterate over its parts, but, quoting the spec
(http://code.google.com/apis/protocolbuffers/docs/encoding.html):

  "the name and declared type for each field can only be determined on the
  decoding end by referencing the message type's definition (i.e. the .proto
  file)."

2. GPB in an SQL database
=========================

It is possible to store GPB data in MariaDB today: one can declare a binary
blob column and use it to store GPB messages. Storing and retrieving entire
messages are the only available operations, though, as the server has no
idea about the GPB format.

The ability to peek inside GPB data from the SQL layer would be of great
advantage: one would be able to
- select only certain fields or parts of GPB messages
- filter records based on the values of GPB fields
- etc.
Performing such operations at the SQL layer reduces client<->server traffic
right away, and opens a path to the best possible performance.
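
As a baseline, here is what is already possible today with a plain blob
column (standard SQL, nothing GPB-specific; the table is invented for
illustration) - the server can only move whole messages around:

CREATE TABLE t1 (
  id  INT NOT NULL PRIMARY KEY,
  msg BLOB          -- an entire GPB message, opaque to the server
);
-- Store and retrieve whole messages only; any decoding happens client-side:
INSERT INTO t1 VALUES (1, @gpb_message);
SELECT msg FROM t1 WHERE id = 1;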
3. Encoding to use for dynamic columns
======================================

The data should be coded into the proto buffer in the following format:

<field_number><value_type><value>[<field_number><value_type><value>...]

where
- <field_number> is a number between 0 and 65536 that identifies the field,
- <value_type> is an enum of type 'Item_result',
- <value> is the value coded in proto format.

In other words, there is no nested or complex structure.
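
For example, a buffer holding field 1 = 42 (an integer) and field 2 =
"hello" (a string) would schematically be laid out as:

<1><INT_RESULT><42><2><STRING_RESULT><"hello">

(INT_RESULT and STRING_RESULT stand for the numeric values of the server's
Item_result enum members; each element is then coded using the proto
encoding rules, so the integer 42 would be stored as a varint. The field
numbers are of course just an example.)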
4. How to store and access data in a protocol buffer from SQL
==============================================================

User-friendly/meaningful access to GPB fields requires knowledge of
GPB field names and types, which are not available from the GPB message
itself (see the "GPB Encoding overview" section).

To make things easy for the user, we will at the first stage provide SQL
functions to manipulate a string that is actually in proto format.

The functions we should provide are:

proto_get(gpb, field_number, type)

  This returns the field tagged with 'field_number' from the 'gpb' buffer.
  Example: proto_get(blob, 1, varchar) -> returns field number 1 as varchar

proto_put(gpb, field_number, value)

  This returns a new GPB buffer with the new value appended.
  Example: proto_put(proto_put(blob, 1, 1), 2, "hello")
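
Putting the two functions together, a hedged end-to-end example (the
functions are proposed, not implemented; the ids are invented):

UPDATE table_with_proto
  SET blob = proto_put(proto_put(blob, 1, 1), 2, "hello")
  WHERE id = 1;
-- Read field 2 back as a varchar:
SELECT proto_get(blob, 2, varchar) FROM table_with_proto WHERE id = 1;
-- -> "hello"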
5. Extensions for the future
============================

In the future we may want to access data by name and have MariaDB
automatically know the correct type. To do this we need to be able to
store a definition for the content of the proto buffer somewhere.

DecisionToMake: how to pass the server the GPB definition?
First idea: add a CREATE TABLE parameter which will specify the
definition itself.
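
As a rough illustration only (no such syntax exists; the option name
PROTO_DEFINITION is invented here), such a CREATE TABLE parameter could
look something like:

CREATE TABLE products (
  id      INT NOT NULL PRIMARY KEY,
  dyncols BLOB
) PROTO_DEFINITION='message DynCols {
                      optional string colour = 1;
                      optional int32  size   = 2;
                    }';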
LOW-LEVEL DESIGN:

* The GPB tarball contains a protocol definition for the .proto file
  structure itself, and a parser for the text form of a .proto file, which
  then exposes the parsed file via the standard GPB message navigation API.

* We should have both server-side and client-side support (client side
  means functions in libmysqlclient, so that the user can select the full
  BLOB and extract fields in the application).

* Add some kind of header to the GPB blob to support versioning and future
  extensibility.

* Add a complete syntax description (update, add, drop, exists, ...).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Monty): Add support for google protocol buffers (34)
by worklog-noreply@askmonty.org 11 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Add support for google protocol buffers
CREATION DATE..: Tue, 21 Jul 2009, 21:11
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-Sprint
TASK ID........: 34 (http://askmonty.org/worklog/?tid=34)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 0 (hours remain)
ORIG. ESTIMATE.: 0
PROGRESS NOTES:
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.17962 2010-02-11 19:59:45.000000000 +0200
+++ /tmp/wklog.34.new.17962 2010-02-11 19:59:45.000000000 +0200
@@ -1,13 +1,8 @@
-
-<contents>
1. GPB Encoding overview
2. GPB in an SQL database
-2.1 Informing server about GPB field names and types
-2.2 Addressing GPB fields
-2.2.1 Option1: SQL Function
-2.2.2 Option2: SQL columns
-</contents>
-
+3. Encoding to use for dynamic columns
+4. How to store and access data in a protocol buffer from SQL
+5. Extensions for the future
1. GPB Encoding overview
========================
@@ -37,42 +32,50 @@
traffic right away, and will open path to getting the best possible
performance.
-2.1 Informing server about GPB field names and types
-----------------------------------------------------
-User-friendly/meaningful access to GPB fields requires knowledge of GPB field
-names and types, which are not available from GPB message itself (see "GPB
-encoding overview" section).
-
-So the first issue to be addressed is to get the server to know the definition
-of stored messages. We intend to assume that all records have GPB messages
-that conform to a certain single definition, which gives one definition per
-GPB field.
+3. Encoding to use for dynamic columns
+======================================
-DecisionToMake: How to pass the server the GPB definition?
-First idea: add a CREATE TABLE parameter which will specify either the
-definition itself or path to .proto file with the definition.
+The data should be coded into the proto buffer in the following format:
+
+<field_number><value_type><value>[<field_number><value_type><value>...]
+
+Where field_number is a number between 0-65536 that identifes the field
+<value_type> is a enum of type 'Item_result'
+<value> is the value coded in proto format.
+
+In other words, we should have no nested or complex structure.
+
+4. How to store and access data in a protocol buffer from SQL
+============================================================
+
+User-friendly/meaningful access to GPB fields requires knowledge of
+GPB field names and types, which are not available from GPB message
+itself (see "GPB encoding overview" section).
+
+To make things easy for the user, we will at first stage provide SQL
+functions to manipulate a string that is actually in proto format.
-2.2 Addressing GPB fields
--------------------------
-We'll need to provide a way to access GPB fields. This can be complicated as
-structures that are encoded in GPB message can be nested and recursive.
-
-2.2.1 Option1: SQL Function
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Introduce an SQL function GPB_FIELD(path) which will return contents of the
-field.
-- Return type of the function will be determined from GPB message definition.
-- For path, we can use XPath selector (a subset of XPath) syntax.
-
-(TODO ^ the above needs to be specified in more detail. is the selector as
-simple as filesystem path or we allow quantifiers (with predicates?)?)
-
-2.2.2 Option2: SQL columns
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-Make GPB columns to be accessible as SQL columns.
-This approach has problems:
-- It might be hard to implement code-wise
- - (TODO will Virtual columns patch help??)
-- It is not clear how to access fields from nested structures. Should we allow
- quoted names like `foo/bar[2]/baz' ?
+The functions we should provde are:
+proto_get(gpb, field_number, type)
+
+This return the field tagged with 'field_number' from the 'gpb' buffer.
+
+Example: proto_get(blob, 1, varchar) -> Returns field number 1 as varchar
+
+proto_put(gpb, field_number, value)
+
+This returns a new gbp buffer with the new value appended.
+
+Example: proto_put(proto_put(blob, 1, 1), 2, "hello")
+
+5. Extension for future
+=======================
+
+In the future we may want to access data based on name and get MariaDB to
+automaticly know the correct type. To do this we need to be able to
+store a definition for the content of the proto buffer somewhere.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify the
+definition itself.
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High Level Description modified.
--- /tmp/wklog.34.old.17915 2010-02-11 17:59:17.000000000 +0000
+++ /tmp/wklog.34.new.17915 2010-02-11 17:59:17.000000000 +0000
@@ -1,5 +1,21 @@
-Add support for Google Protocol Buffers (further GPB). It should be possible
-to have columns that store GPB-encoded data, as well as use SQL constructs to
+Add support for dynamic columns:
+
+- A column that can hold information from many columns
+- One can instantly add or remove column data
+
+This is a useful feature for any store type of application, where you want to
+store different type of information for different kind of items.
+
+For example, for shoes you want to store: material, size, colour, maker
+For a computer you want to store ram, hard disk size etc...
+
+In a normal 'relational' system you would need to a table for each type.
+With dynamic columns you have all common items as fixed fields (like
+product_code, manufacturer, price) and the rest stored in a dynamic column.
+
+The proposed idea is to store the dynamic information in a blob in
+Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of GPB data for use in select list, for filtering, and so forth.
+
Any support for indexing GPB data is outside of scope of this WL entry.
-=-=(Knielsen - Fri, 22 Jan 2010, 11:38)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.29965 2010-01-22 11:38:57.000000000 +0200
+++ /tmp/wklog.34.new.29965 2010-01-22 11:38:57.000000000 +0200
@@ -2,3 +2,12 @@
and a parser for text form of .proto file which then exposes the parsed
file via standard GPB message navigation API.
+* We should have both server-side support and client-side support (client side
+ means functions in libmysqlclient so that user can select the full BLOB and
+ extract fields in the application).
+
+* Add some kind of header to the GPB blob to support versioning and future
+ extensibility.
+
+* Add complete syntax description (update, add, drop, exists, ...).
+
-=-=(Psergey - Tue, 21 Jul 2009, 21:13)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.6462 2009-07-21 21:13:13.000000000 +0300
+++ /tmp/wklog.34.new.6462 2009-07-21 21:13:13.000000000 +0300
@@ -1 +1,4 @@
+* GPB tarball contains a protocol definition for .proto file structure itself
+ and a parser for text form of .proto file which then exposes the parsed
+ file via standard GPB message navigation API.
-=-=(Psergey - Tue, 21 Jul 2009, 21:12)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.6399 2009-07-21 21:12:23.000000000 +0300
+++ /tmp/wklog.34.new.6399 2009-07-21 21:12:23.000000000 +0300
@@ -1 +1,78 @@
+<contents>
+1. GPB Encoding overview
+2. GPB in an SQL database
+2.1 Informing server about GPB field names and types
+2.2 Addressing GPB fields
+2.2.1 Option1: SQL Function
+2.2.2 Option2: SQL columns
+</contents>
+
+
+1. GPB Encoding overview
+========================
+
+GBB is a compact encoding for structured and typed data. A unit of GPB data
+(it is called message) is only partially self-describing: it's possible to
+iterate over its parts, but, quoting the spec
+
+http://code.google.com/apis/protocolbuffers/docs/encoding.html:
+ " the name and declared type for each field can only be determined on the
+ decoding end by referencing the message type's definition (i.e. the .proto
+ file). "
+
+2. GPB in an SQL database
+=========================
+
+It is possible to store GPB data in MariaDB today - one can declare a binary
+blob column and use it to store GPB messages. Storing and retrieving entire
+messages will be the only available operations, though, as the server has no
+idea about the GPB format.
+It is apparent that ability to peek inside GPB data from SQL layer would be of
+great advantage: one would be able to
+- select only certain fields or parts of GPB messages
+- filter records based on the values of GPB fields
+- etc
+performing such operations at SQL layer will allow to reduce client<->server
+traffic right away, and will open path to getting the best possible
+performance.
+
+2.1 Informing server about GPB field names and types
+----------------------------------------------------
+User-friendly/meaningful access to GPB fields requires knowledge of GPB field
+names and types, which are not available from GPB message itself (see "GPB
+encoding overview" section).
+
+So the first issue to be addressed is to get the server to know the definition
+of stored messages. We intend to assume that all records have GPB messages
+that conform to a certain single definition, which gives one definition per
+GPB field.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify either the
+definition itself or path to .proto file with the definition.
+
+2.2 Addressing GPB fields
+-------------------------
+We'll need to provide a way to access GPB fields. This can be complicated as
+structures that are encoded in GPB message can be nested and recursive.
+
+2.2.1 Option1: SQL Function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Introduce an SQL function GPB_FIELD(path) which will return contents of the
+field.
+- Return type of the function will be determined from GPB message definition.
+- For path, we can use XPath selector (a subset of XPath) syntax.
+
+(TODO ^ the above needs to be specified in more detail. is the selector as
+simple as filesystem path or we allow quantifiers (with predicates?)?)
+
+2.2.2 Option2: SQL columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Make GPB columns to be accessible as SQL columns.
+This approach has problems:
+- It might be hard to implement code-wise
+ - (TODO will Virtual columns patch help??)
+- It is not clear how to access fields from nested structures. Should we allow
+ quoted names like `foo/bar[2]/baz' ?
+
DESCRIPTION:

Add support for dynamic columns:
- A column that can hold information from many columns
- One can instantly add or remove column data

This is a useful feature for any store-type application where you want to
store different types of information for different kinds of items.

For example, for shoes you want to store: material, size, colour, maker.
For a computer you want to store: RAM, hard disk size, etc.

In a normal relational system you would need a table for each type.
With dynamic columns you keep all common items as fixed fields (like
product_code, manufacturer, price) and store the rest in a dynamic column.

The proposed idea is to store the dynamic information in a blob in
Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of the GPB data for use in the select list, for filtering,
and so forth.

Any support for indexing GPB data is outside the scope of this WL entry.
HIGH-LEVEL SPECIFICATION:

1. GPB Encoding overview
2. GPB in an SQL database
3. Encoding to use for dynamic columns
4. How to store and access data in a protocol buffer from SQL
5. Extensions for the future

1. GPB Encoding overview
========================

GPB is a compact encoding for structured and typed data. A unit of GPB data
(called a message) is only partially self-describing: it is possible to
iterate over its parts, but, quoting the spec
(http://code.google.com/apis/protocolbuffers/docs/encoding.html):

  "the name and declared type for each field can only be determined on the
  decoding end by referencing the message type's definition (i.e. the .proto
  file)."

2. GPB in an SQL database
=========================

It is possible to store GPB data in MariaDB today: one can declare a binary
blob column and use it to store GPB messages. Storing and retrieving entire
messages are the only available operations, though, as the server has no
idea about the GPB format.

The ability to peek inside GPB data from the SQL layer would be of great
advantage: one would be able to
- select only certain fields or parts of GPB messages
- filter records based on the values of GPB fields
- etc.
Performing such operations at the SQL layer reduces client<->server traffic
right away, and opens a path to the best possible performance.

3. Encoding to use for dynamic columns
======================================

The data should be coded into the proto buffer in the following format:

<field_number><value_type><value>[<field_number><value_type><value>...]

where
- <field_number> is a number between 0 and 65536 that identifies the field,
- <value_type> is an enum of type 'Item_result',
- <value> is the value coded in proto format.

In other words, there is no nested or complex structure.

4. How to store and access data in a protocol buffer from SQL
==============================================================

User-friendly/meaningful access to GPB fields requires knowledge of
GPB field names and types, which are not available from the GPB message
itself (see the "GPB Encoding overview" section).

To make things easy for the user, we will at the first stage provide SQL
functions to manipulate a string that is actually in proto format.

The functions we should provide are:

proto_get(gpb, field_number, type)

  This returns the field tagged with 'field_number' from the 'gpb' buffer.
  Example: proto_get(blob, 1, varchar) -> returns field number 1 as varchar

proto_put(gpb, field_number, value)

  This returns a new GPB buffer with the new value appended.
  Example: proto_put(proto_put(blob, 1, 1), 2, "hello")

5. Extensions for the future
============================

In the future we may want to access data by name and have MariaDB
automatically know the correct type. To do this we need to be able to
store a definition for the content of the proto buffer somewhere.

DecisionToMake: how to pass the server the GPB definition?
First idea: add a CREATE TABLE parameter which will specify the
definition itself.
LOW-LEVEL DESIGN:

* The GPB tarball contains a protocol definition for the .proto file
  structure itself, and a parser for the text form of a .proto file, which
  then exposes the parsed file via the standard GPB message navigation API.

* We should have both server-side and client-side support (client side
  means functions in libmysqlclient, so that the user can select the full
  BLOB and extract fields in the application).

* Add some kind of header to the GPB blob to support versioning and future
  extensibility.

* Add a complete syntax description (update, add, drop, exists, ...).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Monty): Add support for google protocol buffers (34)
by worklog-noreply@askmonty.org 11 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Add support for google protocol buffers
CREATION DATE..: Tue, 21 Jul 2009, 21:11
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-Sprint
TASK ID........: 34 (http://askmonty.org/worklog/?tid=34)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 0 (hours remain)
ORIG. ESTIMATE.: 0
PROGRESS NOTES:
-=-=(Monty - Thu, 11 Feb 2010, 19:59)=-=-
High Level Description modified.
--- /tmp/wklog.34.old.17915 2010-02-11 17:59:17.000000000 +0000
+++ /tmp/wklog.34.new.17915 2010-02-11 17:59:17.000000000 +0000
@@ -1,5 +1,21 @@
-Add support for Google Protocol Buffers (further GPB). It should be possible
-to have columns that store GPB-encoded data, as well as use SQL constructs to
+Add support for dynamic columns:
+
+- A column that can hold information from many columns
+- One can instantly add or remove column data
+
+This is a useful feature for any store type of application, where you want to
+store different type of information for different kind of items.
+
+For example, for shoes you want to store: material, size, colour, maker
+For a computer you want to store ram, hard disk size etc...
+
+In a normal 'relational' system you would need to a table for each type.
+With dynamic columns you have all common items as fixed fields (like
+product_code, manufacturer, price) and the rest stored in a dynamic column.
+
+The proposed idea is to store the dynamic information in a blob in
+Google Protocol Buffers (further GPB) format and use SQL constructs to
extract parts of GPB data for use in select list, for filtering, and so forth.
+
Any support for indexing GPB data is outside of scope of this WL entry.
-=-=(Knielsen - Fri, 22 Jan 2010, 11:38)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.29965 2010-01-22 11:38:57.000000000 +0200
+++ /tmp/wklog.34.new.29965 2010-01-22 11:38:57.000000000 +0200
@@ -2,3 +2,12 @@
and a parser for text form of .proto file which then exposes the parsed
file via standard GPB message navigation API.
+* We should have both server-side support and client-side support (client side
+ means functions in libmysqlclient so that user can select the full BLOB and
+ extract fields in the application).
+
+* Add some kind of header to the GPB blob to support versioning and future
+ extensibility.
+
+* Add complete syntax description (update, add, drop, exists, ...).
+
-=-=(Psergey - Tue, 21 Jul 2009, 21:13)=-=-
Low Level Design modified.
--- /tmp/wklog.34.old.6462 2009-07-21 21:13:13.000000000 +0300
+++ /tmp/wklog.34.new.6462 2009-07-21 21:13:13.000000000 +0300
@@ -1 +1,4 @@
+* GPB tarball contains a protocol definition for .proto file structure itself
+ and a parser for text form of .proto file which then exposes the parsed
+ file via standard GPB message navigation API.
-=-=(Psergey - Tue, 21 Jul 2009, 21:12)=-=-
High-Level Specification modified.
--- /tmp/wklog.34.old.6399 2009-07-21 21:12:23.000000000 +0300
+++ /tmp/wklog.34.new.6399 2009-07-21 21:12:23.000000000 +0300
@@ -1 +1,78 @@
+<contents>
+1. GPB Encoding overview
+2. GPB in an SQL database
+2.1 Informing server about GPB field names and types
+2.2 Addressing GPB fields
+2.2.1 Option1: SQL Function
+2.2.2 Option2: SQL columns
+</contents>
+
+
+1. GPB Encoding overview
+========================
+
+GBB is a compact encoding for structured and typed data. A unit of GPB data
+(it is called message) is only partially self-describing: it's possible to
+iterate over its parts, but, quoting the spec
+
+http://code.google.com/apis/protocolbuffers/docs/encoding.html:
+ " the name and declared type for each field can only be determined on the
+ decoding end by referencing the message type's definition (i.e. the .proto
+ file). "
+
+2. GPB in an SQL database
+=========================
+
+It is possible to store GPB data in MariaDB today - one can declare a binary
+blob column and use it to store GPB messages. Storing and retrieving entire
+messages will be the only available operations, though, as the server has no
+idea about the GPB format.
+It is apparent that ability to peek inside GPB data from SQL layer would be of
+great advantage: one would be able to
+- select only certain fields or parts of GPB messages
+- filter records based on the values of GPB fields
+- etc
+performing such operations at SQL layer will allow to reduce client<->server
+traffic right away, and will open path to getting the best possible
+performance.
+
+2.1 Informing server about GPB field names and types
+----------------------------------------------------
+User-friendly/meaningful access to GPB fields requires knowledge of GPB field
+names and types, which are not available from GPB message itself (see "GPB
+encoding overview" section).
+
+So the first issue to be addressed is to get the server to know the definition
+of stored messages. We intend to assume that all records have GPB messages
+that conform to a certain single definition, which gives one definition per
+GPB field.
+
+DecisionToMake: How to pass the server the GPB definition?
+First idea: add a CREATE TABLE parameter which will specify either the
+definition itself or path to .proto file with the definition.
+
+2.2 Addressing GPB fields
+-------------------------
+We'll need to provide a way to access GPB fields. This can be complicated as
+structures that are encoded in GPB message can be nested and recursive.
+
+2.2.1 Option1: SQL Function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Introduce an SQL function GPB_FIELD(path) which will return contents of the
+field.
+- Return type of the function will be determined from GPB message definition.
+- For path, we can use XPath selector (a subset of XPath) syntax.
+
+(TODO ^ the above needs to be specified in more detail. is the selector as
+simple as filesystem path or we allow quantifiers (with predicates?)?)
+
+2.2.2 Option2: SQL columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Make GPB columns to be accessible as SQL columns.
+This approach has problems:
+- It might be hard to implement code-wise
+ - (TODO will Virtual columns patch help??)
+- It is not clear how to access fields from nested structures. Should we allow
+ quoted names like `foo/bar[2]/baz' ?
+
DESCRIPTION:
Add support for dynamic columns:
- A column that can hold information from many columns
- One can instantly add or remove column data
This is a useful feature for any store-type application where you want to
store different types of information for different kinds of items.
For example, for shoes you want to store material, size, colour and maker;
for a computer you want to store RAM, hard disk size, etc.
In a normal 'relational' system you would need a table for each type.
With dynamic columns you have all common items as fixed fields (like
product_code, manufacturer, price) and the rest stored in a dynamic column.
The proposed idea is to store the dynamic information in a blob in
Google Protocol Buffers (hereafter GPB) format and use SQL constructs to
extract parts of the GPB data for use in the select list, for filtering,
and so forth.
Any support for indexing GPB data is outside the scope of this WL entry.
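As a minimal sketch of the intent (hypothetical schema, for illustration
only; nothing here is decided syntax):

  -- Hypothetical: common attributes live in fixed columns; the
  -- item-specific attributes are packed into one GPB-encoded blob.
  CREATE TABLE product (
    product_code VARCHAR(32) PRIMARY KEY,
    manufacturer VARCHAR(64),
    price        DECIMAL(10,2),
    attrs        BLOB  -- GPB message: material, size, colour, ram, ...
  );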
HIGH-LEVEL SPECIFICATION:
<contents>
1. GPB Encoding overview
2. GPB in an SQL database
2.1 Informing server about GPB field names and types
2.2 Addressing GPB fields
2.2.1 Option1: SQL Function
2.2.2 Option2: SQL columns
</contents>
1. GPB Encoding overview
========================
GPB is a compact encoding for structured and typed data. A unit of GPB data
(called a message) is only partially self-describing: it is possible to
iterate over its parts, but, quoting the spec at
http://code.google.com/apis/protocolbuffers/docs/encoding.html:
" the name and declared type for each field can only be determined on the
decoding end by referencing the message type's definition (i.e. the .proto
file). "
2. GPB in an SQL database
=========================
It is possible to store GPB data in MariaDB today: one can declare a binary
blob column and use it to store GPB messages. Storing and retrieving entire
messages will be the only available operations, though, as the server has no
idea about the GPB format.
It is apparent that the ability to peek inside GPB data from the SQL layer
would be a great advantage: one would be able to
- select only certain fields or parts of GPB messages
- filter records based on the values of GPB fields
- etc.
Performing such operations at the SQL layer reduces client<->server traffic
right away and opens the path to the best possible performance.
2.1 Informing server about GPB field names and types
----------------------------------------------------
User-friendly, meaningful access to GPB fields requires knowledge of the GPB
field names and types, which are not available from the GPB message itself
(see the "GPB Encoding overview" section).
So the first issue to be addressed is to let the server know the definition
of the stored messages. We intend to assume that all records have GPB
messages that conform to one single definition, which gives one definition
per GPB field.
DecisionToMake: How to pass the GPB definition to the server?
First idea: add a CREATE TABLE parameter which will specify either the
definition itself or the path to a .proto file with the definition.
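A hypothetical sketch of this first idea (the PROTO_DEFINITION and
PROTO_FILE option names are invented here purely for illustration; no
syntax has been decided):

  -- Hypothetical syntax: pass the .proto definition as a table option.
  CREATE TABLE product (
    product_code VARCHAR(32) PRIMARY KEY,
    attrs        BLOB
  ) PROTO_DEFINITION='message Attrs { optional string material = 1;
                                      optional int32 size = 2; }';

  -- or, alternatively, point the server at a definition file:
  -- ) PROTO_FILE='/usr/local/share/product_attrs.proto';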
2.2 Addressing GPB fields
-------------------------
We'll need to provide a way to access GPB fields. This can be complicated,
as the structures encoded in a GPB message can be nested and recursive.
2.2.1 Option1: SQL Function
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Introduce an SQL function GPB_FIELD(path) which will return the contents of
the field.
- The return type of the function will be determined from the GPB message
  definition.
- For the path, we can use XPath selector syntax (a subset of XPath).
(TODO: the above needs to be specified in more detail. Is the selector as
simple as a filesystem path, or do we allow quantifiers (with predicates)?)
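For example (hypothetical usage; the exact argument list of GPB_FIELD() and
the selector syntax are still to be decided):

  -- Hypothetical: pull one GPB field into the select list and filter
  -- on another one.
  SELECT product_code, GPB_FIELD('attrs/colour')
  FROM product
  WHERE GPB_FIELD('attrs/size') = 42;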
2.2.2 Option2: SQL columns
~~~~~~~~~~~~~~~~~~~~~~~~~~
Make GPB fields accessible as SQL columns (see the usage sketch below this
list).
This approach has problems:
- It might be hard to implement code-wise
  - (TODO: will the virtual columns patch help?)
- It is not clear how to access fields from nested structures. Should we
  allow quoted names like `foo/bar[2]/baz`?
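For comparison, a hypothetical sketch of Option 2 usage, with nested fields
addressed through quoted column names (again, nothing here is decided):

  -- Hypothetical: GPB fields exposed as quoted SQL columns.
  SELECT product_code, `attrs/colour`
  FROM product
  WHERE `attrs/size` = 42;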
LOW-LEVEL DESIGN:
* The GPB tarball contains a protocol definition for the .proto file
  structure itself, and a parser for the text form of a .proto file which
  then exposes the parsed file via the standard GPB message navigation API.
* We should have both server-side support and client-side support (client
  side means functions in libmysqlclient, so that the user can select the
  full BLOB and extract fields in the application).
* Add some kind of header to the GPB blob to support versioning and future
  extensibility.
* Add a complete syntax description (update, add, drop, exists, ...).
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
Re: [Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by Oleksandr Byelkin 11 Feb '10
Hi!
On 10 Feb 2010, at 21:38, Sergei Golubchik wrote:
[skip]
>>> Why use my_atomic_store32 ?
>>
>> As I understood the idea of atomic operations, it is guaranteed that we
>> will read a consistent value (not one byte from one value and another
>> byte from another). Yes, I remember your statement that on a modern
>> 32-bit system you always get it consistent; then why did we make atomic
>> operations at all?
>
> Because my_atomic_store32() also adds a full memory barrier to the
> atomic store operation. That is, if you do
>
> my_atomic_store32(&a, 1);
> my_atomic_store32(&b, 2);
>
> and then in another thread
>
> if (my_atomic_load32(&b) == 2)
> {
> ...
> here you can be sure that a==1, because a=1 was executed before
> b=2, and neither the compiler nor the CPU swapped the two assignments.
> }
>
In other words, it is the real current value of the variable in all
threads. That looks like what I need.
[Maria-developers] Rev 2758: Subquery optimizations: backport in file:///home/psergey/dev/maria-5.3-subqueries-r3/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r3/
------------------------------------------------------------
revno: 2758
revision-id: psergey(a)askmonty.org-20100211120315-o1hpcxl5lkbrbl25
parent: psergey(a)askmonty.org-20100209203217-al1k9h50zrlphy5d
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r3
timestamp: Thu 2010-02-11 15:03:15 +0300
message:
Subquery optimizations: backport
- Fix valgrind failure: do initialize Item::is_expensive_cache.
=== modified file 'sql/item.cc'
--- a/sql/item.cc 2010-02-08 13:10:19 +0000
+++ b/sql/item.cc 2010-02-11 12:03:15 +0000
@@ -373,8 +373,8 @@
Item::Item():
- rsize(0), name(0), orig_name(0), name_length(0), fixed(0),
- is_autogenerated_name(TRUE),
+ is_expensive_cache(-1), rsize(0), name(0), orig_name(0), name_length(0),
+ fixed(0), is_autogenerated_name(TRUE),
collation(&my_charset_bin, DERIVATION_COERCIBLE)
{
marker= 0;
@@ -410,6 +410,7 @@
tables.
*/
Item::Item(THD *thd, Item *item):
+ is_expensive_cache(-1),
rsize(0),
str_value(item->str_value),
name(item->name),
=== modified file 'sql/item.h'
--- a/sql/item.h 2010-01-28 13:48:33 +0000
+++ b/sql/item.h 2010-02-11 12:03:15 +0000
@@ -513,6 +513,9 @@
enum traverse_order { POSTFIX, PREFIX };
+ /* Cache of the result of is_expensive(). */
+ int8 is_expensive_cache;
+
/* Reuse size, only used by SP local variable assignment, otherwize 0 */
uint rsize;
@@ -878,9 +881,6 @@
static CHARSET_INFO *default_charset();
virtual CHARSET_INFO *compare_collation() { return NULL; }
- /* Cache of the result of is_expensive(). */
- int8 is_expensive_cache;
-
virtual bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
{
return (this->*processor)(arg);
[Maria-developers] [Branch ~maria-captains/maria/5.1] Rev 2815: Added option --temporary-tables to test speed of temporary tables
by noreply@launchpad.net 10 Feb '10
------------------------------------------------------------
revno: 2815
committer: Michael Widenius <monty(a)askmonty.org>
branch nick: maria-5.1
timestamp: Wed 2010-02-10 23:26:06 +0200
message:
Added option --temporary-tables to test speed of temporary tables
added:
mysql-test/suite/parts/t/partition_repair_myisam-master.opt
modified:
sql-bench/bench-init.pl.sh
sql-bench/server-cfg.sh
sql-bench/test-connect.sh
sql-bench/test-create.sh
--
lp:maria
https://code.launchpad.net/~maria-captains/maria/5.1
Your team Maria developers is subscribed to branch lp:maria.
To unsubscribe from this branch go to https://code.launchpad.net/~maria-captains/maria/5.1/+edit-subscription.
[Maria-developers] [Branch ~maria-captains/maria/5.1] Rev 2814: When one does a drop table, the indexes are not flushed to disk before drop anymore (with MyISAM/...
by noreply@launchpad.net 10 Feb '10
------------------------------------------------------------
revno: 2814
committer: Michael Widenius <monty(a)askmonty.org>
branch nick: maria-5.1
timestamp: Wed 2010-02-10 21:06:24 +0200
message:
When one does a drop table, the indexes are not flushed to disk before drop anymore (with MyISAM/Maria)
myisam-recover options changed from OFF to 'DEFAULT' to get less chance of data loss when using MyISAM.
(The disadvantage is that changed MyISAM tables will be checked at access time; use --myisam-recover=OFF for the old behavior)
Don't call extra(HA_EXTRA_FORCE_REOPEN) in ALTER TABLE if the table is locked, as this will mark the table as crashed!
Added an assert to detect if we would accidentally use MyISAM versioning in MySQL
modified:
include/my_base.h
mysql-test/mysql-test-run.pl
mysql-test/r/sp-destruct.result
mysql-test/r/variables.result
mysql-test/r/view.result
mysql-test/suite/maria/t/maria-recovery2-master.opt
mysql-test/t/sp-destruct.test
mysql-test/t/view.test
sql/lock.cc
sql/mysql_priv.h
sql/mysqld.cc
sql/sql_base.cc
sql/sql_delete.cc
sql/sql_table.cc
sql/table.cc
sql/table.h
storage/maria/ha_maria.cc
storage/maria/ma_blockrec.c
storage/maria/ma_close.c
storage/maria/ma_extra.c
storage/maria/ma_locking.c
storage/maria/ma_recovery.c
storage/maria/maria_def.h
storage/myisam/mi_close.c
storage/myisam/mi_extra.c
storage/myisam/mi_open.c
storage/myisam/myisamdef.h
--
lp:maria
https://code.launchpad.net/~maria-captains/maria/5.1
Your team Maria developers is subscribed to branch lp:maria.
To unsubscribe from this branch go to https://code.launchpad.net/~maria-captains/maria/5.1/+edit-subscription.
[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2815)
by Michael Widenius 10 Feb '10
#At lp:maria based on revid:monty@askmonty.org-20100210190624-38ucdn8y98k1v1zd
2815 Michael Widenius 2010-02-10
Added option --temporary-tables to test speed of temporary tables
added:
mysql-test/suite/parts/t/partition_repair_myisam-master.opt
modified:
sql-bench/bench-init.pl.sh
sql-bench/server-cfg.sh
sql-bench/test-connect.sh
sql-bench/test-create.sh
per-file messages:
mysql-test/suite/parts/t/partition_repair_myisam-master.opt
Added missing file from last push
sql-bench/bench-init.pl.sh
Added options:
--temporary-tables to test speed of temporary tables
sql-bench/server-cfg.sh
Added limit for number of temporary tables one can create
sql-bench/test-connect.sh
Skip test that doesn't work with temporary tables.
sql-bench/test-create.sh
Added limit for number of temporary tables one can create
=== added file 'mysql-test/suite/parts/t/partition_repair_myisam-master.opt'
--- a/mysql-test/suite/parts/t/partition_repair_myisam-master.opt 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/parts/t/partition_repair_myisam-master.opt 2010-02-10 21:26:06 +0000
@@ -0,0 +1 @@
+--myisam-recover=off
=== modified file 'sql-bench/bench-init.pl.sh'
--- a/sql-bench/bench-init.pl.sh 2010-02-09 17:17:04 +0000
+++ b/sql-bench/bench-init.pl.sh 2010-02-10 21:26:06 +0000
@@ -39,7 +39,7 @@ require "$pwd/server-cfg" || die "Can't
$|=1; # Output data immediately
-$opt_skip_test=$opt_skip_create=$opt_skip_delete=$opt_verbose=$opt_fast_insert=$opt_lock_tables=$opt_debug=$opt_skip_delete=$opt_fast=$opt_force=$opt_log=$opt_use_old_results=$opt_help=$opt_odbc=$opt_small_test=$opt_small_tables=$opt_samll_key_tables=$opt_stage=$opt_old_headers=$opt_die_on_errors=$opt_tcpip=$opt_random=$opt_only_missing_tests=0;
+$opt_skip_test=$opt_skip_create=$opt_skip_delete=$opt_verbose=$opt_fast_insert=$opt_lock_tables=$opt_debug=$opt_skip_delete=$opt_fast=$opt_force=$opt_log=$opt_use_old_results=$opt_help=$opt_odbc=$opt_small_test=$opt_small_tables=$opt_samll_key_tables=$opt_stage=$opt_old_headers=$opt_die_on_errors=$opt_tcpip=$opt_random=$opt_only_missing_tests=$opt_temporary_tables=0;
$opt_cmp=$opt_user=$opt_password=$opt_connect_options=$opt_connect_command= "";
$opt_server="mysql"; $opt_dir="output";
$opt_host="localhost";$opt_database="test";
@@ -59,7 +59,7 @@ $log_prog_args=join(" ", skip_arguments(
"use-old-results","skip-test",
"optimization","hw",
"machine", "dir", "suffix", "log"));
-GetOptions("skip-test=s","comments=s","cmp=s","server=s","user=s","host=s","database=s","password=s","loop-count=i","row-count=i","skip-create","skip-delete","verbose","fast-insert","lock-tables","debug","fast","force","field-count=i","regions=i","groups=i","time-limit=i","log","use-old-results","machine=s","dir=s","suffix=s","help","odbc","small-test","small-tables","small-key-tables","stage=i","threads=i","random","old-headers","die-on-errors","create-options=s","hires","tcpip","silent","optimization=s","hw=s","socket=s","connect-options=s","connect-command=s","only-missing-tests") || usage();
+GetOptions("skip-test=s","comments=s","cmp=s","server=s","user=s","host=s","database=s","password=s","loop-count=i","row-count=i","skip-create","skip-delete","verbose","fast-insert","lock-tables","debug","fast","force","field-count=i","regions=i","groups=i","time-limit=i","log","use-old-results","machine=s","dir=s","suffix=s","help","odbc","small-test","small-tables","small-key-tables","stage=i","threads=i","random","old-headers","die-on-errors","create-options=s","hires","tcpip","silent","optimization=s","hw=s","socket=s","connect-options=s","connect-command=s","only-missing-tests","temporary-tables") || usage();
usage() if ($opt_help);
$server=get_server($opt_server,$opt_host,$opt_database,$opt_odbc,
@@ -454,6 +454,9 @@ All benchmarks takes the following optio
create all MySQL tables as InnoDB tables use:
--create-options=ENGINE=InnoDB
+--temporary-tables
+ Use temporary tables for all tests.
+
--database (Default $opt_database)
In which database the test tables are created.
=== modified file 'sql-bench/server-cfg.sh'
--- a/sql-bench/server-cfg.sh 2010-02-09 17:17:04 +0000
+++ b/sql-bench/server-cfg.sh 2010-02-10 21:26:06 +0000
@@ -159,6 +159,7 @@ sub new
$limits{'max_index'} = 16; # Max number of keys
$limits{'max_index_parts'} = 16; # Max segments/key
$limits{'max_tables'} = (($machine || '') =~ "^win") ? 5000 : 65000;
+ $limits{'max_temporary_tables'}= 400;
$limits{'max_text_size'} = 1000000; # Good enough for tests
$limits{'multi_drop'} = 1; # Drop table can take many tables
$limits{'order_by_position'} = 1; # Can use 'ORDER BY 1'
@@ -189,6 +190,7 @@ sub new
$self->{'transactions'} = 1; # Transactions enabled
$limits{'max_columns'} = 90; # Max number of columns in table
$limits{'max_tables'} = 32; # No comments
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
}
if (defined($main::opt_create_options) &&
$main::opt_create_options =~ /engine=bdb/i)
@@ -200,6 +202,7 @@ sub new
{
$limits{'working_blobs'} = 0; # Blobs not implemented yet
$limits{'max_tables'} = 500;
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$self->{'transactions'} = 1; # Transactions enabled
}
@@ -270,7 +273,14 @@ sub create
my($self,$table_name,$fields,$index,$options) = @_;
my($query,@queries);
- $query="create table $table_name (";
+ if ($main::opt_temporary_tables)
+ {
+ $query="create temporary table $table_name (";
+ }
+ else
+ {
+ $query="create table $table_name (";
+ }
foreach $field (@$fields)
{
# $field =~ s/ decimal/ double(10,2)/i;
@@ -393,6 +403,7 @@ sub new
$limits{'max_conditions'} = 74;
$limits{'max_columns'} = 75;
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$limits{'max_text_size'} = 32000;
$limits{'query_size'} = 65535;
$limits{'max_index'} = 5;
@@ -622,7 +633,9 @@ sub new
$limits{'max_conditions'} = 9999; # This makes Pg real slow
$limits{'max_index'} = 64; # Big enough
$limits{'max_index_parts'} = 16;
- $limits{'max_tables'} = 5000; # 10000 crashes pg 7.0.2
+ $limits{'max_tables'} = 65000;
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 65000; # Good enough for test
$limits{'multi_drop'} = 1;
$limits{'order_by_position'} = 1;
@@ -873,6 +886,8 @@ sub new
$limits{'max_conditions'} = 9999; # Probably big enough
$limits{'max_columns'} = 2000; # From crash-me
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 65492; # According to tests
$limits{'query_size'} = 65535; # Probably a limit
$limits{'max_index'} = 64; # Probably big enough
@@ -1104,6 +1119,7 @@ sub new
# above this value .... but can handle 2419 columns
# maybe something for crash-me ... but how to check ???
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$limits{'max_text_size'} = 4095; # max returned ....
$limits{'query_size'} = 65535; # Not a limit, big enough
$limits{'max_index'} = 64; # Big enough
@@ -1374,6 +1390,8 @@ sub new
$limits{'max_conditions'} = 9999; # (Actually not a limit)
$limits{'max_columns'} = 254; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 2000; # Limit for blob test-connect
$limits{'query_size'} = 65525; # Max size with default buffers.
$limits{'max_index'} = 16; # Max number of keys
@@ -1647,6 +1665,8 @@ sub new
$limits{'max_column_name'} = 18; # max table and column name
$limits{'max_columns'} = 994; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_index'} = 64; # Max number of keys
$limits{'max_index_parts'} = 15; # Max segments/key
$limits{'max_text_size'} = 65535; # Max size with default buffers. ??
@@ -1835,6 +1855,8 @@ sub new
$limits{'max_conditions'} = 97; # We get 'Query is too complex'
$limits{'max_columns'} = 255; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 255; # Max size with default buffers.
$limits{'query_size'} = 65535; # Not a limit, big enough
$limits{'max_index'} = 32; # Max number of keys
@@ -2020,6 +2042,8 @@ sub new
$limits{'max_conditions'} = 1030; # We get 'Query is too complex'
$limits{'max_columns'} = 250; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 9830; # Max size with default buffers.
$limits{'query_size'} = 9830; # Max size with default buffers.
$limits{'max_index'} = 64; # Max number of keys
@@ -2216,6 +2240,8 @@ sub new
$limits{'max_conditions'} = 1030; # We get 'Query is too complex'
$limits{'max_columns'} = 250; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 9830; # Max size with default buffers.
$limits{'query_size'} = 9830; # Max size with default buffers.
$limits{'max_index'} = 64; # Max number of keys
@@ -2448,6 +2474,8 @@ sub new
$limits{'max_conditions'} = 50; # (Actually not a limit)
$limits{'max_columns'} = 254; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 2000; # Limit for blob test-connect
$limits{'query_size'} = 65525; # Max size with default buffers.
$limits{'max_index'} = 16; # Max number of keys
@@ -2652,6 +2680,8 @@ sub new
$limits{'max_conditions'} = 418; # We get 'Query is too complex'
$limits{'max_columns'} = 500; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
+
$limits{'max_text_size'} = 254; # Max size with default buffers.
$limits{'query_size'} = 254; # Max size with default buffers.
$limits{'max_index'} = 48; # Max number of keys
@@ -2830,6 +2860,7 @@ sub new
$limits{'max_conditions'} = 9999; # (Actually not a limit)
$limits{'max_columns'} = 252; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$limits{'max_text_size'} = 15000; # Max size with default buffers.
$limits{'query_size'} = 1000000; # Max size with default buffers.
$limits{'max_index'} = 32; # Max number of keys
@@ -3032,6 +3063,7 @@ sub new
$limits{'max_conditions'} = 9999; # (Actually not a limit)
$limits{'max_columns'} = 252; # Max number of columns in table
$limits{'max_tables'} = 65000; # Should be big enough
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$limits{'max_text_size'} = 15000; # Max size with default buffers.
$limits{'query_size'} = 1000000; # Max size with default buffers.
$limits{'max_index'} = 65000; # Max number of keys
@@ -3228,6 +3260,7 @@ sub new
# The following should be 8192, but is smaller because Frontbase crashes..
$limits{'max_columns'} = 150; # Max number of columns in table
$limits{'max_tables'} = 5000; # 10000 crashed FrontBase
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$limits{'max_text_size'} = 65000; # Max size with default buffers.
$limits{'query_size'} = 8000000; # Max size with default buffers.
$limits{'max_index'} = 38; # Max number of keys
@@ -3440,6 +3473,7 @@ sub new
$limits{'max_conditions'} = 9999; # (Actually not a limit) *
$limits{'max_columns'} = 1023; # Max number of columns in table *
$limits{'max_tables'} = 65000; # Should be big enough * unlimited actually
+ $limits{'max_temporary_tables'}= $limits{"max_tables"};
$limits{'max_text_size'} = 15000; # Max size with default buffers.
$limits{'query_size'} = 64*1024; # Max size with default buffers. *64 kb by default. May be set by system variable
$limits{'max_index'} = 510; # Max number of keys *
=== modified file 'sql-bench/test-connect.sh'
--- a/sql-bench/test-connect.sh 2009-05-29 13:40:55 +0000
+++ b/sql-bench/test-connect.sh 2010-02-10 21:26:06 +0000
@@ -161,41 +161,48 @@ if ($opt_fast && defined($server->{vacuu
{
$server->vacuum(0,\$dbh);
}
-$dbh->disconnect;
+if (!$main::opt_temporary_tables)
+{
+ $dbh->disconnect;
+}
#
# First test connect/select/disconnect
#
-print "Testing connect/select 1 row from table/disconnect\n";
+if (!$main::opt_temporary_tables)
+{
+ print "Testing connect/select 1 row from table/disconnect\n";
-$loop_time=new Benchmark;
-$errors=0;
+ $loop_time=new Benchmark;
+ $errors=0;
-for ($i=0 ; $i < $small_loop_count ; $i++)
-{
- for ($j=0; $j < $max_test ; $j++)
+ for ($i=0 ; $i < $small_loop_count ; $i++)
{
- last if ($dbh = DBI->connect($server->{'data_source'}, $opt_user, $opt_password));
- $errors++;
- }
- die $DBI::errstr if ($j == $max_test);
+ for ($j=0; $j < $max_test ; $j++)
+ {
+ last if ($dbh = DBI->connect($server->{'data_source'}, $opt_user, $opt_password));
+ $errors++;
+ }
+ die $DBI::errstr if ($j == $max_test);
- $sth = $dbh->do("select a,i,s,$i from bench1") # Select * from table with 1 record
+ $sth = $dbh->do("select a,i,s,$i from bench1") # Select * from table with 1 record
or die $DBI::errstr;
- $dbh->disconnect;
-}
+ $dbh->disconnect;
+ }
-$end_time=new Benchmark;
-print "Warning: $errors connections didn't work without a time delay\n" if ($errors);
-print "Time to connect+select_1_row ($small_loop_count): " .
+ $end_time=new Benchmark;
+ print "Warning: $errors connections didn't work without a time delay\n" if ($errors);
+ print "Time to connect+select_1_row ($small_loop_count): " .
timestr(timediff($end_time, $loop_time),"all") . "\n\n";
+ $dbh = $server->connect();
+}
+
#
# The same test, but without connect/disconnect
#
print "Testing select 1 row from table\n";
-$dbh = $server->connect();
$loop_time=new Benchmark;
for ($i=0 ; $i < $opt_loop_count ; $i++)
=== modified file 'sql-bench/test-create.sh'
--- a/sql-bench/test-create.sh 2009-05-29 13:40:55 +0000
+++ b/sql-bench/test-create.sh 2010-02-10 21:26:06 +0000
@@ -47,7 +47,15 @@ if ($opt_small_test)
$create_loop_count/=1000;
}
-$max_tables=min($limits->{'max_tables'},$opt_loop_count);
+if ($opt_temporary_tables)
+{
+ $max_tables=min($limits->{'max_tables'},$opt_loop_count);
+}
+else
+{
+ $max_tables=min($limits->{'max_tables'},$opt_loop_count);
+ $max_tables=400;
+}
if ($opt_small_test)
{
@@ -71,7 +79,7 @@ $dbh = $server->connect();
if ($opt_force) # If tables used in this test exist, drop 'em
{
print "Okay..Let's make sure that our tables don't exist yet.\n\n";
- for ($i=1 ; $i <= $max_tables ; $i++)
+ for ($i=1 ; $i <= max($max_tables, $create_loop_count) ; $i++)
{
$dbh->do("drop table bench_$i" . $server->{'drop_attr'});
}
@@ -245,7 +253,7 @@ for ($i=2 ; $i <= $keys ; $i++)
}
$loop_time=new Benchmark;
-for ($i=1 ; $i <= $opt_loop_count ; $i++)
+for ($i=1 ; $i <= $create_loop_count ; $i++)
{
do_many($dbh,$server->create("bench_$i", \@fields, \@keys));
$dbh->do("drop table bench_$i" . $server->{'drop_attr'}) or die $DBI::errstr;
[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 10 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100210205026-8l8veoi8dbon5cwl
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Wed 2010-02-10 22:50:26 +0200
message:
Group commit for maria storage engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-10 20:50:26 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-10 20:50:26 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-10 20:50:26 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-10 20:50:26 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3275,6 +3311,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3306,6 +3344,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3327,6 +3451,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-10 20:50:26 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-10 20:50:26 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+ If the buffer was forced to close it save value of its horizon
+ otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+ When moving from one log buffer to another, we write the last of the
+ previous buffer to file and then move to start using the new log
+ buffer. In the case of a part filed last page, this page is not moved
+ to the start of the new buffer but instead we set the 'skip_data'
+ variable to tell us how much data at the beginning of the buffer is not
+ relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ File numbers interval which have to be sync()
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+ We should not increase counter in case of error above, but it is so
+ unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+ We are reading a page part of which is not present in the buffer,
+ so we have to read the absent part from the file (actually from the
+ page cache)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's OK not to lock the page because:
+ - The log handler has its own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+ skipped_data= 0; /* Read after skipped in buffer data */
+ /*
+ Now bytes [0, skipped_data) of 'buffer' are correct. The following
+ memcpy() copies the rest of the page from the internal buffer,
+ i.e. the data that has not yet reached disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can then use the copy in translog_page_validator() because it
does not put it permanently anywhere.
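
The reconstruction above can be reduced to a small standalone sketch: when
the first 'skipped' bytes of a page exist only on disk (the writer skipped
them because they can no longer change) while the rest is newer in memory,
a reader rebuilds the page from both sources. Assuming POSIX I/O; all
demo_* names are hypothetical:

    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    #define DEMO_PAGE_SIZE 8192

    /* Rebuild a page whose prefix [0, skipped) lives only on disk while
       the suffix [skipped, DEMO_PAGE_SIZE) is newer in memory. */
    static int demo_read_mixed_page(int fd, off_t page_offset,
                                    const unsigned char *mem_page,
                                    size_t skipped, unsigned char *out)
    {
      if (pread(fd, out, DEMO_PAGE_SIZE, page_offset) != DEMO_PAGE_SIZE)
        return 1;                           /* disk copy supplies the prefix */
      memcpy(out + skipped, mem_page + skipped, DEMO_PAGE_SIZE - skipped);
      return 0;                             /* memory copy supplies the rest */
    }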
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3511,6 +3628,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4030,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4112,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4191,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4281,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7010,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7481,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7512,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7636,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change when more data is added to the
+ page, so we do not copy it here and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7698,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7710,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() a range of files (inclusive) and, on request, the directory
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir TRUE if the log directory should be synced as well
+
+ @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
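
The function above boils down to "fsync every not-yet-synced file in a
numeric range"; a minimal standalone sketch of that core, with hypothetical
demo_* names that are not part of the patch:

    #include <unistd.h>

    struct demo_file { int fd; int is_sync; };

    /* Sync files [min, max], skipping files already known to be clean. */
    static int demo_sync_range(struct demo_file *files, unsigned min,
                               unsigned max, unsigned long long *syncs)
    {
      unsigned fn;
      for (fn= min; fn <= max; fn++)
      {
        struct demo_file *f= &files[fn];
        if (!f->is_sync)
        {
          if (fsync(f->fd))
            return 1;             /* the patch also stops log writing here */
          (*syncs)++;
          f->is_sync= 1;          /* a later flush of a clean file is free */
        }
      }
      return 0;
    }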
+/*
+ @brief Flushes buffers containing LSNs less than or equal to address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed;
+ may be reset to the real last LSN address
+ @param sent_to_disk returns the 'sent to disk' position
+ @param flush_horizon returns the horizon of the flush
+
+ @note For terminology see the comment for translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck the information when we lock the buffers one by
@@ -7656,15 +7806,15 @@
/*
if the LSN up to which we have to flush is bigger than the maximum LSN of
the previous buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7826,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7849,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: commits are made in passes. The thread that
+ started the flush first performs the actual flush; other threads set the
+ new goal (LSN) of the next pass (if it is the maximum) and wait for the
+ pass to end, or just wait for the pass to end.
+
+ - If hard group commit enabled and rate set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as new LSNs are added. The process cannot loop
+ forever because we have a limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit is switched ON and less than 'rate' microseconds
+ have passed since the last sync, then after the buffers have been sent to
+ disk we wait until 'rate' microseconds have passed since the last sync,
+ sync, and return. This way, if sync is called infrequently, no time is
+ spent waiting (a standalone sketch of this timed wait follows
+ translog_flush() below).
+
+ - If soft group commit is enabled, everything works as with 'non group
+ commit', but the thread does not do any real sync(). If the rate is not
+ zero, the sync() is performed by a service thread at the given rate
+ when needed (i.e. when a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() releases log_flush_lock while it is
+ waiting and then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* The following function flushes the buffers and does translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check the time here because pthread_mutex_lock rarely takes
+ a lot of time, so we can sacrifice a bit of precision for performance
+ (taking into account that my_micro_time() might be an expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other threads from continuing */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() the files from the previous flush up to the current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep the values used by soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
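
The timed wait in the hard group commit path above can be shown in
isolation: wait on a condition until either a new flush goal is signalled
or the rest of the group-commit interval has elapsed, after which the
caller retests its conditions (pthread_cond_timedwait() may also wake
spuriously). A sketch assuming POSIX threads; the demo_* names are
hypothetical:

    #include <pthread.h>
    #include <time.h>

    /* Sleep until 'left_us' microseconds pass or 'new_goal' is signalled. */
    static void demo_wait_goal_or_deadline(pthread_mutex_t *lock,
                                           pthread_cond_t *new_goal,
                                           unsigned long long left_us)
    {
      struct timespec abstime;
      clock_gettime(CLOCK_REALTIME, &abstime);        /* absolute deadline */
      abstime.tv_sec+= (time_t) (left_us / 1000000ULL);
      abstime.tv_nsec+= (long) ((left_us % 1000000ULL) * 1000ULL);
      if (abstime.tv_nsec >= 1000000000L)
      {
        abstime.tv_sec++;
        abstime.tv_nsec-= 1000000000L;
      }
      pthread_mutex_lock(lock);
      pthread_cond_timedwait(new_goal, lock, &abstime);
      pthread_mutex_unlock(lock);
    }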
@@ -8113,6 +8450,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8459,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
@@ -8356,6 +8709,159 @@
}
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("ma_translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function together with an additional variable because
+ we have to restart the service thread with the new value, which we cannot
+ do inside the variable-update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
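
For reference, the structure of the service loop above (grab the published
[min, max] file window, reset the request flag, sync, sleep out the rest of
the interval) as a standalone sketch using C11 atomics; everything named
demo_* is hypothetical and not part of the patch:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <unistd.h>

    static atomic_uint demo_min, demo_max, demo_need;   /* published window */
    static atomic_int demo_shutdown;

    static void demo_sync_files(unsigned min, unsigned max)
    {
      (void) min; (void) max;    /* stand-in for fsync()ing files min..max */
    }

    /* Periodically sync whatever file window writers have published. */
    static void *demo_soft_sync_thread(void *arg)
    {
      unsigned interval_us= *(unsigned *) arg;
      while (!atomic_load(&demo_shutdown))
      {
        unsigned min= atomic_load(&demo_min);
        unsigned max= atomic_load(&demo_max);
        atomic_store(&demo_min, max);  /* window below max is being synced */
        if (atomic_exchange(&demo_need, 0))
          demo_sync_files(min, max);
        usleep(interval_us); /* the patch subtracts the time already spent */
      }
      return NULL;
    }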
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-10 20:50:26 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND

[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 10 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100209083259-ekki5zw4hbaeqpwh
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Tue 2010-02-09 10:32:59 +0200
message:
Group commit for maria storage engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-09 08:32:59 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-09 08:32:59 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-09 08:32:59 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-09 08:32:59 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ "for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ "if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3275,6 +3311,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3306,6 +3344,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3327,6 +3451,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-09 08:32:59 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-09 08:32:59 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief protects the soft sync state */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for signalling the background soft sync thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for the soft sync background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,20 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+ If the buffer was forced to close, the value of its horizon is saved
+ here; otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+ How much data was skipped when a page was moved from the previous
+ buffer to this one (an optimisation of forcing the buffer to finish)
+ */
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +324,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +365,38 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ Interval of file numbers which have to be sync()ed
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1026,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1120,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1129,13 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ if (!(rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0)))
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1477,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1491,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1546,10 @@
TODO: sync only if we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2104,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2113,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2585,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2620,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2638,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2657,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3055,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3068,25 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data > (addr - curr_buffer->offset))
+ {
+ /*
+ We are reading a page part of which is not present in the buffer,
+ so we have to read the absent part from the file (actually from the
+ page cache)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ buffer= pagecache_read(log_descriptor.pagecache, &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL);
+ }
+ else
+ skipped_data= 0; /* Read after skipped in buffer data */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can then use the copy in translog_page_validator() because it
does not put it permanently anywhere.
@@ -3291,6 +3380,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3406,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3511,6 +3606,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4008,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4090,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4169,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4259,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +6988,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7459,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7490,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7614,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change when more data is added to the
+ page, so we do not copy it here and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7676,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7688,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() a range of files (inclusive) and, on request, the directory
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir TRUE if the log directory should be synced as well
+
+ @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ @brief Flushes buffers containing LSNs less than or equal to address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed;
+ may be reset to the real last LSN address
+ @param sent_to_disk returns the 'sent to disk' position
+ @param flush_horizon returns the horizon of the flush
+
+ @note For terminology see the comment for translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck the information when we lock the buffers one by
@@ -7656,15 +7784,15 @@
/*
if LSN up to which we have to flush bigger then maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7804,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7827,237 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits are made in passes. The thread that
+ started the flush first performs the actual flush; other threads set a
+ new goal (LSN) for the next pass (if theirs is the maximum) and wait for
+ the pass to end, or simply wait for the pass to end.
+
+ - If hard group commit is enabled and the rate is set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as new LSNs are added. The process cannot loop forever because
+ we have a limited number of threads and they will wait for the data
+ to be synced.
+ Pseudo code:
+
+ do
+   send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit is switched ON and less than rate microseconds
+ have passed since the last sync, then after the buffers have been sent
+ to disk, wait until rate microseconds have passed since the last sync,
+ then do the sync and return. This ensures that if we call sync
+ infrequently we don't do any extra waits.
+
+ - If soft group commit is enabled everything works as with 'non group
+ commit' but the thread doesn't do any real sync(). If the rate is not
+ zero the sync() will be performed by a service thread at the given
+ rate when needed (a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() releases log_flush_lock while it is
+ waiting and then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ if (flush_interval != 0 &&
+ (my_micro_time() - flush_start) >= flush_interval)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn != LSN_IMPOSSIBLE)
+ {
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other threads from continuing */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ }
+ else
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() files from previous flush till current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep values for soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
@@ -8113,6 +8427,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8436,23 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
@@ -8356,6 +8683,155 @@
}
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function with an additional variable because we have
+ to restart the service thread with the new value, which we can't do
+ inside the variable update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-09 08:32:59 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND
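
For illustration, here is how the new settings might be used once this
patch is in. This is only a sketch: the variable names maria_group_commit
and maria_group_commit_interval come from the header above, but the
accepted values are an assumption based on the TRANSLOG_GCOMMIT_* enum,
so check the final implementation:

  # assumption: the enum values map to 'none', 'hard' and 'soft'
  mysql -e "SET GLOBAL maria_group_commit='soft'"
  # assumption: the interval is in microseconds, like group_commit_wait
  mysql -e "SET GLOBAL maria_group_commit_interval=1000"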
[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2814)
by Michael Widenius 10 Feb '10
#At lp:maria based on revid:monty@askmonty.org-20100209171704-h7stfhbh94k54tbf
2814 Michael Widenius 2010-02-10
When one does a drop table, the indexes are not flushed to disk before drop anymore (with MyISAM/Maria)
myisam-recover options changed from OFF to 'DEFAULT' to get less chance of data loss when using MyISAM.
(The disadvantage is that changed MyISAM tables will be checked at access time; use --myisam-recover=OFF for the old behavior)
Don't call extra(HA_EXTRA_FORCE_REOPEN) in ALTER TABLE if table is locked as this will mark table as crashed!
Added assert to detect if we accidentally would use MyISAM versioning in MySQL
modified:
include/my_base.h
mysql-test/mysql-test-run.pl
mysql-test/r/sp-destruct.result
mysql-test/r/variables.result
mysql-test/r/view.result
mysql-test/suite/maria/t/maria-recovery2-master.opt
mysql-test/t/sp-destruct.test
mysql-test/t/view.test
sql/lock.cc
sql/mysql_priv.h
sql/mysqld.cc
sql/sql_base.cc
sql/sql_delete.cc
sql/sql_table.cc
sql/table.cc
sql/table.h
storage/maria/ha_maria.cc
storage/maria/ma_blockrec.c
storage/maria/ma_close.c
storage/maria/ma_extra.c
storage/maria/ma_locking.c
storage/maria/ma_recovery.c
storage/maria/maria_def.h
storage/myisam/mi_close.c
storage/myisam/mi_extra.c
storage/myisam/mi_open.c
storage/myisam/myisamdef.h
per-file messages:
include/my_base.h
Mark NOT_USED as USED, as we now use this as a flag to not call extra()
mysql-test/mysql-test-run.pl
Don't write all options when there is something wrong with the arguments
mysql-test/r/sp-destruct.result
Add missing flush of mysql.proc (as the test copied live tables)
mysql-test/r/variables.result
myisam-recover options changed to 'default'
mysql-test/r/view.result
Don't show create time in result
mysql-test/suite/maria/t/maria-recovery2-master.opt
Don't run test with myisam-recover (as this produces extra warnings during simulated death)
mysql-test/t/sp-destruct.test
Add missing flush of mysql.proc (as the test copied live tables)
mysql-test/t/view.test
Don't show create time in result
sql/lock.cc
Added marker if table was deleted to argument list
sql/mysql_priv.h
Added marker if table was deleted to argument list
sql/mysqld.cc
myisam-recover options changed from OFF to 'DEFAULT' to get less chance of data loss when using MyISAM
Allow one to specify OFF as argument to myisam-recover (it was the default before, but one couldn't specify it explicitly); see the usage sketch after these per-file notes
sql/sql_base.cc
Mark if table is going to be deleted
sql/sql_delete.cc
Mark if table is going to be deleted
sql/sql_table.cc
Mark if table is going to be deleted
Don't call extra(HA_EXTRA_FORCE_REOPEN) in ALTER TABLE if table is locked as this will mark table as crashed!
sql/table.cc
Signal to handler if table is getting deleted as part of being dropped from the table cache.
sql/table.h
Added marker if table is going to be deleted.
storage/maria/ha_maria.cc
Don't search for transaction handler if file is not transactional or outside of transaction
(Fixed possible core dump)
storage/maria/ma_blockrec.c
Don't write changed information if table is going to be deleted.
storage/maria/ma_close.c
Don't write changed information if table is going to be deleted.
storage/maria/ma_extra.c
Mark tables that are deleted as crashed, to ensure good behavior on restart if we suddenly crash.
storage/maria/ma_locking.c
Cleanup
storage/maria/ma_recovery.c
We need trnman to be inited during redo phase (to be able to open tables checked with maria_chk)
storage/maria/maria_def.h
Added marker if table is going to be deleted.
storage/myisam/mi_close.c
Don't write changed information if table is going to be deleted.
storage/myisam/mi_extra.c
Mark tables that are deleted as crashed, to ensure good behavior on restart if we suddenly crash.
storage/myisam/mi_open.c
Added assert to detect if we accidentally would use MyISAM versioning in MySQL
storage/myisam/myisamdef.h
Added marker if table is going to be deleted.
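
As a usage sketch of the myisam-recover change described above (option
values as named in the message, not taken from a manual):

  # new default: auto-repair crashed MyISAM tables at access time
  mysqld --myisam-recover=DEFAULT
  # explicitly restore the old behavior (no automatic recovery)
  mysqld --myisam-recover=OFF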
=== modified file 'include/my_base.h'
--- a/include/my_base.h 2009-09-07 20:50:10 +0000
+++ b/include/my_base.h 2010-02-10 19:06:24 +0000
@@ -111,7 +111,7 @@ enum ha_storage_media {
enum ha_extra_function {
HA_EXTRA_NORMAL=0, /* Optimize for space (def) */
HA_EXTRA_QUICK=1, /* Optimize for speed */
- HA_EXTRA_NOT_USED=2,
+ HA_EXTRA_NOT_USED=2, /* Should be ignored by handler */
HA_EXTRA_CACHE=3, /* Cache record in HA_rrnd() */
HA_EXTRA_NO_CACHE=4, /* End caching of records (def) */
HA_EXTRA_NO_READCHECK=5, /* No readcheck on update */
=== modified file 'mysql-test/mysql-test-run.pl'
--- a/mysql-test/mysql-test-run.pl 2010-01-29 10:42:31 +0000
+++ b/mysql-test/mysql-test-run.pl 2010-02-10 19:06:24 +0000
@@ -5542,6 +5542,8 @@ sub usage ($) {
if ( $message )
{
print STDERR "$message\n";
+ print STDERR "For full list of options, use $0 --help\n";
+ exit;
}
print <<HERE;
=== modified file 'mysql-test/r/sp-destruct.result'
--- a/mysql-test/r/sp-destruct.result 2009-11-21 11:18:21 +0000
+++ b/mysql-test/r/sp-destruct.result 2010-02-10 19:06:24 +0000
@@ -1,4 +1,5 @@
call mtr.add_suppression("Column count of mysql.proc is wrong. Expected 20, found 19. The table is probably corrupted");
+flush table mysql.proc;
use test;
drop procedure if exists bug14233;
drop function if exists bug14233;
=== modified file 'mysql-test/r/variables.result'
--- a/mysql-test/r/variables.result 2010-01-11 13:15:28 +0000
+++ b/mysql-test/r/variables.result 2010-02-10 19:06:24 +0000
@@ -1261,12 +1261,12 @@ ERROR HY000: Variable 'lower_case_table_
#
SHOW VARIABLES like 'myisam_recover_options';
Variable_name Value
-myisam_recover_options OFF
+myisam_recover_options DEFAULT
SELECT @@session.myisam_recover_options;
ERROR HY000: Variable 'myisam_recover_options' is a GLOBAL variable
SELECT @@global.myisam_recover_options;
@@global.myisam_recover_options
-OFF
+DEFAULT
SET @@session.myisam_recover_options= 'x';
ERROR HY000: Variable 'myisam_recover_options' is a read only variable
SET @@global.myisam_recover_options= 'x';
=== modified file 'mysql-test/r/view.result'
--- a/mysql-test/r/view.result 2009-10-15 21:38:29 +0000
+++ b/mysql-test/r/view.result 2010-02-10 19:06:24 +0000
@@ -155,13 +155,13 @@ v5 VIEW
v6 VIEW
show table status;
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 MyISAM 10 Fixed 5 9 45 # 1024 0 NULL # # NULL latin1_swedish_ci NULL
-v1 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # NULL NULL NULL NULL VIEW
-v2 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # NULL NULL NULL NULL VIEW
-v3 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # NULL NULL NULL NULL VIEW
-v4 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # NULL NULL NULL NULL VIEW
-v5 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # NULL NULL NULL NULL VIEW
-v6 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # NULL NULL NULL NULL VIEW
+t1 MyISAM 10 Fixed 5 9 45 # 1024 0 NULL # # # latin1_swedish_ci NULL
+v1 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # # NULL NULL NULL VIEW
+v2 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # # NULL NULL NULL VIEW
+v3 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # # NULL NULL NULL VIEW
+v4 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # # NULL NULL NULL VIEW
+v5 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # # NULL NULL NULL VIEW
+v6 NULL NULL NULL NULL NULL NULL # NULL NULL NULL # # # NULL NULL NULL VIEW
drop view v1,v2,v3,v4,v5,v6;
create view v1 (c,d,e,f) as select a,b,
a in (select a+2 from t1), a = all (select a from t1) from t1;
=== modified file 'mysql-test/suite/maria/t/maria-recovery2-master.opt'
--- a/mysql-test/suite/maria/t/maria-recovery2-master.opt 2009-01-15 14:29:14 +0000
+++ b/mysql-test/suite/maria/t/maria-recovery2-master.opt 2010-02-10 19:06:24 +0000
@@ -1 +1 @@
---skip-stack-trace --skip-core-file --loose-maria-log-dir-path=$MYSQLTEST_VARDIR/tmp
+--skip-stack-trace --skip-core-file --loose-maria-log-dir-path=$MYSQLTEST_VARDIR/tmp --myisam-recover=
=== modified file 'mysql-test/t/sp-destruct.test'
--- a/mysql-test/t/sp-destruct.test 2009-11-21 11:18:21 +0000
+++ b/mysql-test/t/sp-destruct.test 2010-02-10 19:06:24 +0000
@@ -17,6 +17,7 @@ call mtr.add_suppression("Column count o
# Backup proc table
let $MYSQLD_DATADIR= `select @@datadir`;
+flush table mysql.proc;
--copy_file $MYSQLD_DATADIR/mysql/proc.frm $MYSQLTEST_VARDIR/tmp/proc.frm
--copy_file $MYSQLD_DATADIR/mysql/proc.MYD $MYSQLTEST_VARDIR/tmp/proc.MYD
--copy_file $MYSQLD_DATADIR/mysql/proc.MYI $MYSQLTEST_VARDIR/tmp/proc.MYI
=== modified file 'mysql-test/t/view.test'
--- a/mysql-test/t/view.test 2009-10-15 21:38:29 +0000
+++ b/mysql-test/t/view.test 2010-02-10 19:06:24 +0000
@@ -87,7 +87,7 @@ explain extended select c from v6;
# show table/table status test
show tables;
show full tables;
---replace_column 8 # 12 # 13 #
+--replace_column 8 # 12 # 13 # 14 #
show table status;
drop view v1,v2,v3,v4,v5,v6;
=== modified file 'sql/lock.cc'
--- a/sql/lock.cc 2009-10-15 21:38:29 +0000
+++ b/sql/lock.cc 2010-02-10 19:06:24 +0000
@@ -1049,10 +1049,14 @@ int lock_table_name(THD *thd, TABLE_LIST
DBUG_RETURN(-1);
table_list->table=table;
+ table->s->deleting= table_list->deleting;
/* Return 1 if table is in use */
DBUG_RETURN(test(remove_table_from_cache(thd, db, table_list->table_name,
- check_in_use ? RTFC_NO_FLAG : RTFC_WAIT_OTHER_THREAD_FLAG)));
+ (check_in_use ?
+ RTFC_NO_FLAG :
+ RTFC_WAIT_OTHER_THREAD_FLAG),
+ table_list->deleting)));
}
=== modified file 'sql/mysql_priv.h'
--- a/sql/mysql_priv.h 2009-12-03 11:19:05 +0000
+++ b/sql/mysql_priv.h 2010-02-10 19:06:24 +0000
@@ -1636,7 +1636,7 @@ uint prep_alter_part_table(THD *thd, TAB
#define RTFC_WAIT_OTHER_THREAD_FLAG 0x0002
#define RTFC_CHECK_KILLED_FLAG 0x0004
bool remove_table_from_cache(THD *thd, const char *db, const char *table,
- uint flags);
+ uint flags, my_bool deleting);
#define NORMAL_PART_NAME 0
#define TEMP_PART_NAME 1
=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc 2010-01-29 18:42:22 +0000
+++ b/sql/mysqld.cc 2010-02-10 19:06:24 +0000
@@ -7962,7 +7962,13 @@ static int mysql_init_variables(void)
refresh_version= 1L; /* Increments on each reload */
global_query_id= thread_id= 1L;
strmov(server_version, MYSQL_SERVER_VERSION);
- myisam_recover_options_str= sql_mode_str= "OFF";
+ sql_mode_str= "";
+
+ /* By default, auto-repair MyISAM tables after crash */
+ myisam_recover_options_str= "DEFAULT";
+ myisam_recover_options= HA_RECOVER_DEFAULT;
+ ha_open_options|= HA_OPEN_ABORT_IF_CRASHED;
+
myisam_stats_method_str= "nulls_unequal";
my_bind_addr = htonl(INADDR_ANY);
threads.empty();
@@ -8616,26 +8622,31 @@ mysqld_get_one_option(int optid,
#endif
case OPT_MYISAM_RECOVER:
{
- if (!argument)
- {
- myisam_recover_options= HA_RECOVER_DEFAULT;
- myisam_recover_options_str= myisam_recover_typelib.type_names[0];
- }
- else if (!argument[0])
+ if (argument && (!argument[0] ||
+ my_strcasecmp(system_charset_info, argument, "OFF") == 0))
{
myisam_recover_options= HA_RECOVER_NONE;
myisam_recover_options_str= "OFF";
+ ha_open_options&= ~HA_OPEN_ABORT_IF_CRASHED;
}
else
{
- myisam_recover_options_str=argument;
- myisam_recover_options=
- find_bit_type_or_exit(argument, &myisam_recover_typelib, opt->name,
- &error);
- if (error)
- return 1;
+ if (!argument)
+ {
+ myisam_recover_options= HA_RECOVER_DEFAULT;
+ myisam_recover_options_str= myisam_recover_typelib.type_names[0];
+ }
+ else
+ {
+ myisam_recover_options_str=argument;
+ myisam_recover_options=
+ find_bit_type_or_exit(argument, &myisam_recover_typelib, opt->name,
+ &error);
+ if (error)
+ return 1;
+ }
+ ha_open_options|=HA_OPEN_ABORT_IF_CRASHED;
}
- ha_open_options|=HA_OPEN_ABORT_IF_CRASHED;
break;
}
case OPT_CONCURRENT_INSERT:
=== modified file 'sql/sql_base.cc'
--- a/sql/sql_base.cc 2010-01-15 15:27:55 +0000
+++ b/sql/sql_base.cc 2010-02-10 19:06:24 +0000
@@ -930,7 +930,7 @@ bool close_cached_tables(THD *thd, TABLE
for (TABLE_LIST *table= tables; table; table= table->next_local)
{
if (remove_table_from_cache(thd, table->db, table->table_name,
- RTFC_OWNED_BY_THD_FLAG))
+ RTFC_OWNED_BY_THD_FLAG, table->deleting))
found=1;
}
if (!found)
@@ -8404,6 +8404,11 @@ void remove_db_from_cache(const char *db
if (!strcmp(table->s->db.str, db))
{
table->s->version= 0L; /* Free when thread is ready */
+ /*
+ This function is only called from DROP DATABASE code, where we are
+ going to drop all tables, so we mark them as deleting
+ */
+ table->s->deleting= TRUE;
if (!table->in_use)
relink_unused(table);
}
@@ -8446,7 +8451,7 @@ void flush_tables()
*/
bool remove_table_from_cache(THD *thd, const char *db, const char *table_name,
- uint flags)
+ uint flags, my_bool deleting)
{
char key[MAX_DBKEY_LENGTH];
uint key_length;
@@ -8540,7 +8545,10 @@ bool remove_table_from_cache(THD *thd, c
}
}
while (unused_tables && !unused_tables->s->version)
+ {
+ unused_tables->s->deleting= deleting;
VOID(hash_delete(&open_cache,(uchar*) unused_tables));
+ }
DBUG_PRINT("info", ("Removing table from table_def_cache"));
/* Remove table from table definition cache if it's not in use */
@@ -8734,7 +8742,8 @@ int abort_and_upgrade_lock(ALTER_PARTITI
/* If MERGE child, forward lock handling to parent. */
mysql_lock_abort(lpt->thd, lpt->table->parent ? lpt->table->parent :
lpt->table, TRUE);
- VOID(remove_table_from_cache(lpt->thd, lpt->db, lpt->table_name, flags));
+ VOID(remove_table_from_cache(lpt->thd, lpt->db, lpt->table_name, flags,
+ FALSE));
VOID(pthread_mutex_unlock(&LOCK_open));
DBUG_RETURN(0);
}
@@ -8759,7 +8768,7 @@ void close_open_tables_and_downgrade(ALT
{
VOID(pthread_mutex_lock(&LOCK_open));
remove_table_from_cache(lpt->thd, lpt->db, lpt->table_name,
- RTFC_WAIT_OTHER_THREAD_FLAG);
+ RTFC_WAIT_OTHER_THREAD_FLAG, FALSE);
VOID(pthread_mutex_unlock(&LOCK_open));
/* If MERGE child, forward lock handling to parent. */
mysql_lock_downgrade_write(lpt->thd, lpt->table->parent ? lpt->table->parent :
=== modified file 'sql/sql_delete.cc'
--- a/sql/sql_delete.cc 2010-01-15 15:27:55 +0000
+++ b/sql/sql_delete.cc 2010-02-10 19:06:24 +0000
@@ -1088,6 +1088,7 @@ bool mysql_truncate(THD *thd, TABLE_LIST
HA_CREATE_INFO create_info;
char path[FN_REFLEN + 1];
TABLE *table;
+ TABLE_LIST *tbl;
bool error;
uint path_length;
bool is_temporary_table= false;
@@ -1108,6 +1109,9 @@ bool mysql_truncate(THD *thd, TABLE_LIST
if (!ha_check_storage_engine_flag(table_type, HTON_CAN_RECREATE))
goto trunc_by_del;
+ for (tbl= table_list; tbl; tbl= tbl->next_local)
+ tbl->deleting= TRUE; /* to trigger HA_EXTRA_PREPARE_FOR_DROP */
+
table->file->info(HA_STATUS_AUTO | HA_STATUS_NO_LOCK);
create_info.options|= HA_LEX_CREATE_TMP_TABLE;
=== modified file 'sql/sql_table.cc'
--- a/sql/sql_table.cc 2010-01-15 15:27:55 +0000
+++ b/sql/sql_table.cc 2010-02-10 19:06:24 +0000
@@ -1880,6 +1880,7 @@ int mysql_rm_table_part2(THD *thd, TABLE
{
TABLE_SHARE *share;
table->db_type= NULL;
+
if ((share= get_cached_table_share(table->db, table->table_name)))
table->db_type= share->db_type();
@@ -1974,9 +1975,10 @@ int mysql_rm_table_part2(THD *thd, TABLE
{
TABLE *locked_table;
abort_locked_tables(thd, db, table->table_name);
+ table->deleting= TRUE;
remove_table_from_cache(thd, db, table->table_name,
RTFC_WAIT_OTHER_THREAD_FLAG |
- RTFC_CHECK_KILLED_FLAG);
+ RTFC_CHECK_KILLED_FLAG, FALSE);
/*
If the table was used in lock tables, remember it so that
unlock_table_names can free it
@@ -4213,9 +4215,10 @@ void wait_while_table_is_used(THD *thd,T
/* Wait until all there are no other threads that has this table open */
remove_table_from_cache(thd, table->s->db.str,
table->s->table_name.str,
- RTFC_WAIT_OTHER_THREAD_FLAG);
+ RTFC_WAIT_OTHER_THREAD_FLAG, FALSE);
/* extra() call must come only after all instances above are closed */
- VOID(table->file->extra(function));
+ if (function != HA_EXTRA_NOT_USED)
+ VOID(table->file->extra(function));
DBUG_VOID_RETURN;
}
@@ -4717,7 +4720,7 @@ static bool mysql_admin_table(THD* thd,
remove_table_from_cache(thd, table->table->s->db.str,
table->table->s->table_name.str,
RTFC_WAIT_OTHER_THREAD_FLAG |
- RTFC_CHECK_KILLED_FLAG);
+ RTFC_CHECK_KILLED_FLAG, FALSE);
thd->exit_cond(old_message);
DBUG_EXECUTE_IF("wait_in_mysql_admin_table", wait_for_kill_signal(thd););
if (thd->killed)
@@ -4975,7 +4978,8 @@ send_result_message:
{
pthread_mutex_lock(&LOCK_open);
remove_table_from_cache(thd, table->table->s->db.str,
- table->table->s->table_name.str, RTFC_NO_FLAG);
+ table->table->s->table_name.str,
+ RTFC_NO_FLAG, FALSE);
pthread_mutex_unlock(&LOCK_open);
}
/* May be something modified consequently we have to invalidate cache */
@@ -6738,7 +6742,9 @@ view_err:
from concurrent DDL statements.
*/
VOID(pthread_mutex_lock(&LOCK_open));
- wait_while_table_is_used(thd, table, HA_EXTRA_FORCE_REOPEN);
+ wait_while_table_is_used(thd, table,
+ thd->locked_tables ? HA_EXTRA_NOT_USED :
+ HA_EXTRA_FORCE_REOPEN);
VOID(pthread_mutex_unlock(&LOCK_open));
DBUG_EXECUTE_IF("sleep_alter_enable_indexes", my_sleep(6000000););
error= table->file->ha_enable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE);
@@ -6746,7 +6752,9 @@ view_err:
break;
case DISABLE:
VOID(pthread_mutex_lock(&LOCK_open));
- wait_while_table_is_used(thd, table, HA_EXTRA_FORCE_REOPEN);
+ wait_while_table_is_used(thd, table,
+ thd->locked_tables ? HA_EXTRA_NOT_USED :
+ HA_EXTRA_FORCE_REOPEN);
VOID(pthread_mutex_unlock(&LOCK_open));
error=table->file->ha_disable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE);
/* COND_refresh will be signaled in close_thread_tables() */
@@ -7192,7 +7200,9 @@ view_err:
else
{
VOID(pthread_mutex_lock(&LOCK_open));
- wait_while_table_is_used(thd, table, HA_EXTRA_FORCE_REOPEN);
+ wait_while_table_is_used(thd, table,
+ thd->locked_tables ? HA_EXTRA_NOT_USED :
+ HA_EXTRA_FORCE_REOPEN);
VOID(pthread_mutex_unlock(&LOCK_open));
thd_proc_info(thd, "manage keys");
alter_table_manage_keys(table, table->file->indexes_are_disabled(),
=== modified file 'sql/table.cc'
--- a/sql/table.cc 2010-01-15 15:27:55 +0000
+++ b/sql/table.cc 2010-02-10 19:06:24 +0000
@@ -1977,7 +1977,11 @@ int closefrm(register TABLE *table, bool
DBUG_PRINT("enter", ("table: 0x%lx", (long) table));
if (table->db_stat)
+ {
+ if (table->s->deleting)
+ table->file->extra(HA_EXTRA_PREPARE_FOR_DROP);
error=table->file->close();
+ }
my_free((char*) table->alias, MYF(MY_ALLOW_ZERO_PTR));
table->alias= 0;
if (table->field)
=== modified file 'sql/table.h'
--- a/sql/table.h 2010-01-15 15:27:55 +0000
+++ b/sql/table.h 2010-02-10 19:06:24 +0000
@@ -431,6 +431,7 @@ typedef struct st_table_share
bool is_view;
bool name_lock, replace_with_name_lock;
bool waiting_on_cond; /* Protection against free */
+ bool deleting; /* going to delete this table */
ulong table_map_id; /* for row-based replication */
ulonglong table_map_version;
@@ -1379,7 +1380,7 @@ struct TABLE_LIST
*/
bool create;
bool internal_tmp_table;
-
+ bool deleting; /* going to delete this table */
/* View creation context. */
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-10 19:06:24 +0000
@@ -2255,9 +2255,12 @@ int ha_maria::extra(enum ha_extra_functi
extern_lock(F_UNLOCK) (which resets file->trn) followed by maria_close()
without calling commit/rollback in between. If file->trn is not set
we can't remove file->share from the transaction list in the extra() call.
+
+ table->in_use is not set when this is done as part of closefrm()
+ during drop table.
*/
- if (!file->trn &&
+ if (file->s->now_transactional && !file->trn && table->in_use &&
(operation == HA_EXTRA_PREPARE_FOR_DROP ||
operation == HA_EXTRA_PREPARE_FOR_RENAME))
{
=== modified file 'storage/maria/ma_blockrec.c'
--- a/storage/maria/ma_blockrec.c 2010-01-28 11:35:10 +0000
+++ b/storage/maria/ma_blockrec.c 2010-02-10 19:06:24 +0000
@@ -430,8 +430,9 @@ my_bool _ma_once_end_block_record(MARIA_
if (share->bitmap.file.file >= 0)
{
if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
- share->temporary ? FLUSH_IGNORE_CHANGED :
- FLUSH_RELEASE))
+ ((share->temporary || share->deleting) ?
+ FLUSH_IGNORE_CHANGED :
+ FLUSH_RELEASE)))
res= 1;
/*
File must be synced as it is going out of the maria_open_list and so
=== modified file 'storage/maria/ma_close.c'
--- a/storage/maria/ma_close.c 2010-01-29 18:42:22 +0000
+++ b/storage/maria/ma_close.c 2010-02-10 19:06:24 +0000
@@ -79,7 +79,7 @@ int maria_close(register MARIA_HA *info)
if ((*share->once_end)(share))
error= my_errno;
if (flush_pagecache_blocks(share->pagecache, &share->kfile,
- (share->temporary ?
+ ((share->temporary || share->deleting) ?
FLUSH_IGNORE_CHANGED :
FLUSH_RELEASE)))
error= my_errno;
=== modified file 'storage/maria/ma_extra.c'
--- a/storage/maria/ma_extra.c 2009-10-06 06:13:56 +0000
+++ b/storage/maria/ma_extra.c 2010-02-10 19:06:24 +0000
@@ -305,6 +305,12 @@ int maria_extra(MARIA_HA *info, enum ha_
pthread_mutex_unlock(&THR_LOCK_maria);
break;
case HA_EXTRA_PREPARE_FOR_DROP:
+ /* Signals about intent to delete this table */
+ share->deleting= TRUE;
+ share->global_changed= FALSE; /* force writing changed flag */
+ /* To force repair if reopened */
+ _ma_mark_file_changed(info);
+ /* Fall through */
case HA_EXTRA_PREPARE_FOR_RENAME:
{
my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP);
=== modified file 'storage/maria/ma_locking.c'
--- a/storage/maria/ma_locking.c 2009-10-06 06:13:56 +0000
+++ b/storage/maria/ma_locking.c 2010-02-10 19:06:24 +0000
@@ -387,6 +387,9 @@ int _ma_test_if_changed(register MARIA_H
open_count is not maintained on disk for temporary tables.
*/
+#define _MA_ALREADY_MARKED_FILE_CHANGED \
+ ((share->state.changed & STATE_CHANGED) && share->global_changed)
+
int _ma_mark_file_changed(MARIA_HA *info)
{
uchar buff[3];
@@ -394,8 +397,6 @@ int _ma_mark_file_changed(MARIA_HA *info
int error= 1;
DBUG_ENTER("_ma_mark_file_changed");
-#define _MA_ALREADY_MARKED_FILE_CHANGED \
- ((share->state.changed & STATE_CHANGED) && share->global_changed)
if (_MA_ALREADY_MARKED_FILE_CHANGED)
DBUG_RETURN(0);
pthread_mutex_lock(&share->intern_lock); /* recheck under mutex */
=== modified file 'storage/maria/ma_recovery.c'
--- a/storage/maria/ma_recovery.c 2009-10-26 11:35:42 +0000
+++ b/storage/maria/ma_recovery.c 2010-02-10 19:06:24 +0000
@@ -312,11 +312,14 @@ int maria_apply_log(LSN from_lsn, enum m
now= my_getsystime();
in_redo_phase= TRUE;
+ trnman_init(max_trid_in_control_file);
if (run_redo_phase(from_lsn, apply))
{
ma_message_no_user(0, "Redo phase failed");
+ trnman_destroy();
goto err;
}
+ trnman_destroy();
if ((uncommitted_trans=
end_of_redo_phase(should_run_undo_phase)) == (uint)-1)
=== modified file 'storage/maria/maria_def.h'
--- a/storage/maria/maria_def.h 2009-11-29 23:08:56 +0000
+++ b/storage/maria/maria_def.h 2010-02-10 19:06:24 +0000
@@ -390,6 +390,7 @@ typedef struct st_maria_share
my_bool now_transactional;
my_bool have_versioning;
my_bool key_del_used; /* != 0 if key_del is locked */
+ my_bool deleting; /* we are going to delete this table */
#ifdef THREAD
THR_LOCK lock;
void (*lock_restore_status)(void *);
=== modified file 'storage/myisam/mi_close.c'
--- a/storage/myisam/mi_close.c 2009-09-07 20:50:10 +0000
+++ b/storage/myisam/mi_close.c 2010-02-10 19:06:24 +0000
@@ -64,8 +64,9 @@ int mi_close(register MI_INFO *info)
if (share->kfile >= 0) abort(););
if (share->kfile >= 0 &&
flush_key_blocks(share->key_cache, share->kfile,
- share->temporary ? FLUSH_IGNORE_CHANGED :
- FLUSH_RELEASE))
+ ((share->temporary || share->deleting) ?
+ FLUSH_IGNORE_CHANGED :
+ FLUSH_RELEASE)))
error=my_errno;
if (share->kfile >= 0)
{
=== modified file 'storage/myisam/mi_extra.c'
--- a/storage/myisam/mi_extra.c 2009-10-06 06:13:56 +0000
+++ b/storage/myisam/mi_extra.c 2010-02-10 19:06:24 +0000
@@ -256,8 +256,13 @@ int mi_extra(MI_INFO *info, enum ha_extr
share->last_version= 0L; /* Impossible version */
pthread_mutex_unlock(&THR_LOCK_myisam);
break;
- case HA_EXTRA_PREPARE_FOR_RENAME:
case HA_EXTRA_PREPARE_FOR_DROP:
+ /* Signals about intent to delete this table */
+ share->deleting= TRUE;
+ share->global_changed= FALSE; /* force writing changed flag */
+ _mi_mark_file_changed(info);
+ /* Fall through */
+ case HA_EXTRA_PREPARE_FOR_RENAME:
pthread_mutex_lock(&THR_LOCK_myisam);
share->last_version= 0L; /* Impossible version */
pthread_mutex_lock(&share->intern_lock);
=== modified file 'storage/myisam/mi_open.c'
--- a/storage/myisam/mi_open.c 2009-12-03 11:19:05 +0000
+++ b/storage/myisam/mi_open.c 2010-02-10 19:06:24 +0000
@@ -58,6 +58,8 @@ MI_INFO *test_if_reopen(char *filename)
{
MI_INFO *info=(MI_INFO*) pos->data;
MYISAM_SHARE *share=info->s;
+ DBUG_ASSERT(strcmp(share->unique_file_name,filename) ||
+ share->last_version);
if (!strcmp(share->unique_file_name,filename) && share->last_version)
return info;
}
=== modified file 'storage/myisam/myisamdef.h'
--- a/storage/myisam/myisamdef.h 2009-12-03 11:34:11 +0000
+++ b/storage/myisam/myisamdef.h 2010-02-10 19:06:24 +0000
@@ -221,6 +221,7 @@ typedef struct st_mi_isam_share
my_bool changed, /* If changed since lock */
global_changed, /* If changed since open */
not_flushed, temporary, delay_key_write, concurrent_insert;
+ my_bool deleting; /* we are going to delete this table */
#ifdef THREAD
THR_LOCK lock;
pthread_mutex_t intern_lock; /* Locking for use with _locking */
[Maria-developers] Rev 2757: Subquery optimizations backport: Update test results (checked) in file:///home/psergey/dev/maria-5.3-subqueries-r3/
by Sergey Petrunya 09 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r3/
------------------------------------------------------------
revno: 2757
revision-id: psergey(a)askmonty.org-20100209203217-al1k9h50zrlphy5d
parent: psergey(a)askmonty.org-20100208133030-e4zjy15b7o14ud8c
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r3
timestamp: Tue 2010-02-09 23:32:17 +0300
message:
Subquery optimizations backport: Update test results (checked)
=== modified file 'mysql-test/r/join_cache.result'
--- a/mysql-test/r/join_cache.result 2009-12-21 02:26:15 +0000
+++ b/mysql-test/r/join_cache.result 2010-02-09 20:32:17 +0000
@@ -1028,8 +1028,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1343,8 +1343,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1658,8 +1658,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1973,8 +1973,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2292,8 +2292,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2514,8 +2514,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2736,8 +2736,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2958,8 +2958,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
=== modified file 'mysql-test/r/type_datetime.result'
--- a/mysql-test/r/type_datetime.result 2009-02-13 18:07:03 +0000
+++ b/mysql-test/r/type_datetime.result 2010-02-09 20:32:17 +0000
@@ -514,10 +514,9 @@
where id in (select id from t1 as x1 where (t1.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
Warnings:
Note 1276 Field or reference 'test.t1.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select '1' AS `id`,'2007-04-25 18:30:22' AS `cur_date` from `test`.`t1` where <in_optimizer>('1',<exists>(select 1 AS `Not_used` from `test`.`t1` `x1` where 0))
+Note 1003 select '1' AS `id`,'2007-04-25 18:30:22' AS `cur_date` from `test`.`t1` `x1` join `test`.`t1` where (('2007-04-25 18:30:22' = 0))
select * from t1
where id in (select id from t1 as x1 where (t1.cur_date is null));
id cur_date
@@ -526,10 +525,9 @@
where id in (select id from t2 as x1 where (t2.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
Warnings:
Note 1276 Field or reference 'test.t2.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select '1' AS `id`,'2007-04-25' AS `cur_date` from `test`.`t2` where <in_optimizer>('1',<exists>(select 1 AS `Not_used` from `test`.`t2` `x1` where 0))
+Note 1003 select '1' AS `id`,'2007-04-25' AS `cur_date` from `test`.`t2` `x1` join `test`.`t2` where (('2007-04-25' = 0))
select * from t2
where id in (select id from t2 as x1 where (t2.cur_date is null));
id cur_date
@@ -540,10 +538,10 @@
where id in (select id from t1 as x1 where (t1.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 2 100.00 Using where
-2 DEPENDENT SUBQUERY x1 ALL NULL NULL NULL NULL 2 100.00 Using where
+1 PRIMARY x1 ALL NULL NULL NULL NULL 2 100.00 Using where; FirstMatch(t1)
Warnings:
Note 1276 Field or reference 'test.t1.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`id` AS `id`,`test`.`t1`.`cur_date` AS `cur_date` from `test`.`t1` where <in_optimizer>(`test`.`t1`.`id`,<exists>(select 1 AS `Not_used` from `test`.`t1` `x1` where ((`test`.`t1`.`cur_date` = 0) and (<cache>(`test`.`t1`.`id`) = `test`.`x1`.`id`))))
+Note 1003 select `test`.`t1`.`id` AS `id`,`test`.`t1`.`cur_date` AS `cur_date` from `test`.`t1` semi join (`test`.`t1` `x1`) where ((`test`.`x1`.`id` = `test`.`t1`.`id`) and (`test`.`t1`.`cur_date` = 0))
select * from t1
where id in (select id from t1 as x1 where (t1.cur_date is null));
id cur_date
@@ -552,10 +550,10 @@
where id in (select id from t2 as x1 where (t2.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 ALL NULL NULL NULL NULL 2 100.00 Using where
-2 DEPENDENT SUBQUERY x1 ALL NULL NULL NULL NULL 2 100.00 Using where
+1 PRIMARY x1 ALL NULL NULL NULL NULL 2 100.00 Using where; FirstMatch(t2)
Warnings:
Note 1276 Field or reference 'test.t2.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t2`.`id` AS `id`,`test`.`t2`.`cur_date` AS `cur_date` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`id`,<exists>(select 1 AS `Not_used` from `test`.`t2` `x1` where ((`test`.`t2`.`cur_date` = 0) and (<cache>(`test`.`t2`.`id`) = `test`.`x1`.`id`))))
+Note 1003 select `test`.`t2`.`id` AS `id`,`test`.`t2`.`cur_date` AS `cur_date` from `test`.`t2` semi join (`test`.`t2` `x1`) where ((`test`.`x1`.`id` = `test`.`t2`.`id`) and (`test`.`t2`.`cur_date` = 0))
select * from t2
where id in (select id from t2 as x1 where (t2.cur_date is null));
id cur_date
09 Feb '10
Hi Daniel,
We really need to put some instructions prominently on the download page for
installing the .debs.
I have seen users having big problems installing them. Just something like
`dpkg --install *.deb` does not work at all :-(
I actually had to spend like 15 minutes sorting out exactly how to install it;
many users would have given up.
Here are the commands that I found will work on Debian 5 amd64 (hopefully
other .deb distros will be similar):
sudo apt-get update
sudo apt-get install libdbi-perl libdbd-mysql-perl psmisc
sudo dpkg --install mysql-common_5.1.42-mariadb73_all.deb
sudo dpkg --install libmariadbclient16_5.1.42-mariadb73_amd64.deb libmysqlclient16_5.1.42-mariadb73_amd64.deb mariadb-client_5.1.42-mariadb73_all.deb mariadb-client-5.1_5.1.42-mariadb73_amd64.deb mariadb-server_5.1.42-mariadb73_all.deb mariadb-server-5.1_5.1.42-mariadb73_amd64.deb
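A quick sanity check after the last step might be (assuming the packages
install the standard Debian init script; adjust to your setup):
sudo /etc/init.d/mysql start
mysql -u root -e "SELECT VERSION()"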
Also, as soon as OurDelta has the 5.1.42 repositories up, we need a strong
warning against manually installing the .debs, strongly suggesting the
OurDelta repositories instead, as it will work much better for users.
(With the repositories, just `apt-get install mariadb-server` will work, as
will the Synaptic package manager GUI).
I hope you will be able to sort out some instructions, else ask me and I will
try to help.
This is made somewhat more urgent since after FOSDEM a number of people will
probably want to try out the .debs ASAP, as we advertised them ...
- Kristian.
[Maria-developers] [Branch ~maria-captains/maria/5.1] Rev 2813: Added --connect-command="sql-string" to sql-bench test suite.
by noreply@launchpad.net 09 Feb '10
------------------------------------------------------------
revno: 2813
committer: Michael Widenius <monty(a)askmonty.org>
branch nick: maria-5.1
timestamp: Tue 2010-02-09 19:17:04 +0200
message:
Added --connect-command="sql-string" to sql-bench test suite.
This allows one to send an extra command to the mysql server to setup the environment before starting tests.
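For example, a sketch (the SET statement is just an illustration, not part
of the commit):
./run-all-tests --server=mysql --user=test \
  --connect-command="SET GLOBAL key_buffer_size=64*1024*1024"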
modified:
sql-bench/bench-init.pl.sh
sql-bench/server-cfg.sh
--
lp:maria
https://code.launchpad.net/~maria-captains/maria/5.1
Your team Maria developers is subscribed to branch lp:maria.
To unsubscribe from this branch go to https://code.launchpad.net/~maria-captains/maria/5.1/+edit-subscription.
[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2813)
by Michael Widenius 09 Feb '10
#At lp:maria based on revid:knielsen@knielsen-hq.org-20100131153603-9uo859vt0kra7tbz
2813 Michael Widenius 2010-02-09
Added --connect-command="sql-string" to sql-bench test suite.
This allows one to send an extra command to the mysql server to setup the environment before starting tests.
modified:
sql-bench/bench-init.pl.sh
sql-bench/server-cfg.sh
=== modified file 'sql-bench/bench-init.pl.sh'
--- a/sql-bench/bench-init.pl.sh 2009-05-29 13:40:55 +0000
+++ b/sql-bench/bench-init.pl.sh 2010-02-09 17:17:04 +0000
@@ -40,7 +40,7 @@ require "$pwd/server-cfg" || die "Can't
$|=1; # Output data immediately
$opt_skip_test=$opt_skip_create=$opt_skip_delete=$opt_verbose=$opt_fast_insert=$opt_lock_tables=$opt_debug=$opt_skip_delete=$opt_fast=$opt_force=$opt_log=$opt_use_old_results=$opt_help=$opt_odbc=$opt_small_test=$opt_small_tables=$opt_samll_key_tables=$opt_stage=$opt_old_headers=$opt_die_on_errors=$opt_tcpip=$opt_random=$opt_only_missing_tests=0;
-$opt_cmp=$opt_user=$opt_password=$opt_connect_options="";
+$opt_cmp=$opt_user=$opt_password=$opt_connect_options=$opt_connect_command= "";
$opt_server="mysql"; $opt_dir="output";
$opt_host="localhost";$opt_database="test";
$opt_machine=""; $opt_suffix="";
@@ -59,7 +59,7 @@ $log_prog_args=join(" ", skip_arguments(
"use-old-results","skip-test",
"optimization","hw",
"machine", "dir", "suffix", "log"));
-GetOptions("skip-test=s","comments=s","cmp=s","server=s","user=s","host=s","database=s","password=s","loop-count=i","row-count=i","skip-create","skip-delete","verbose","fast-insert","lock-tables","debug","fast","force","field-count=i","regions=i","groups=i","time-limit=i","log","use-old-results","machine=s","dir=s","suffix=s","help","odbc","small-test","small-tables","small-key-tables","stage=i","threads=i","random","old-headers","die-on-errors","create-options=s","hires","tcpip","silent","optimization=s","hw=s","socket=s","connect-options=s","only-missing-tests") || usage();
+GetOptions("skip-test=s","comments=s","cmp=s","server=s","user=s","host=s","database=s","password=s","loop-count=i","row-count=i","skip-create","skip-delete","verbose","fast-insert","lock-tables","debug","fast","force","field-count=i","regions=i","groups=i","time-limit=i","log","use-old-results","machine=s","dir=s","suffix=s","help","odbc","small-test","small-tables","small-key-tables","stage=i","threads=i","random","old-headers","die-on-errors","create-options=s","hires","tcpip","silent","optimization=s","hw=s","socket=s","connect-options=s","connect-command=s","only-missing-tests") || usage();
usage() if ($opt_help);
$server=get_server($opt_server,$opt_host,$opt_database,$opt_odbc,
@@ -595,6 +595,10 @@ All benchmarks takes the following optio
Add options, which uses at DBI connect.
For example --connect-options=mysql_read_default_file=/etc/my.cnf.
+--connect-command='SQL command'
+ Initialization command to execute when logged in. Useful for setting
+ up the environment.
+
EOF
exit(0);
}
=== modified file 'sql-bench/server-cfg.sh'
--- a/sql-bench/server-cfg.sh 2009-05-29 13:40:55 +0000
+++ b/sql-bench/server-cfg.sh 2010-02-09 17:17:04 +0000
@@ -249,6 +249,11 @@ sub connect
die "Got error: '$DBI::errstr' when connecting to " . $self->{'data_source'} ." with user: '$main::opt_user' password: '$main::opt_password'\n";
$dbh->do("SET OPTION LOG_OFF=1,UPDATE_LOG=0");
+ if ($main::opt_connect_command ne "")
+ {
+ $dbh->do($main::opt_connect_command) or
+ die "Can't execute connect_command: $main::opt_connect_command error: $DBI::errstr\n";
+ }
return $dbh;
}
[Maria-developers] New (by Serg): mutex/condition service (83)
by worklog-noreply@askmonty.org 09 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: mutex/condition service
CREATION DATE..: Tue, 09 Feb 2010, 18:20
SUPERVISOR.....: Sergei
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 83 (http://askmonty.org/worklog/?tid=83)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 40 (hours remain)
ORIG. ESTIMATE.: 40
PROGRESS NOTES:
DESCRIPTION:
Create a service that provides access to the server implementation of mutexes,
conditions, and rwlocks. (A hypothetical sketch of such a service's shape
follows this entry.)
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
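For a sense of shape only: plugin services are typically structs of function
pointers that the server fills in and exports to dynamically loaded plugins.
The sketch below is purely hypothetical - every name in it is invented for
illustration, and it simply backs the pointers with plain pthreads (the real
server would route them to its own primitives):

  #include <pthread.h>
  #include <cstdio>

  /* Hypothetical service descriptor: a struct of function pointers that
     the server would fill in and export to plugins. Names are invented. */
  struct mutex_service_st
  {
    int (*mutex_lock)(pthread_mutex_t *m);
    int (*mutex_unlock)(pthread_mutex_t *m);
    int (*cond_signal)(pthread_cond_t *c);
  };

  /* Stand-in "server side" backing the service with plain pthreads. */
  static mutex_service_st the_service=
  {
    pthread_mutex_lock, pthread_mutex_unlock, pthread_cond_signal
  };

  int main()
  {
    pthread_mutex_t m= PTHREAD_MUTEX_INITIALIZER;
    /* A plugin would call through the service, never pthreads directly: */
    the_service.mutex_lock(&m);
    std::puts("locked through the service");
    the_service.mutex_unlock(&m);
    return 0;
  }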

[Maria-developers] New (by Serg): add charset support to ftparser plugins (82)
by worklog-noreply@askmonty.org 09 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: add charset support to ftparser plugins
CREATION DATE..: Tue, 09 Feb 2010, 18:19
SUPERVISOR.....: Sergei
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 82 (http://askmonty.org/worklog/?tid=82)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 20 (hours remain)
ORIG. ESTIMATE.: 20
PROGRESS NOTES:
DESCRIPTION:
Fulltext parser plugins don't know the character set of the text they parse and
have no access to the charset support functionality in the server.
Extend the ftparser API to provide the charset of the parsed text.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)

[Maria-developers] New (by Serg): create a charset service (81)
by worklog-noreply@askmonty.org 09 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: create a charset service
CREATION DATE..: Tue, 09 Feb 2010, 18:17
SUPERVISOR.....: Bothorsen
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 81 (http://askmonty.org/worklog/?tid=81)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 20 (hours remain)
ORIG. ESTIMATE.: 20
PROGRESS NOTES:
DESCRIPTION:
Create a Service for CHARSET_INFO and related functionality.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)

[Maria-developers] New (by Serg): show plugins soname ... (80)
by worklog-noreply@askmonty.org 09 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: show plugins soname ...
CREATION DATE..: Tue, 09 Feb 2010, 18:16
SUPERVISOR.....: Sergei
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 80 (http://askmonty.org/worklog/?tid=80)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 40 (hours remain)
ORIG. ESTIMATE.: 40
PROGRESS NOTES:
DESCRIPTION:
To install a plugin, one needs to know its name in advance, and there is no
way to examine a .so to see what it contains.
A SHOW PLUGINS SONAME ... command would list all plugins - including
uninstalled ones - from a given .so. (A standalone sketch of the examination
step follows this entry.)
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
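To make the examination step concrete: dynamically built plugins export a
zero-terminated array of st_mysql_plugin declarations under the well-known
symbol _mysql_plugin_declarations_ (filled in by the mysql_declare_plugin
macro). A rough standalone sketch of reading it, assuming the MySQL/MariaDB
development headers are installed; note this is only an illustration - a
plugin that references server symbols may fail to dlopen() outside the
server, whereas inside the server all symbols resolve:

  #include <mysql/plugin.h>   /* struct st_mysql_plugin */
  #include <dlfcn.h>
  #include <cstdio>

  int main(int argc, char **argv)
  {
    if (argc != 2)
    {
      std::fprintf(stderr, "usage: %s /path/to/plugin.so\n", argv[0]);
      return 1;
    }
    void *dl= dlopen(argv[1], RTLD_LAZY);
    if (!dl)
    {
      std::fprintf(stderr, "dlopen: %s\n", dlerror());
      return 1;
    }
    /* The declarations array is terminated by a zeroed entry. */
    st_mysql_plugin *p=
      (st_mysql_plugin*) dlsym(dl, "_mysql_plugin_declarations_");
    if (!p)
    {
      std::fprintf(stderr, "no plugin declarations found\n");
      return 1;
    }
    for (; p->info; p++)
      std::printf("type=%d name=%s\n", p->type, p->name);
    dlclose(dl);
    return 0;
  }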

[Maria-developers] New (by Serg): duplicate plugin names (79)
by worklog-noreply@askmonty.org 09 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: duplicate plugin names
CREATION DATE..: Tue, 09 Feb 2010, 18:14
SUPERVISOR.....: Sergei
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 79 (http://askmonty.org/worklog/?tid=79)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 40 (hours remain)
ORIG. ESTIMATE.: 40
PROGRESS NOTES:
DESCRIPTION:
Currently plugin names must be globally unique in the server.
In fact, they only need to be unique within a plugin type: there is no reason
why there cannot be a fulltext parser plugin "csv" and a storage engine plugin "csv".
This task allows plugins with the same name to be installed as long as they are
of different plugin types. It also adds an
INSTALL plugin_type PLUGIN name SONAME "path";
command (e.g. INSTALL FTPARSER PLUGIN csv SONAME "csv.so") to resolve the case
when one .so contains two plugins with the same name.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)

[Maria-developers] New (by Serg): INSTALL PLUGIN * (78)
by worklog-noreply@askmonty.org 09 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: INSTALL PLUGIN *
CREATION DATE..: Tue, 09 Feb 2010, 18:10
SUPERVISOR.....: Sergei
IMPLEMENTOR....:
COPIES TO......:
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 78 (http://askmonty.org/worklog/?tid=78)
VERSION........: WorkLog-3.4
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 0
ESTIMATE.......: 10 (hours remain)
ORIG. ESTIMATE.: 10
PROGRESS NOTES:
DESCRIPTION:
InnoDB, XtraDB, and PBXT (at least) come with a storage engine plugin and many
information_schema plugins in one .so file.
Currently one needs to install them all one by one.
INSTALL PLUGIN * SONAME xxx
would be a more convenient way to install everything at once.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)

[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 09 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100209082807-om89773tey55ok66
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Tue 2010-02-09 10:28:07 +0200
message:
Group commit for maria storage engine.
Diff too large for email (1447 lines, the limit is 1000).

[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 09 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100209081347-si2jhe5q4olk5a08
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Tue 2010-02-09 10:13:47 +0200
message:
Group commit for maria storage engine.
Diff too large for email (1444 lines, the limit is 1000).

[Maria-developers] Rev 2756: Apply Jorgen Loland's fix: in file:///home/psergey/dev/maria-5.3-subqueries-r3/
by Sergey Petrunya 08 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r3/
------------------------------------------------------------
revno: 2756
revision-id: psergey(a)askmonty.org-20100208133030-e4zjy15b7o14ud8c
parent: psergey(a)askmonty.org-20100208132741-nj9zq7z8nwlfwchq
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r3
timestamp: Mon 2010-02-08 15:30:30 +0200
message:
Apply Jorgen Loland's fix:
Bug#45221: Query "SELECT pk FROM C WHERE pk IN (SELECT int_key)" failing
XOR conditions are not optimized, and Item_cond_xor therefore
acts like type Func_item even though it inherits from Item_cond.
A subtle difference between Item_func and Item_cond is that
you can get the children Items from the former by calling
arguments(), and from the latter by calling argument_list().
However, since Item_cond_xor inherits from Item_cond,
arguments() did not return any Items.
The fact that Item_cond_xor::arguments() did not return its
children items led to a problem for make_cond_for_index():
the method accepted XOR items on unindexed columns being
pushed using ICP. ICP evaluation of non-indexed columns
does not (and should not) work.
The fix for this bug is to make Item_cond_xor return its
children items when the arguments() method is used. This makes
Item_cond_xor behave more like Item_func and in turn allows
make_cond_for_index() to discover any conflicting children
Items.
This is a temporary fix and should be removed when
Item_cond_xor is optimized.
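To illustrate the arguments() vs. argument_list() distinction the message
describes, here is a standalone toy model (simplified stand-in classes, not
the actual sql/item_cmpfunc.h code): an array-based walker such as
make_cond_for_index(), which treats XOR as a FUNC_ITEM, sees no children
before the fix and both children after it:

  #include <cstdio>
  #include <list>

  /* Simplified stand-ins for the server classes; illustration only. */
  struct Item
  {
    const char *name;
    Item(const char *n) : name(n) {}
  };

  struct Item_cond_model
  {
    std::list<Item*> list;     /* children, argument_list() style */
    Item *tmp_arg[2];
    Item **args;               /* children, arguments() style */
    unsigned arg_count;
    Item_cond_model() : args(0), arg_count(0) {}
  };

  /* Before the fix: children live only in the list, so array-based
     walkers see none. */
  struct Xor_before : Item_cond_model
  {
    Xor_before(Item *i1, Item *i2) { list.push_back(i1); list.push_back(i2); }
  };

  /* After the fix: children are stored in args[] as well. */
  struct Xor_after : Item_cond_model
  {
    Xor_after(Item *i1, Item *i2)
    {
      list.push_back(i1); list.push_back(i2);
      args= tmp_arg; args[0]= i1; args[1]= i2; arg_count= 2;
    }
  };

  static void walk_as_func_item(const Item_cond_model &it)
  {
    std::printf("%u children visible:", it.arg_count);
    for (unsigned i= 0; i < it.arg_count; i++)
      std::printf(" %s", it.args[i]->name);
    std::printf("\n");
  }

  int main()
  {
    Item a("t1.key_col"), b("t1.non_key_col");
    Xor_before before(&a, &b);
    Xor_after after(&a, &b);
    walk_as_func_item(before);  /* prints: 0 children visible: */
    walk_as_func_item(after);   /* prints: 2 children visible: ... */
    return 0;
  }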
=== modified file 'sql/item_cmpfunc.h'
--- a/sql/item_cmpfunc.h 2010-01-17 14:55:08 +0000
+++ b/sql/item_cmpfunc.h 2010-02-08 13:30:30 +0000
@@ -1715,14 +1715,34 @@
class Item_cond_xor :public Item_cond
{
public:
- Item_cond_xor() :Item_cond() {}
- Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2) {}
+ Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2)
+ {
+ /*
+ Items must be stored in args[] as well because this Item_cond is
+ treated as a FUNC_ITEM (see type()). I.e., users of it will get
+ it's children by calling arguments(), not argument_list(). This
+ is a temporary solution until XOR is optimized and treated like
+ a full Item_cond citizen.
+ */
+ arg_count= 2;
+ args= tmp_arg;
+ args[0]= i1;
+ args[1]= i2;
+ }
enum Functype functype() const { return COND_XOR_FUNC; }
/* TODO: remove the next line when implementing XOR optimization */
enum Type type() const { return FUNC_ITEM; }
longlong val_int();
const char *func_name() const { return "xor"; }
void top_level_item() {}
+ /* Since child Items are stored in args[], Items cannot be added.
+ However, since Item_cond_xor is treated as a FUNC_ITEM (see
+ type()), the methods below should never be called.
+ */
+ bool add(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+ bool add_at_head(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+ bool add_at_head(List<Item> *nlist) { DBUG_ASSERT(FALSE); return FALSE; }
+ void copy_andor_arguments(THD *thd, Item_cond *item) { DBUG_ASSERT(FALSE); }
};

[Maria-developers] Rev 2755: Subquery optimizations: backport in file:///home/psergey/dev/maria-5.3-subqueries-r3/
by Sergey Petrunya 08 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r3/
------------------------------------------------------------
revno: 2755
revision-id: psergey(a)askmonty.org-20100208132741-nj9zq7z8nwlfwchq
parent: psergey(a)askmonty.org-20100208131019-q3vmltpo7vu9ihz6
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r3
timestamp: Mon 2010-02-08 15:27:41 +0200
message:
Subquery optimizations: backport
- Test result updates (checked)
=== modified file 'mysql-test/r/group_by.result'
--- a/mysql-test/r/group_by.result 2009-02-26 17:17:06 +0000
+++ b/mysql-test/r/group_by.result 2010-02-08 13:27:41 +0000
@@ -1542,8 +1542,8 @@
EXPLAIN SELECT 1 FROM t1 WHERE a IN
(SELECT a FROM t1 USE INDEX (i2) IGNORE INDEX (i2));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 index NULL PRIMARY 4 NULL 144 Using where; Using index
-2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 144 Using where
+1 PRIMARY t1 index PRIMARY,i2 PRIMARY 4 NULL 144 Using index
+1 PRIMARY t1 ALL NULL NULL NULL NULL 144 Using where; FirstMatch(t1)
CREATE TABLE t2 (a INT, b INT, KEY(a));
INSERT INTO t2 VALUES (1, 1), (2, 2), (3,3), (4,4);
EXPLAIN SELECT a, SUM(b) FROM t2 GROUP BY a LIMIT 2;
@@ -1555,8 +1555,8 @@
EXPLAIN SELECT 1 FROM t2 WHERE a IN
(SELECT a FROM t1 USE INDEX (i2) IGNORE INDEX (i2));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t2 index NULL a 5 NULL 4 Using where; Using index
-2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 144 Using where
+1 PRIMARY t2 index a a 5 NULL 4 Using index
+1 PRIMARY t1 ALL NULL NULL NULL NULL 144 Using where; FirstMatch(t2)
SHOW VARIABLES LIKE 'old';
Variable_name Value
old OFF

[Maria-developers] Rev 2754: Subquery backport: pass the correct item (the one that was resolved in in file:///home/psergey/dev/maria-5.3-subqueries-r3/
by Sergey Petrunya 08 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r3/
------------------------------------------------------------
revno: 2754
revision-id: psergey(a)askmonty.org-20100208131019-q3vmltpo7vu9ihz6
parent: psergey(a)askmonty.org-20100208130923-g38q4uiyu90g60w9
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r3
timestamp: Mon 2010-02-08 15:10:19 +0200
message:
Subquery backport: pass the correct item (the one that was resolved in
the ancestor select and has item->fixed==1) to mark_as_dependent
=== modified file 'sql/item.cc'
--- a/sql/item.cc 2010-02-08 09:56:16 +0000
+++ b/sql/item.cc 2010-02-08 13:10:19 +0000
@@ -3643,7 +3643,9 @@
@param current current select
@param resolved_item item which was resolved in outer SELECT(for warning)
@param mark_item item which should be marked (can be differ in case of
- substitution)
+ substitution) And also different when accessing VIEW
+ columns: in that case its Item_..._view_ref while
+ resolved_item is Item_field.
*/
static bool mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
@@ -3657,7 +3659,7 @@
/* store pointer on SELECT_LEX from which item is dependent */
if (mark_item)
mark_item->depended_from= last;
- if (current->mark_as_dependent(thd, last, resolved_item))
+ if (current->mark_as_dependent(thd, last, /*resolved_item*/ mark_item)) //psergey-fix2
return TRUE;
if (thd->lex->describe & DESCRIBE_EXTENDED)
{

[Maria-developers] Rev 2753: Subquery backport: Update test results (checked) in file:///home/psergey/dev/maria-5.3-subqueries-r3/
by Sergey Petrunya 08 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r3/
------------------------------------------------------------
revno: 2753
revision-id: psergey(a)askmonty.org-20100208130923-g38q4uiyu90g60w9
parent: psergey(a)askmonty.org-20100208125530-l0o01n6lxk95cn78
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r3
timestamp: Mon 2010-02-08 15:09:23 +0200
message:
Subquery backport: Update test results (checked)
=== modified file 'mysql-test/r/explain.result'
--- a/mysql-test/r/explain.result 2009-12-15 07:16:46 +0000
+++ b/mysql-test/r/explain.result 2010-02-08 13:09:23 +0000
@@ -171,7 +171,7 @@
EXPLAIN SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN (SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.dt IS NULL );
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY OUTR ALL NULL NULL NULL NULL 2 Using where
-2 DEPENDENT SUBQUERY INNR ALL NULL NULL NULL NULL 2 Using where
+1 PRIMARY INNR ALL NULL NULL NULL NULL 2 Using where; FirstMatch(OUTR)
flush tables;
SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN (SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.dt IS NULL );
dt
@@ -179,7 +179,7 @@
EXPLAIN SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN ( SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.t < '2005-11-13 7:41:31' );
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY OUTR ALL NULL NULL NULL NULL 2 Using where
-2 DEPENDENT SUBQUERY INNR ALL NULL NULL NULL NULL 2 Using where
+1 PRIMARY INNR ALL NULL NULL NULL NULL 2 Using where; FirstMatch(OUTR)
flush tables;
SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN ( SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.t < '2005-11-13 7:41:31' );
dt
=== modified file 'mysql-test/r/group_min_max.result'
--- a/mysql-test/r/group_min_max.result 2009-08-30 07:03:37 +0000
+++ b/mysql-test/r/group_min_max.result 2010-02-08 13:09:23 +0000
@@ -2256,7 +2256,7 @@
a IN (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1_outer index NULL a 10 NULL 15 Using where; Using index
-2 DEPENDENT SUBQUERY t1 index NULL a 10 NULL 1 Using index
+2 SUBQUERY t1 range NULL a 5 NULL 8 Using index for group-by
EXPLAIN SELECT 1 FROM t1 AS t1_outer GROUP BY a HAVING
a > (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
=== modified file 'mysql-test/r/subselect3_jcl6.result'
--- a/mysql-test/r/subselect3_jcl6.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect3_jcl6.result 2010-02-08 13:09:23 +0000
@@ -1140,7 +1140,7 @@
flush status;
select count(*) from t0 A, t0 B, t0 C, t0 D where D.a in (select a from t1 E);
count(*)
-4999
+5000
show status like 'Created_tmp_disk_tables';
Variable_name Value
Created_tmp_disk_tables 1

[Maria-developers] Rev 2740: options for CREATE TABLE (MWL#43) (version after first review). in file:///home/bell/maria/bzr/work-maria-5.2-createoptions/
by sanja@askmonty.org 05 Feb '10
At file:///home/bell/maria/bzr/work-maria-5.2-createoptions/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100205170316-gg4nio1p81cpmjop
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-createoptions
timestamp: Fri 2010-02-05 19:03:16 +0200
message:
options for CREATE TABLE (MWL#43) (version after first review).
Diff too large for email (1280 lines, the limit is 1000).

[Maria-developers] Updated (by Serg): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen, Serg
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Serg - Fri, 05 Feb 2010, 14:04)=-=-
Observers changed: Knielsen,Serg
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
------------------------------------------------------------
-=-=(View All Progress Notes, 23 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big blobs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN' Query
event and precedes the first of the Table map events which accompany the
corresponding rows events. (See the example in the "mysqlbinlog output" section
below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value allows annotating only some selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only together with the log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make
the slave reproduce these events when its global binlog-annotate-rows-events
value is ON? Because, for example, we may want to configure a slave that
reproduces Annotate_rows events but has global binlog-annotate-rows-events =
OFF, meaning that to be the default value for the client threads (see also
"How slave treats replicate-annotate-rows-events option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log does contain them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1 exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1` mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2` mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17 flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in the
Format description event. (This extension does not affect the compatibility
of the binary log.) Here is how the Format description event looks with
this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is allocated
in the Log_event object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member,
which calls the virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonyms in replication terminology).
In our case, the data header is empty and the data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
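To make the length arithmetic of the two constructors above concrete, here is
a tiny standalone round-trip check (plain C++, independent of the server
sources; the buffer is reduced to a zeroed 19-byte common header followed by
the query text, matching the empty post-header described above):

  #include <cassert>
  #include <cstdio>
  #include <cstring>

  int main()
  {
    const unsigned common_header_len= 19;  /* from the Format description event */
    const char *query= "INSERT INTO t1 VALUES (1), (2), (3)";
    unsigned query_len= (unsigned) std::strlen(query);
    unsigned event_len= common_header_len + query_len;  /* empty post-header */

    /* "Writing": a (zeroed) common header followed by the raw query text. */
    char buf[128]= {0};
    std::memcpy(buf + common_header_len, query, query_len);

    /* "Reading": recover the query exactly as the read constructor does. */
    unsigned m_query_len= event_len - common_header_len;
    const char *m_query_txt= buf + common_header_len;

    assert(m_query_len == query_len);
    std::printf("Query: `%.*s`\n", (int) m_query_len, m_query_txt);
    return 0;
  }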
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The slave
(or mysqlbinlog in the remote case), before requesting a binlog dump via the
COM_BINLOG_DUMP command, informs the master whether it should send these
events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function, which in turn calls
the virtual do_apply_event() member specific to each type of
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means to set the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and
restored on deletion of the Annotate_rows_log_event object.
Is this really needed?
After calling this do_apply_event() function we may not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be
coordinated with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Serg): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
by worklog-noreply@askmonty.org 05 Feb '10
05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen, Serg
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Serg - Fri, 05 Feb 2010, 14:04)=-=-
Observers changed: Knielsen,Serg
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
------------------------------------------------------------
-=-=(View All Progress Notes, 23 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big blobs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
Query event and precedes the first of the Table map events which accompany
the corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
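Schematically, a single annotated statement thus produces a group like the
following (the trailing COMMIT/Xid event is the usual group terminator and
is shown only as an assumption, it is not part of this spec):
 Query('BEGIN')
 Annotate_rows
 Table_map ... Table_map
 Rows ... Rows (the last one with STMT_END_F)
 Query('COMMIT') or Xid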
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value allows annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only in combination with the
log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make
the slave reproduce these events whenever its binlog-annotate-rows-events
global value is ON? Because we may, for example, want to configure a slave
which should reproduce Annotate_rows events while keeping its global
binlog-annotate-rows-events = OFF as the default value for client threads
(see also "How slave treats replicate-annotate-rows-events option" in the
LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events
(if the binary log contains them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect the
compatibility of the binary log.) Here is what the Format description event
looks like with this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for brevity, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is
allocated in the Log_event base object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member,
which calls the virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonyms in replication terminology).
In our case, the data header is empty and the data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once, before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option simply becomes the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This is set during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the COM_BINLOG_DUMP command, informs the master whether it should send
these events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT
server command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command instead of extending COM_BINLOG_DUMP itself,
to avoid possible conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function, which in turn calls
the virtual do_apply_event() member specific to each type of
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means setting the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and
restored when the Annotate_rows_log_event object is deleted.
Is this really needed?
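If such save/restore turns out to be necessary, one possible shape is the
following (a sketch only, not part of the design; the m_saved_* members are
invented here for illustration):
 int Annotate_rows_log_event::do_apply_event(...)
 {
   m_saved_query_txt= thd->query();        /* invented members, */
   m_saved_query_len= thd->query_length(); /* illustration only */
   thd->set_query(m_query_txt, m_query_len);
 }
 Annotate_rows_log_event::~Annotate_rows_log_event()
 {
   thd->set_query(m_saved_query_txt, m_saved_query_len); /* restore */
 }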
After calling this do_apply_event() function we must not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
Instead, we keep a pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be
coordinated with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Serg): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
by worklog-noreply@askmonty.org 05 Feb '10
05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen, Serg
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Serg - Fri, 05 Feb 2010, 14:04)=-=-
Observers changed: Knielsen,Serg
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
------------------------------------------------------------
-=-=(View All Progress Notes, 23 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big blobs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
Query event and precedes the first of the Table map events which accompany
the corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value allows annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only in combination with the
log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make
the slave reproduce these events whenever its binlog-annotate-rows-events
global value is ON? Because we may, for example, want to configure a slave
which should reproduce Annotate_rows events while keeping its global
binlog-annotate-rows-events = OFF as the default value for client threads
(see also "How slave treats replicate-annotate-rows-events option" in the
LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log contains them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
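A typical invocation could then look like this (the binlog file name is, of
course, just an example):

  mysqlbinlog --print-annotate-rows-events master-bin.000001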
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event-number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in the
Format description event. (This extension does not affect the compatibility
of the binary log.) Here is how the Format description event looks with
this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
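A minimal sketch of the corresponding array extension (the exact code shape
is an assumption; only the zero-length entry is implied by the spec, since
Annotate_rows has an empty post-header):

  // In the Format_description_log_event constructor:
  post_header_len[ANNOTATE_ROWS_EVENT - 1]= 0;   // empty post-header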
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is allocated
in the Log_event object from which all Log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member
which calls the virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonyms in replication terminology).
In our case, the data header is empty and the data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once, before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The slave
(or mysqlbinlog in the remote case), before requesting a binlog dump via the
COM_BINLOG_DUMP command, informs the master whether it should send these
events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
conflicts with MySQL/Sun.
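For completeness, a sketch of the slave side issuing this command (the call
site and surrounding details are assumptions; the flag value is the one the
master checks in section 6 below):

  int request_dump(MYSQL* mysql, ...)
  { ...
    uchar options= 0;
    if (opt_replicate_annotate_rows_events)
      options|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
    // inform the master before issuing COM_BINLOG_DUMP
    simple_command(mysql, COM_BINLOG_DUMP_OPTIONS_EXT, &options, 1, 0);
    ...
  }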
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific to each type of
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means to set the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and then
restored when the Annotate_rows_log_event object is deleted.
Is this really needed?
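Should it turn out to be necessary, a minimal save/restore sketch (purely an
assumption, with hypothetical m_saved_* members) could look like:

  int Annotate_rows_log_event::do_apply_event(...)
  {
    m_saved_query_txt= thd->query();        // remember the current query
    m_saved_query_len= thd->query_length();
    thd->set_query(m_query_txt, m_query_len);
  }
  ...
  // and on deletion of the Annotate_rows_log_event object:
  thd->set_query(m_saved_query_txt, m_saved_query_len);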
After calling this do_apply_event() function we may not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
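A possible implementation of these members (an assumed sketch; the design
above only declares them):

  void Relay_log_info::set_annotate_event(Annotate_rows_log_event* ev)
  {
    free_annotate_event();          // drop a previous annotation, if any
    m_annotate_event= ev;
  }

  Annotate_rows_log_event* Relay_log_info::get_annotate_event()
  {
    return m_annotate_event;
  }

  void Relay_log_info::free_annotate_event()
  {
    delete m_annotate_event;
    m_annotate_event= NULL;
  }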
The saved Annotate_rows object should be deleted once all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be coordinated
somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Serg): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen, Serg
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Serg - Fri, 05 Feb 2010, 14:04)=-=-
Observers changed: Knielsen,Serg
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
------------------------------------------------------------
-=-=(View All Progress Notes, 23 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big BLOBs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN' Query
event and precedes the first of the Table map events which accompany the
corresponding rows events. (See the example in the "mysqlbinlog output" section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value allows annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only in combination with the log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make
the slave reproduce these events whenever its binlog-annotate-rows-events
global value is ON? Because, for example, we may want to configure a slave
that reproduces Annotate_rows events while keeping the global
binlog-annotate-rows-events = OFF as the default value for the client
threads (see also "How slave treats replicate-annotate-rows-events
option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log contains them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event-number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in the
Format description event. (This extension does not affect the compatibility
of the binary log.) Here is how the Format description event looks with
this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is allocated
in the Log_event object from which all Log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member
which calls the virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonyms in replication terminology).
In our case, the data header is empty and the data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once, before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The slave
(or mysqlbinlog in the remote case), before requesting a binlog dump via the
COM_BINLOG_DUMP command, informs the master whether it should send these
events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific to each type of
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means to set the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and then
restored when the Annotate_rows_log_event object is deleted.
Is this really needed?
After calling this do_apply_event() function we may not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted once all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be coordinated
somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Guest): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each received event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set the
+current thd query to the one described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly the current values of thd->query and
+thd->query_length should be saved before calling set_query() and
+restored when the Annotate_rows_log_event object is deleted.
+Is this really needed?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediately (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing a new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
-=-=(Alexi - Sat, 19 Dec 2009, 15:41)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.14545 2009-12-19 15:41:21.000000000 +0200
+++ /tmp/wklog.47.new.14545 2009-12-19 15:41:21.000000000 +0200
@@ -1,122 +1,107 @@
-First suggestion:
-
-> I think for this we would actually need a new binlog event type
-> (Comment_log_event?). Unless we want to log an empty statement Query_log_event
-> containing only a comment (a bit of a hack).
-
-New server option
-~~~~~~~~~~~~~~~~~
- --binlog-annotate-rows-events
-
-Setting this option makes RBR (rows-) events in the binary log to be
-preceded by Annotate rows events (see below). The corresponding
-'binlog_annotate_rows_events' system variable is dynamic and has both
-global and session values. Default global value is OFF.
-
-Note. Session values are usefull to make it possible to annotate only
- some selected statements:
+Content
+~~~~~~~
+ 1. Annotate_rows_log_event
+ 2. Server option: --binlog-annotate-rows-events
+ 3. Server option: --replicate-annotate-rows-events
+ 4. mysqlbinlog option: --print-annotate-rows-events
+ 5. mysqlbinlog output
+
+1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Describes the query which caused the corresponding rows events. Has empty
+post-header and contains the query text in its data part. Example:
+
+ ************************
+ ANNOTATE_RBR_EVENT
+ ************************
+ 00000220 | B6 A0 2C 4B | time_when = 1261215926
+ 00000224 | 33 | event_type = 51
+ 00000225 | 64 00 00 00 | server_id = 100
+ 00000229 | 36 00 00 00 | event_len = 54
+ 0000022D | 56 02 00 00 | log_pos = 00000256
+ 00000231 | 00 00 | flags = <none>
+ ------------------------
+ 00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
+ 00000237 | 52 54 20 49 |
+ 0000023B | 4E 54 4F 20 |
+ 0000023F | 74 31 20 56 |
+ 00000243 | 41 4C 55 45 |
+ 00000247 | 53 20 28 31 |
+ 0000024B | 29 2C 20 28 |
+ 0000024F | 32 29 2C 20 |
+ 00000253 | 28 33 29 |
+ ************************
+
+In binary log, Annotate_rows event follows the (possible) 'BEGIN' Query event
+and precedes the first of Table map events which accompany the corresponding
+rows events. (See example in the "mysqlbinlog output" section below.)
+
+2. Server option: --binlog-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the master to write Annotate_rows events to the binary log.
+
+ * Variable Name: binlog_annotate_rows_events
+ * Scope: Global & Session
+ * Access Type: Dynamic
+ * Data Type: bool
+ * Default Value: OFF
+NOTE. Session values allows to annotate only some selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
-New binlog event type
-~~~~~~~~~~~~~~~~~~~~~
- Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
-
-Describes the query which caused the corresponding rows event. In binary log,
-precedes each Table_map_log_event. Contains empty post-header and the query
-text in its data part.
-
-The numeric code for this event must be assigned carefully. It should be
-coordinated with MySQL/Sun, otherwise we can get into a situation where MySQL
-uses the same numeric code for one event that MariaDB uses for
-ANNOTATE_ROWS_EVENT, which would make merging the two impossible.
-
-Example:
-
- ...
- ************************
- ANNOTATE_ROWS_EVENT [51]
- ************************
- 000000C7 | 54 1B 12 4B | time_when = 1259477844
- 000000CB | 33 | event_type = 51
- 000000CC | 64 00 00 00 | server_id = 100
- 000000D0 | 2C 00 00 00 | event_len = 44
- 000000D4 | F3 00 00 00 | log_pos = 000000F3
- 000000D8 | 00 00 | flags = <none>
- ------------------------
- 000000DA | 69 6E 73 65 | query = "insert into t1 values (1)"
- 000000DE | 72 74 20 69 |
- 000000E2 | 6E 74 6F 20 |
- 000000E6 | 74 31 20 76 |
- 000000EA | 61 6C 75 65 |
- 000000EE | 73 20 28 31 |
- 000000F2 | 29 |
- ************************
- TABLE_MAP_EVENT [19]
- ************************
- 000000F3 | 54 1B 12 4B | time_when = 1259477844
- 000000F7 | 13 | event_type = 19
- 000000F8 | 64 00 00 00 | server_id = 100
- 000000FC | 29 00 00 00 | event_len = 41
- 00000100 | 1C 01 00 00 | log_pos = 0000011C
- 00000104 | 00 00 | flags = <none>
- ------------------------
- ...
- ************************
- WRITE_ROWS_EVENT [23]
- ************************
- 0000011C | 54 1B 12 4B | time_when = 1259477844
- 00000120 | 17 | event_type = 23
- 00000121 | 64 00 00 00 | server_id = 100
- 00000125 | 22 00 00 00 | event_len = 34
- 00000129 | 3E 01 00 00 | log_pos = 0000013E
- 0000012D | 10 00 | flags = LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
- ------------------------
- 0000012F | 0F 00 00 00 | table_id = 15
- ...
-
-New mysqlbinlog option
-~~~~~~~~~~~~~~~~~~~~~~
- --print-annotate-rows-events
-
-With this option, mysqlbinlog prints the content of Annotate-rows
-events (if the binary log does contain them). Without this option
-(i.e. by default), mysqlbinlog skips Annotate rows events.
-
-
-mysqlbinlog output
-~~~~~~~~~~~~~~~~~~
-Something like this:
+3. Server option: --replicate-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the slave to reproduce Annotate_rows events recieved from the master
+in its own binary log (sensible only in pair with log-slave-updates option).
+
+ * Variable Name: replicate_annotate_rows_events
+ * Scope: Global
+ * Access Type: Read only
+ * Data Type: bool
+ * Default Value: OFF
+
+NOTE. Why do we additionally need this 'replicate' option? Why not to make
+the slave to reproduce this events when its binlog-annotate-rows-events
+global value is ON? Well, because, for example, we may want to configure
+the slave which should reproduce Annotate_rows events but has global
+binlog-annotate-rows-events = OFF meaning this to be the default value for
+the client threads (see also "How slave treats replicate-annotate-rows-events
+option" in LLD part).
+
+4. mysqlbinlog option: --print-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With this option, mysqlbinlog prints the content of Annotate_rows events (if
+the binary log does contain them). Without this option (i.e. by default),
+mysqlbinlog skips Annotate_rows events.
+5. mysqlbinlog output
+~~~~~~~~~~~~~~~~~~~~~
+With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
+in a form like this:
...
- # at 199
- # at 243
- # at 284
- #091129 9:57:24 server id 100 end_log_pos 243 Query: `insert into t1 values
-(1)`
- #091129 9:57:24 server id 100 end_log_pos 284 Table_map: `test`.`t1` mapped
-to number 15
- #091129 9:57:24 server id 100 end_log_pos 318 Write_rows: table id 15
+ # at 1646
+ #091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
+exec_time=0 error_code=0
+ SET TIMESTAMP=1261215926/*!*/;
+ BEGIN
+ /*!*/;
+ # at 1714
+ # at 1812
+ # at 1853
+ # at 1894
+ # at 1938
+ #091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
+t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
+ #091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
+mapped to number 16
+ #091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
+mapped to number 17
+ #091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
+ #091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
-
- BINLOG '
- VBsSSzNkAAAALAAAAPMAAAAAAGluc2VydCBpbnRvIHQxIHZhbHVlcyAoMSk=
- VBsSSxNkAAAAKQAAABwBAAAAAA8AAAAAAAAABHRlc3QAAnQxAAEDAAE=
- VBsSSxdkAAAAIgAAAD4BAAAQAA8AAAAAAAEAAf/+AQAAAA==
- '/*!*/;
- ### INSERT INTO test.t1
- ### SET
- ### @1=1 /* INT meta=0 nullable=1 is_null=0 */
...
-When master sends Annotate rows events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. Master always sends Annotate_rows events to mysqlbinlog (in
- remote case).
-2. Master sends Annotate_rows events to a slave only if the slave has
- both log-slave-updates and binlog-annotate-rows-events options set.
-
------------------------------------------------------------
-=-=(View All Progress Notes, 22 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big blobs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has an empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN' Query
event and precedes the first of the Table map events which accompany the
corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value allows annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only together with the log-slave-updates
option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make
the slave reproduce these events whenever its binlog-annotate-rows-events
global value is ON? Because, for example, we may want to configure a
slave which should reproduce Annotate_rows events while keeping its global
binlog-annotate-rows-events = OFF as the default value for the client
threads (see also "How slave treats replicate-annotate-rows-events
option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log contains them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
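
For example, to inspect the annotations in a log file (the file name below
is just an illustration):

  mysqlbinlog --print-annotate-rows-events master-bin.000001

Each Annotate_rows event then appears as a "Query: `...`" line before the
corresponding Table map and rows events (see the next section).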
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid a possible event number conflict with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect the
compatibility of the binary log.) Here is how the Format description event
looks with this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is allocated
in Log_event object from which all Log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member
which calls virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonym in replication terminology).
In our case, data header is empty and data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the COM_BINLOG_DUMP command, informs the master whether it should send
these events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT
server command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
conflicts with MySQL/Sun.
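For illustration, the slave IO thread (or mysqlbinlog) could issue this
command just before COM_BINLOG_DUMP; the flag constant and the one-byte
packet layout below are assumptions for the sketch, not final code:

  int request_dump(MYSQL* mysql, ...)
  { ...
    uchar options_buf[1];
    options_buf[0]= 0;
    if (mi->io_thd->variables.binlog_annotate_rows_events)
      options_buf[0]|= BINLOG_SEND_ANNOTATE_ROWS_EVENT; /* assumed flag */
    /* the master stores this byte in thd->binlog_dump_flags_ext */
    simple_command(mysql, COM_BINLOG_DUMP_OPTIONS_EXT, options_buf, 1, 0);
    ...
  }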
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
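
Note that with the COM_BINLOG_DUMP_OPTIONS_EXT scheme from section 5 above,
the master would presumably test the flags saved in
thd->binlog_dump_flags_ext rather than the COM_BINLOG_DUMP flags; a sketch
under that assumption:

  void mysql_binlog_send(THD* thd, ...)
  { ...
    if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
        thd->binlog_dump_flags_ext & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
    {
      my_net_write(net, packet->ptr(), packet->length());
    }
    ...
  }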
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific for each type of the
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means to set the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and
restored when the Annotate_rows_log_event object is deleted.
Is this really needed?
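If such save/restore turns out to be necessary, a minimal sketch could look
like the following (the m_saved_* members are hypothetical, and we assume
the event keeps a pointer to its thd, as Log_event does):

  int Annotate_rows_log_event::do_apply_event(...)
  {
    m_saved_query_txt= thd->query();        /* remember the previous query */
    m_saved_query_len= thd->query_length();
    thd->set_query(m_query_txt, m_query_len);
  }

  Annotate_rows_log_event::~Annotate_rows_log_event()
  {
    thd->set_query(m_saved_query_txt, m_saved_query_len); /* restore */
  }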
After calling this do_apply_event() function we may not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
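A minimal sketch of how these members might be implemented (assumed, not
final code):

  void Relay_log_info::set_annotate_event(Annotate_rows_log_event *event)
  {
    free_annotate_event();     /* drop any previously saved annotation */
    m_annotate_event= event;
  }

  Annotate_rows_log_event* Relay_log_info::get_annotate_event()
  {
    return m_annotate_event;
  }

  void Relay_log_info::free_annotate_event()
  {
    delete m_annotate_event;   /* deleting a NULL pointer is harmless */
    m_annotate_event= NULL;
  }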
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be coordinated
somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Guest): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
by worklog-noreply@askmonty.org 05 Feb '10
05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
-=-=(Alexi - Sat, 19 Dec 2009, 15:41)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.14545 2009-12-19 15:41:21.000000000 +0200
+++ /tmp/wklog.47.new.14545 2009-12-19 15:41:21.000000000 +0200
@@ -1,122 +1,107 @@
-First suggestion:
-
-> I think for this we would actually need a new binlog event type
-> (Comment_log_event?). Unless we want to log an empty statement Query_log_event
-> containing only a comment (a bit of a hack).
-
-New server option
-~~~~~~~~~~~~~~~~~
- --binlog-annotate-rows-events
-
-Setting this option makes RBR (rows-) events in the binary log to be
-preceded by Annotate rows events (see below). The corresponding
-'binlog_annotate_rows_events' system variable is dynamic and has both
-global and session values. Default global value is OFF.
-
-Note. Session values are usefull to make it possible to annotate only
- some selected statements:
+Content
+~~~~~~~
+ 1. Annotate_rows_log_event
+ 2. Server option: --binlog-annotate-rows-events
+ 3. Server option: --replicate-annotate-rows-events
+ 4. mysqlbinlog option: --print-annotate-rows-events
+ 5. mysqlbinlog output
+
+1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Describes the query which caused the corresponding rows events. Has empty
+post-header and contains the query text in its data part. Example:
+
+ ************************
+ ANNOTATE_RBR_EVENT
+ ************************
+ 00000220 | B6 A0 2C 4B | time_when = 1261215926
+ 00000224 | 33 | event_type = 51
+ 00000225 | 64 00 00 00 | server_id = 100
+ 00000229 | 36 00 00 00 | event_len = 54
+ 0000022D | 56 02 00 00 | log_pos = 00000256
+ 00000231 | 00 00 | flags = <none>
+ ------------------------
+ 00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
+ 00000237 | 52 54 20 49 |
+ 0000023B | 4E 54 4F 20 |
+ 0000023F | 74 31 20 56 |
+ 00000243 | 41 4C 55 45 |
+ 00000247 | 53 20 28 31 |
+ 0000024B | 29 2C 20 28 |
+ 0000024F | 32 29 2C 20 |
+ 00000253 | 28 33 29 |
+ ************************
+
+In binary log, Annotate_rows event follows the (possible) 'BEGIN' Query event
+and precedes the first of Table map events which accompany the corresponding
+rows events. (See example in the "mysqlbinlog output" section below.)
+
+2. Server option: --binlog-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the master to write Annotate_rows events to the binary log.
+
+ * Variable Name: binlog_annotate_rows_events
+ * Scope: Global & Session
+ * Access Type: Dynamic
+ * Data Type: bool
+ * Default Value: OFF
+NOTE. Session values allows to annotate only some selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
-New binlog event type
-~~~~~~~~~~~~~~~~~~~~~
- Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
-
-Describes the query which caused the corresponding rows event. In binary log,
-precedes each Table_map_log_event. Contains empty post-header and the query
-text in its data part.
-
-The numeric code for this event must be assigned carefully. It should be
-coordinated with MySQL/Sun, otherwise we can get into a situation where MySQL
-uses the same numeric code for one event that MariaDB uses for
-ANNOTATE_ROWS_EVENT, which would make merging the two impossible.
-
-Example:
-
- ...
- ************************
- ANNOTATE_ROWS_EVENT [51]
- ************************
- 000000C7 | 54 1B 12 4B | time_when = 1259477844
- 000000CB | 33 | event_type = 51
- 000000CC | 64 00 00 00 | server_id = 100
- 000000D0 | 2C 00 00 00 | event_len = 44
- 000000D4 | F3 00 00 00 | log_pos = 000000F3
- 000000D8 | 00 00 | flags = <none>
- ------------------------
- 000000DA | 69 6E 73 65 | query = "insert into t1 values (1)"
- 000000DE | 72 74 20 69 |
- 000000E2 | 6E 74 6F 20 |
- 000000E6 | 74 31 20 76 |
- 000000EA | 61 6C 75 65 |
- 000000EE | 73 20 28 31 |
- 000000F2 | 29 |
- ************************
- TABLE_MAP_EVENT [19]
- ************************
- 000000F3 | 54 1B 12 4B | time_when = 1259477844
- 000000F7 | 13 | event_type = 19
- 000000F8 | 64 00 00 00 | server_id = 100
- 000000FC | 29 00 00 00 | event_len = 41
- 00000100 | 1C 01 00 00 | log_pos = 0000011C
- 00000104 | 00 00 | flags = <none>
- ------------------------
- ...
- ************************
- WRITE_ROWS_EVENT [23]
- ************************
- 0000011C | 54 1B 12 4B | time_when = 1259477844
- 00000120 | 17 | event_type = 23
- 00000121 | 64 00 00 00 | server_id = 100
- 00000125 | 22 00 00 00 | event_len = 34
- 00000129 | 3E 01 00 00 | log_pos = 0000013E
- 0000012D | 10 00 | flags = LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
- ------------------------
- 0000012F | 0F 00 00 00 | table_id = 15
- ...
-
-New mysqlbinlog option
-~~~~~~~~~~~~~~~~~~~~~~
- --print-annotate-rows-events
-
-With this option, mysqlbinlog prints the content of Annotate-rows
-events (if the binary log does contain them). Without this option
-(i.e. by default), mysqlbinlog skips Annotate rows events.
-
-
-mysqlbinlog output
-~~~~~~~~~~~~~~~~~~
-Something like this:
+3. Server option: --replicate-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the slave to reproduce Annotate_rows events recieved from the master
+in its own binary log (sensible only in pair with log-slave-updates option).
+
+ * Variable Name: replicate_annotate_rows_events
+ * Scope: Global
+ * Access Type: Read only
+ * Data Type: bool
+ * Default Value: OFF
+
+NOTE. Why do we additionally need this 'replicate' option? Why not to make
+the slave to reproduce this events when its binlog-annotate-rows-events
+global value is ON? Well, because, for example, we may want to configure
+the slave which should reproduce Annotate_rows events but has global
+binlog-annotate-rows-events = OFF meaning this to be the default value for
+the client threads (see also "How slave treats replicate-annotate-rows-events
+option" in LLD part).
+
+4. mysqlbinlog option: --print-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With this option, mysqlbinlog prints the content of Annotate_rows events (if
+the binary log does contain them). Without this option (i.e. by default),
+mysqlbinlog skips Annotate_rows events.
+5. mysqlbinlog output
+~~~~~~~~~~~~~~~~~~~~~
+With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
+in a form like this:
...
- # at 199
- # at 243
- # at 284
- #091129 9:57:24 server id 100 end_log_pos 243 Query: `insert into t1 values
-(1)`
- #091129 9:57:24 server id 100 end_log_pos 284 Table_map: `test`.`t1` mapped
-to number 15
- #091129 9:57:24 server id 100 end_log_pos 318 Write_rows: table id 15
+ # at 1646
+ #091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
+exec_time=0 error_code=0
+ SET TIMESTAMP=1261215926/*!*/;
+ BEGIN
+ /*!*/;
+ # at 1714
+ # at 1812
+ # at 1853
+ # at 1894
+ # at 1938
+ #091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
+t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
+ #091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
+mapped to number 16
+ #091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
+mapped to number 17
+ #091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
+ #091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
-
- BINLOG '
- VBsSSzNkAAAALAAAAPMAAAAAAGluc2VydCBpbnRvIHQxIHZhbHVlcyAoMSk=
- VBsSSxNkAAAAKQAAABwBAAAAAA8AAAAAAAAABHRlc3QAAnQxAAEDAAE=
- VBsSSxdkAAAAIgAAAD4BAAAQAA8AAAAAAAEAAf/+AQAAAA==
- '/*!*/;
- ### INSERT INTO test.t1
- ### SET
- ### @1=1 /* INT meta=0 nullable=1 is_null=0 */
...
-When master sends Annotate rows events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. Master always sends Annotate_rows events to mysqlbinlog (in
- remote case).
-2. Master sends Annotate_rows events to a slave only if the slave has
- both log-slave-updates and binlog-annotate-rows-events options set.
-
------------------------------------------------------------
-=-=(View All Progress Notes, 22 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big blobs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has an empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
Query event and precedes the first of the Table_map events which accompany
the corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
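Schematically, one annotated statement thus appears in the binary log in
the following order (an illustration of the ordering, not actual output):
   Query: 'BEGIN'
   Annotate_rows: <text of the statement>
   Table_map ... Table_map
   Rows ... Rows (the last one carrying the STMT_END_F flag)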
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value makes it possible to annotate only selected
statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (meaningful only in combination with the
log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we need this additional 'replicate' option? Why not simply
make the slave reproduce these events whenever its global
binlog-annotate-rows-events value is ON? Because, for example, we may want
to configure a slave that reproduces Annotate_rows events while keeping the
global binlog-annotate-rows-events = OFF as the default value for the
client threads (see also "How slave treats replicate-annotate-rows-events
option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log does contain them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect binary log
compatibility.) Here is what the Format description event looks like with
this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the
buffer containing the binary log representation of the event (this buffer
is allocated in the Log_event object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
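As a sanity check against the ANNOTATE_ROWS_EVENT dump above: there
event_len = 54 and common_header_len = 19, so m_query_len = 54 - 19 = 35,
which is exactly the length of "INSERT INTO t1 VALUES (1), (2), (3)".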
The events are written to the binary log by the Log_event::write() member
which calls virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonym in replication terminology).
In our case, data header is empty and data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to that slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the COM_BINLOG_DUMP command, informs the master whether it should send
these events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT
server command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command rather than reusing COM_BINLOG_DUMP to avoid
possible conflicts with MySQL/Sun.
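The client side of this exchange might look as follows (a sketch only; the
one-byte payload layout and the reuse of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
flag value here are assumptions, not part of the spec):
   uchar buf[1];
   /* assumed: bit meaning "please send Annotate_rows events" */
   buf[0]= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
   /* sent before the usual COM_BINLOG_DUMP request */
   simple_command(mysql, COM_BINLOG_DUMP_OPTIONS_EXT, buf, 1, 0);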
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
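Note that, with the COM_BINLOG_DUMP_OPTIONS_EXT scheme from section 5, the
'flags' tested here would presumably have to be combined with the stored
thd->binlog_dump_flags_ext; the spec does not show that step, so the
following line is only a guess at the wiring:
   /* hypothetical: merge the per-connection extended flags */
   flags|= thd->binlog_dump_flags_ext;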
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific for each type of the
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means setting the
current thd query to the one described by the event, i.e. to the query
which caused the subsequent Rows events (see "How Master writes
Annotate_rows events to the binary log" to follow what happens when the
subsequent Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and restored
when the Annotate_rows_log_event object is deleted.
Is that really needed?
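If it is, a minimal sketch could look like this (the m_save_query_txt and
m_save_query_len members are assumptions, not present in the spec):
   int Annotate_rows_log_event::do_apply_event(...)
   {
     m_save_query_txt= thd->query();        /* save the previous query */
     m_save_query_len= thd->query_length(); /* to restore on deletion  */
     thd->set_query(m_query_txt, m_query_len);
     return 0;
   }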
After calling this do_apply_event() function we must not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
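For illustration, an assumed annotated event group would pass through the
loop above as follows:
   ANNOTATE_ROWS_EVENT           -> applied, kept via set_annotate_event()
   TABLE_MAP_EVENT               -> applied, deleted
   WRITE_ROWS_EVENT              -> applied, deleted
   WRITE_ROWS_EVENT (STMT_END_F) -> applied, free_annotate_event() deletes
                                    the saved Annotate_rows object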
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be
coordinated somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers to avoid possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this, suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] Updated (by Guest): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 05 Feb '10
by worklog-noreply@askmonty.org 05 Feb '10
05 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen
CATEGORY.......: Server-Sprint
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: In-Progress
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Category updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Server-RawIdeaBin
+Server-Sprint
-=-=(Guest - Fri, 05 Feb 2010, 13:40)=-=-
Status updated.
--- /tmp/wklog.47.old.9197 2010-02-05 13:40:36.000000000 +0200
+++ /tmp/wklog.47.new.9197 2010-02-05 13:40:36.000000000 +0200
@@ -1 +1 @@
-Un-Assigned
+In-Progress
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
-=-=(Alexi - Sat, 19 Dec 2009, 15:41)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.14545 2009-12-19 15:41:21.000000000 +0200
+++ /tmp/wklog.47.new.14545 2009-12-19 15:41:21.000000000 +0200
@@ -1,122 +1,107 @@
-First suggestion:
-
-> I think for this we would actually need a new binlog event type
-> (Comment_log_event?). Unless we want to log an empty statement Query_log_event
-> containing only a comment (a bit of a hack).
-
-New server option
-~~~~~~~~~~~~~~~~~
- --binlog-annotate-rows-events
-
-Setting this option makes RBR (rows-) events in the binary log to be
-preceded by Annotate rows events (see below). The corresponding
-'binlog_annotate_rows_events' system variable is dynamic and has both
-global and session values. Default global value is OFF.
-
-Note. Session values are usefull to make it possible to annotate only
- some selected statements:
+Content
+~~~~~~~
+ 1. Annotate_rows_log_event
+ 2. Server option: --binlog-annotate-rows-events
+ 3. Server option: --replicate-annotate-rows-events
+ 4. mysqlbinlog option: --print-annotate-rows-events
+ 5. mysqlbinlog output
+
+1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Describes the query which caused the corresponding rows events. Has empty
+post-header and contains the query text in its data part. Example:
+
+ ************************
+ ANNOTATE_RBR_EVENT
+ ************************
+ 00000220 | B6 A0 2C 4B | time_when = 1261215926
+ 00000224 | 33 | event_type = 51
+ 00000225 | 64 00 00 00 | server_id = 100
+ 00000229 | 36 00 00 00 | event_len = 54
+ 0000022D | 56 02 00 00 | log_pos = 00000256
+ 00000231 | 00 00 | flags = <none>
+ ------------------------
+ 00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
+ 00000237 | 52 54 20 49 |
+ 0000023B | 4E 54 4F 20 |
+ 0000023F | 74 31 20 56 |
+ 00000243 | 41 4C 55 45 |
+ 00000247 | 53 20 28 31 |
+ 0000024B | 29 2C 20 28 |
+ 0000024F | 32 29 2C 20 |
+ 00000253 | 28 33 29 |
+ ************************
+
+In binary log, Annotate_rows event follows the (possible) 'BEGIN' Query event
+and precedes the first of Table map events which accompany the corresponding
+rows events. (See example in the "mysqlbinlog output" section below.)
+
+2. Server option: --binlog-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the master to write Annotate_rows events to the binary log.
+
+ * Variable Name: binlog_annotate_rows_events
+ * Scope: Global & Session
+ * Access Type: Dynamic
+ * Data Type: bool
+ * Default Value: OFF
+NOTE. Session values allows to annotate only some selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
-New binlog event type
-~~~~~~~~~~~~~~~~~~~~~
- Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
-
-Describes the query which caused the corresponding rows event. In binary log,
-precedes each Table_map_log_event. Contains empty post-header and the query
-text in its data part.
-
-The numeric code for this event must be assigned carefully. It should be
-coordinated with MySQL/Sun, otherwise we can get into a situation where MySQL
-uses the same numeric code for one event that MariaDB uses for
-ANNOTATE_ROWS_EVENT, which would make merging the two impossible.
-
-Example:
-
- ...
- ************************
- ANNOTATE_ROWS_EVENT [51]
- ************************
- 000000C7 | 54 1B 12 4B | time_when = 1259477844
- 000000CB | 33 | event_type = 51
- 000000CC | 64 00 00 00 | server_id = 100
- 000000D0 | 2C 00 00 00 | event_len = 44
- 000000D4 | F3 00 00 00 | log_pos = 000000F3
- 000000D8 | 00 00 | flags = <none>
- ------------------------
- 000000DA | 69 6E 73 65 | query = "insert into t1 values (1)"
- 000000DE | 72 74 20 69 |
- 000000E2 | 6E 74 6F 20 |
- 000000E6 | 74 31 20 76 |
- 000000EA | 61 6C 75 65 |
- 000000EE | 73 20 28 31 |
- 000000F2 | 29 |
- ************************
- TABLE_MAP_EVENT [19]
- ************************
- 000000F3 | 54 1B 12 4B | time_when = 1259477844
- 000000F7 | 13 | event_type = 19
- 000000F8 | 64 00 00 00 | server_id = 100
- 000000FC | 29 00 00 00 | event_len = 41
- 00000100 | 1C 01 00 00 | log_pos = 0000011C
- 00000104 | 00 00 | flags = <none>
- ------------------------
- ...
- ************************
- WRITE_ROWS_EVENT [23]
- ************************
- 0000011C | 54 1B 12 4B | time_when = 1259477844
- 00000120 | 17 | event_type = 23
- 00000121 | 64 00 00 00 | server_id = 100
- 00000125 | 22 00 00 00 | event_len = 34
- 00000129 | 3E 01 00 00 | log_pos = 0000013E
- 0000012D | 10 00 | flags = LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
- ------------------------
- 0000012F | 0F 00 00 00 | table_id = 15
- ...
-
-New mysqlbinlog option
-~~~~~~~~~~~~~~~~~~~~~~
- --print-annotate-rows-events
-
-With this option, mysqlbinlog prints the content of Annotate-rows
-events (if the binary log does contain them). Without this option
-(i.e. by default), mysqlbinlog skips Annotate rows events.
-
-
-mysqlbinlog output
-~~~~~~~~~~~~~~~~~~
-Something like this:
+3. Server option: --replicate-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the slave to reproduce Annotate_rows events recieved from the master
+in its own binary log (sensible only in pair with log-slave-updates option).
+
+ * Variable Name: replicate_annotate_rows_events
+ * Scope: Global
+ * Access Type: Read only
+ * Data Type: bool
+ * Default Value: OFF
+
+NOTE. Why do we additionally need this 'replicate' option? Why not to make
+the slave to reproduce this events when its binlog-annotate-rows-events
+global value is ON? Well, because, for example, we may want to configure
+the slave which should reproduce Annotate_rows events but has global
+binlog-annotate-rows-events = OFF meaning this to be the default value for
+the client threads (see also "How slave treats replicate-annotate-rows-events
+option" in LLD part).
+
+4. mysqlbinlog option: --print-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With this option, mysqlbinlog prints the content of Annotate_rows events (if
+the binary log does contain them). Without this option (i.e. by default),
+mysqlbinlog skips Annotate_rows events.
+5. mysqlbinlog output
+~~~~~~~~~~~~~~~~~~~~~
+With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
+in a form like this:
...
- # at 199
- # at 243
- # at 284
- #091129 9:57:24 server id 100 end_log_pos 243 Query: `insert into t1 values
-(1)`
- #091129 9:57:24 server id 100 end_log_pos 284 Table_map: `test`.`t1` mapped
-to number 15
- #091129 9:57:24 server id 100 end_log_pos 318 Write_rows: table id 15
+ # at 1646
+ #091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
+exec_time=0 error_code=0
+ SET TIMESTAMP=1261215926/*!*/;
+ BEGIN
+ /*!*/;
+ # at 1714
+ # at 1812
+ # at 1853
+ # at 1894
+ # at 1938
+ #091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
+t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
+ #091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
+mapped to number 16
+ #091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
+mapped to number 17
+ #091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
+ #091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
-
- BINLOG '
- VBsSSzNkAAAALAAAAPMAAAAAAGluc2VydCBpbnRvIHQxIHZhbHVlcyAoMSk=
- VBsSSxNkAAAAKQAAABwBAAAAAA8AAAAAAAAABHRlc3QAAnQxAAEDAAE=
- VBsSSxdkAAAAIgAAAD4BAAAQAA8AAAAAAAEAAf/+AQAAAA==
- '/*!*/;
- ### INSERT INTO test.t1
- ### SET
- ### @1=1 /* INT meta=0 nullable=1 is_null=0 */
...
-When master sends Annotate rows events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. Master always sends Annotate_rows events to mysqlbinlog (in
- remote case).
-2. Master sends Annotate_rows events to a slave only if the slave has
- both log-slave-updates and binlog-annotate-rows-events options set.
-
------------------------------------------------------------
-=-=(View All Progress Notes, 22 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost is that the binlog may grow to approximately 2x in size
(inserts of big BLOBs in particular would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
Query event and precedes the first of the Table_map events which accompany
the corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value makes it possible to annotate only selected
statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (meaningful only in combination with the
log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we need this additional 'replicate' option? Why not simply
make the slave reproduce these events whenever its global
binlog-annotate-rows-events value is ON? Because, for example, we may want
to configure a slave that reproduces Annotate_rows events while keeping the
global binlog-annotate-rows-events = OFF as the default value for the
client threads (see also "How slave treats replicate-annotate-rows-events
option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log does contain them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect binary log
compatibility.) Here is what the Format description event looks like with
this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the
buffer containing the binary log representation of the event (this buffer
is allocated in the Log_event object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member
which calls virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonym in replication terminology).
In our case, data header is empty and data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to that slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the COM_BINLOG_DUMP command, informs the master whether it should send
these events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT
server command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command rather than reusing COM_BINLOG_DUMP to avoid
possible conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific for each type of the
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means setting the
current thd query to the one described by the event, i.e. to the query
which caused the subsequent Rows events (see "How Master writes
Annotate_rows events to the binary log" to follow what happens when the
subsequent Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and restored
when the Annotate_rows_log_event object is deleted.
Is that really needed?
After calling this do_apply_event() function we must not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be
coordinated somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers to avoid possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this, suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
[Maria-developers] FYI: askmonty.org/buildbot is moving to buildbot.askmonty.org/buildbot
by Daniel Bartholomew 04 Feb '10
by Daniel Bartholomew 04 Feb '10
04 Feb '10
All,
This is an FYI for everyone who uses our buildbot web interface:
Due to an upcoming server move I am in the process of moving our
buildbot web interface from http://askmonty.org/buildbot to
http://buildbot.askmonty.org/buildbot
Both of the above currently work (and point at the same data), but as
of Friday 5 Feb 2010 at approx 09:00 US Eastern time,
http://askmonty.org/buildbot will turn into a redirect. The redirect
will handle all of your old bookmarks (i.e. they should still work), but
I thought I should at least let everyone know so that you aren't
surprised by the move. I've already updated all of the links I could
find on the askmonty.org wiki.
And if you are wondering why I kept the /buildbot subdirectory in
place on the new subdomain, well, it was easier and safer to keep it in
place, so I did. If you connect directly to
http://buildbot.askmonty.org you will automatically be redirected to
the /buildbot subdir.
Thanks!
--
Daniel Bartholomew
Monty Program - http://askmonty.org
[Maria-developers] Rev 157: saving uncommitted changes in /etc prior to apt run in file:///etc/
by timour@askmonty.org 04 Feb '10
by timour@askmonty.org 04 Feb '10
04 Feb '10
At file:///etc/
------------------------------------------------------------
revno: 157
revision-id: timour(a)askmonty.org-20100204112537-3a6fyd3ibqzi73wc
parent: root@lamia-20100203064317-whagbeui3pyl1bck
committer: timour(a)askmonty.org
branch nick: lamia /etc repository
timestamp: Thu 2010-02-04 13:25:37 +0200
message:
saving uncommitted changes in /etc prior to apt run
=== modified file 'cups/subscriptions.conf'
--- a/cups/subscriptions.conf 2010-02-03 06:43:17 +0000
+++ b/cups/subscriptions.conf 2010-02-04 11:25:37 +0000
@@ -1,11 +1,11 @@
# Subscription configuration file for CUPS v1.4.1
-# Written by cupsd on 2010-02-02 08:18
-NextSubscriptionId 101
-<Subscription 100>
+# Written by cupsd on 2010-02-04 13:19
+NextSubscriptionId 102
+<Subscription 101>
Events printer-state-changed printer-restarted printer-shutdown printer-stopped printer-added printer-deleted job-state-changed job-created job-completed job-stopped job-progress
Owner tsk
LeaseDuration 86400
Interval 0
-ExpirationTime 1265177812
-NextEventId 8
+ExpirationTime 1265368739
+NextEventId 1
</Subscription>
=== modified file 'cups/subscriptions.conf.O'
--- a/cups/subscriptions.conf.O 2010-02-03 06:43:17 +0000
+++ b/cups/subscriptions.conf.O 2010-02-04 11:25:37 +0000
@@ -1,11 +1,3 @@
# Subscription configuration file for CUPS v1.4.1
-# Written by cupsd on 2010-02-02 08:17
+# Written by cupsd on 2010-02-03 08:43
NextSubscriptionId 101
-<Subscription 100>
-Events printer-state-changed printer-restarted printer-shutdown printer-stopped printer-added printer-deleted job-state-changed job-created job-completed job-stopped job-progress
-Owner tsk
-LeaseDuration 86400
-Interval 0
-ExpirationTime 1265177812
-NextEventId 7
-</Subscription>
=== modified file 'resolv.conf'
--- a/resolv.conf 2010-02-01 11:56:20 +0000
+++ b/resolv.conf 2010-02-04 11:25:37 +0000
@@ -1,2 +1,3 @@
# Generated by NetworkManager
-nameserver 192.168.2.1
+nameserver 212.50.0.10
+nameserver 212.50.10.50

[Maria-developers] bzr commit into file:///etc/ branch (timour:157)
by timour@askmonty.org 04 Feb '10
#At file:///etc/ based on revid:root@lamia-20100203064317-whagbeui3pyl1bck
157 timour(a)askmonty.org 2010-02-04
saving uncommitted changes in /etc prior to apt run
modified:
cups/subscriptions.conf
cups/subscriptions.conf.O
resolv.conf
=== modified file 'cups/subscriptions.conf'
--- a/cups/subscriptions.conf 2010-02-03 06:43:17 +0000
+++ b/cups/subscriptions.conf 2010-02-04 11:25:37 +0000
@@ -1,11 +1,11 @@
# Subscription configuration file for CUPS v1.4.1
-# Written by cupsd on 2010-02-02 08:18
-NextSubscriptionId 101
-<Subscription 100>
+# Written by cupsd on 2010-02-04 13:19
+NextSubscriptionId 102
+<Subscription 101>
Events printer-state-changed printer-restarted printer-shutdown printer-stopped printer-added printer-deleted job-state-changed job-created job-completed job-stopped job-progress
Owner tsk
LeaseDuration 86400
Interval 0
-ExpirationTime 1265177812
-NextEventId 8
+ExpirationTime 1265368739
+NextEventId 1
</Subscription>
=== modified file 'cups/subscriptions.conf.O'
--- a/cups/subscriptions.conf.O 2010-02-03 06:43:17 +0000
+++ b/cups/subscriptions.conf.O 2010-02-04 11:25:37 +0000
@@ -1,11 +1,3 @@
# Subscription configuration file for CUPS v1.4.1
-# Written by cupsd on 2010-02-02 08:17
+# Written by cupsd on 2010-02-03 08:43
NextSubscriptionId 101
-<Subscription 100>
-Events printer-state-changed printer-restarted printer-shutdown printer-stopped printer-added printer-deleted job-state-changed job-created job-completed job-stopped job-progress
-Owner tsk
-LeaseDuration 86400
-Interval 0
-ExpirationTime 1265177812
-NextEventId 7
-</Subscription>
=== modified file 'resolv.conf'
--- a/resolv.conf 2010-02-01 11:56:20 +0000
+++ b/resolv.conf 2010-02-04 11:25:37 +0000
@@ -1,2 +1,3 @@
# Generated by NetworkManager
-nameserver 192.168.2.1
+nameserver 212.50.0.10
+nameserver 212.50.10.50

[Maria-developers] Rev 8: Merge. in file:///Users/hakan/work/monty_program/mariadb-tools/
by Hakan Kuecuekyilmaz 04 Feb '10
At file:///Users/hakan/work/monty_program/mariadb-tools/
------------------------------------------------------------
revno: 8 [merge]
revision-id: hakan(a)askmonty.org-20100204111840-0d9u3nyio2h8tavi
parent: hakan(a)askmonty.org-20100204011956-u74a4es6oogvd50w
parent: knielsen@hasky-20100202151012-pjcblb2v3bqsp5eo
committer: Hakan Kuecuekyilmaz <hakan(a)askmonty.org>
branch nick: mariadb-tools
timestamp: Thu 2010-02-04 12:18:40 +0100
message:
Merge.
modified:
buildbot/maria-master.cfg mariamaster.cfg-20091218103450-cvifjz3i70oerkej-1
buildbot/runvm runvm-20091218082659-ept0jpsqa5e8jno6-2
=== modified file 'buildbot/maria-master.cfg'
--- a/buildbot/maria-master.cfg 2010-01-02 21:17:58 +0000
+++ b/buildbot/maria-master.cfg 2010-02-02 15:10:12 +0000
@@ -59,7 +59,6 @@
,mkSlave("psergey-pylon-amd64", max_builds=1)
,mkSlave("psergey-foxhole-x86", max_builds=1)
,mkSlave("psergey-pslp2-x86", max_builds=1)
- ,mkSlave("hakan-mac-g5", max_builds=1)
,mkSlave("work-opensuse-amd64", max_builds=1)
,mkSlave("psergey-win32box", max_builds=1)
,mkSlave("vm-win-1", max_builds=1)
@@ -114,7 +113,7 @@
"lp:~maria-captains/maria/maria-5.2-merge-5.1" : "maria-5.2-merge-5.1",
"lp:~maria-captains/maria/5.2-dsmrr" : "5.2-dsmrr",
"lp:~maria-captains/maria/5.3" : "5.3",
- "lp:~maria-captains/maria/5.3" : "5.3-sj-subqueries"
+ "lp:~maria-captains/maria/5.3-sj-subqueries" : "5.3-sj-subqueries"
}
mailSource = mail.BzrLaunchpadEmailMaildirSource("/var/lib/buildbot/Maildir",
branchMap=myBranchMap)
@@ -163,7 +162,7 @@
# "lenny-amd64-dbg", "adutko-alpha",
"debian5-i386-fulltest",
"jaunty-x86-valgrind", "jaunty-amd64-rel",
- "gentoo-x86-dbg", "ubuntu-x86-dbg", "macosx-g5-dbg",
+ "gentoo-x86-dbg", "ubuntu-x86-dbg",
"macosx-x86-bld", "work-amd64-valgrind", "winxp-x86-nmake",
"win32-rel-nmake", "hardy-amd64-fulltest",
"gentoo-amd64-sanja", "opensolaris-511-x86",
@@ -180,6 +179,7 @@
branches=["mariadb-5.1-knielsen", "5.1-release", "5.1",
"5.1.39-oqgraph", "5.1-merge",
"mariadb-5.1-monty",
+ "5.2", "mariadb-5.2-monty",
],
treeStableTimer=1, # 1 sec for bzr
properties= {"bakebranch": "lp:~maria-captains/ourdelta/ourdelta-montyprogram-fixes"},
@@ -197,7 +197,9 @@
"kvm-deb-jaunty-amd64", "kvm-deb-jaunty-x86",
"kvm-deb-karmic-amd64", "kvm-deb-karmic-x86",
"kvm-deb-lucid-amd64", "kvm-deb-lucid-x86",
- "kvm-bintar-hardy-amd64", "kvm-bintar-hardy-x86",]))
+ "kvm-bintar-hardy-amd64", "kvm-bintar-hardy-x86",
+ "opensolaris-511-bintar",
+ ]))
####### BUILDERS
@@ -529,7 +531,7 @@
"slavename": "adutko-centos5-amd64",
"builddir": "centos5-amd64-minimal",
"factory": f_minimal,
- "category": "experimental",
+ "category": "main",
}
f_win32_rel_nmake = factory.BuildFactory()
@@ -699,7 +701,7 @@
"slavename": "adutko-ultrasparc3",
"builddir": "adutko-ultrasparc3",
"factory": f_sol_sparc_32,
- "category": "new",
+ "category": "main",
}
f_dbg_alpha = factory.BuildFactory()
@@ -771,23 +773,6 @@
"category": "experimental",
}
-f_dbg_g5 = factory.BuildFactory()
-f_dbg_g5.addStep(bzr_shared_repo)
-f_dbg_g5.addStep(bzr_checkout)
-f_dbg_g5.addStep(getCompileStep(["BUILD/compile-ppc-max"]))
-f_dbg_g5.addStep(getMTR(
- test_type="nm",
- test_info="Normal run, no --ps-protocol",
- command=["sh", "-c", "cd mysql-test && exec perl mysql-test-run.pl --force --retry=3 --skip-ndb --testcase-timeout=45 --suite-timeout=1080 --mysqld=--skip-safemalloc"],
- timeout=2700))
-
-bld_mac_g5 = {'name': "macosx-g5-dbg",
- 'slavename': "hakan-mac-g5",
- 'builddir': "hakan-mac-g5",
- 'factory': f_dbg_g5,
- "category": "experimental",
- }
-
f_mac_x86 = factory.BuildFactory()
f_mac_x86.addStep(bzr_shared_repo)
f_mac_x86.addStep(bzr_checkout)
@@ -878,6 +863,36 @@
"category": "experimental",
}
+# The trees for which we save binary packages.
+savedPackageBranches= ["5.1-release", "5.2-release", "5.3-release"]
+
+# Get a build step that will archive binary packages (or source tarball).
+# Only the newest 3 builds are saved for each (branch, builder) combination.
+# The packages are saved under the build number of the tarbake step, so it is
+# easy to locate all packages for a given release.
+def getPackageArchiveStep(source, tarbuildnum):
+ cmdText= ("TARBUILDNUM='" + tarbuildnum + "'\n" +
+ "SRC='" + source + "'\n" + """
+BUILDERNAME='%(buildername)s'
+BRANCH='%(branch)s'
+BASE="/archive/pack/$BRANCH"
+DST="$BASE/build-$TARBUILDNUM/$BUILDERNAME"
+set -ex
+mkdir -p "$BASE"
+rm -Rf "$DST"
+(ls -td $BASE/build-*/"$BUILDERNAME" || : ) | (read DUMMY || exit 0; read DUMMY || exit 0; while read VICTIM; do rm -Rf "$VICTIM"; done)
+rmdir $BASE/build-* 2>/dev/null || :
+mkdir -p "$DST"
+cp -r "$SRC" "$DST/"
+""")
+ return ShellCommand(
+ doStepIf=(lambda(step): step.getProperty("branch") in savedPackageBranches),
+ description=["archiving"],
+ descriptionDone=["archive"],
+ command=["sh", "-c", WithProperties(cmdText)],
+ )
+
+
f_kvm_tarbake_jaunty_x86= factory.BuildFactory()
f_kvm_tarbake_jaunty_x86.addStep(Compile(
description=["making", "dist"],
@@ -906,35 +921,38 @@
"""
set -ex
cd buildbot/build/$(cat buildbot/build/bakery.txt)/
-echo mariadb-*.tar.gz > ../distname.txt
-mv $(cat ../distname.txt) ../
+basename mariadb-*.tar.gz .tar.gz > ../distdirname.txt
+mv "$(cat ../distdirname.txt).tar.gz" ../
""",
- "= scp -P 2223 buildbot@localhost:buildbot/build/distname.txt .",
+ "= scp -P 2223 buildbot@localhost:buildbot/build/distdirname.txt .",
"= scp -P 2223 buildbot@localhost:buildbot/build/bakery.txt .",
"= scp -P 2223 'buildbot@localhost:buildbot/build/mariadb-*.tar.gz' .",
"= scp -P 2223 'buildbot@localhost:buildbot/build/bakery-*.tar.gz' .",
],
))
f_kvm_tarbake_jaunty_x86.addStep(SetProperty(
- property="distname",
- command=["cat", "distname.txt"],
+ property="distdirname",
+ command=["cat", "distdirname.txt"],
))
f_kvm_tarbake_jaunty_x86.addStep(SetProperty(
property="bakery",
command=["cat", "bakery.txt"],
))
-f_kvm_tarbake_jaunty_x86.addStep(FileUpload(slavesrc=WithProperties("%(distname)s"),
- masterdest=WithProperties("/var/lib/buildbot/OQ-tarballs/%(buildnumber)s:%(distname)s")))
+f_kvm_tarbake_jaunty_x86.addStep(getPackageArchiveStep("%(distdirname)s.tar.gz", "%(buildnumber)s"))
+f_kvm_tarbake_jaunty_x86.addStep(FileUpload(slavesrc=WithProperties("%(distdirname)s.tar.gz"),
+ masterdest=WithProperties("/var/lib/buildbot/OQ-tarballs/%(buildnumber)s:%(distdirname)s.tar.gz")))
f_kvm_tarbake_jaunty_x86.addStep(FileUpload(slavesrc=WithProperties("%(bakery)s.tar.gz"),
masterdest=WithProperties("/var/lib/buildbot/OQ-bakeries/%(buildnumber)s:%(bakery)s.tar.gz")))
f_kvm_tarbake_jaunty_x86.addStep(Trigger(
schedulerNames=["kvm-ourdelta-sched"],
waitForFinish=False,
updateSourceStamp=True,
- set_properties={ "tarballpath": WithProperties("/var/lib/buildbot/OQ-tarballs/%(buildnumber)s:%(distname)s"),
+ set_properties={ "tarballpath": WithProperties("/var/lib/buildbot/OQ-tarballs/%(buildnumber)s:%(distdirname)s.tar.gz"),
"bakerypath": WithProperties("/var/lib/buildbot/OQ-bakeries/%(buildnumber)s:%(bakery)s.tar.gz"),
+ "distname": WithProperties("%(distdirname)s.tar.gz"),
+ "tarbuildnum": WithProperties("%(buildnumber)s"),
},
- copy_properties=[ "distname", "bakery" ]))
+ copy_properties=[ "distdirname", "bakery" ]))
bld_kvm_tarbake_jaunty_x86 = {"name": "kvm-tarbake-jaunty-x86",
"slavename": "knielsen-kvm-x86",
@@ -977,6 +995,7 @@
"= rm -Rf rpms && mkdir rpms",
"= scp -P "+port+" 'buildbot@localhost:/usr/src/redhat/RPMS/"+arch+"/*.rpm' rpms/",
]))
+ rpm_fact.addStep(getPackageArchiveStep("rpms", "%(tarbuildnum)s"))
rpm_fact.addStep(Test(
description=["testing", "install"],
descriptionDone=["test", "install"],
@@ -1013,11 +1032,22 @@
deb_fact.addStep(FileDownload(
mastersrc=WithProperties("%(bakerypath)s"),
slavedest=WithProperties("%(bakery)s.tar.gz")))
+ # Extract the compiler warning suppressions file from the source tarball.
+ deb_fact.addStep(ShellCommand(
+ description=["getting", ".supp"],
+ descriptionDone=["get", ".supp"],
+ command=["sh", "-c", WithProperties("""
+rm -f compiler_warnings.supp
+tar zxf "%(distname)s" --strip 2 "$(basename %(distname)s .tar.gz)/support-files/compiler_warnings.supp"
+exit 0 # best-effort, not fatal if no suppression file
+""")]))
deb_fact.addStep(Compile(
description=["making", "debs"],
descriptionDone=["make", "debs"],
logfiles={"kernel": "kernel_"+port+".log"},
warningPattern=gccWarningPattern,
+ warningExtractor=Compile.warnExtractFromRegexpGroups,
+ suppressionFile=WithProperties("compiler_warnings.supp"),
command=["runvm", "--base-image=/kvm/vms/"+kvm_image+"-build.qcow2"] + args +["--logfile=kernel_"+port+".log", "vm-tmp-"+port+".qcow2",
"rm -Rf buildbot && mkdir buildbot",
WithProperties("= scp -P "+port+" %(distname)s buildbot@localhost:buildbot/"),
@@ -1039,6 +1069,7 @@
"= rm -Rf debs",
"= scp -r -P "+port+" buildbot@localhost:buildbot/debs .",
]))
+ deb_fact.addStep(getPackageArchiveStep("debs", "%(tarbuildnum)s"))
deb_fact.addStep(Test(
description=["testing", "install"],
descriptionDone=["test", "install"],
@@ -1144,6 +1175,7 @@
property="bindistname",
command=["cat", "bindistname.txt"],
))
+ bin_fact.addStep(getPackageArchiveStep("%(bindistname)s.tar.gz", "%(tarbuildnum)s"))
bin_fact.addStep(Test(
description=["testing", "bintar"],
descriptionDone=["test", "bintar"],
@@ -1178,14 +1210,55 @@
'slavename': 'opensolaris-x86',
'builddir': 'opensolaris-511-x86',
'factory': f_dbg_sol_64,
- "category": "new",
+ "category": "main",
}
+f_sol_64_bintar = factory.BuildFactory()
+f_sol_64_bintar.addStep(FileDownload(mastersrc=WithProperties("%(tarballpath)s"),
+ slavedest=WithProperties("%(distname)s")))
+f_sol_64_bintar.addStep(ShellCommand(
+ description="untarring", descriptionDone="untar",
+ command=["sh", "-c", WithProperties("rm -Rf %(distdirname)s && tar zxf %(distname)s")]))
+f_sol_64_bintar.addStep(getCompileStep(["sh", "-c", WithProperties("cd %(distdirname)s && BUILD/compile-solaris-amd64-debug")],
+ subdir="%(distdirname)s"))
+f_sol_64_bintar.addStep(ShellCommand(
+ description="packaging", descriptionDone="package",
+ command=["sh", "-c", WithProperties("""
+set -ex
+cd %(distdirname)s
+scripts/make_binary_distribution
+ORIG_TAR=$(echo mariadb-*.tar.gz)
+ORIG_DIR=$(basename ${ORIG_TAR} .tar.gz)
+NEW_DIR="$(echo "${ORIG_DIR}" | sed -e 's/-MariaDB//')"
+NEW_TAR="${NEW_DIR}.tar.gz"
+rm -Rf $ORIG_DIR
+tar zxf $ORIG_TAR
+mv $ORIG_DIR "${NEW_DIR}"
+tar zcf "${NEW_TAR}" "${NEW_DIR}/"
+echo "${NEW_TAR}" > ../bindistname.txt
+""")]))
+f_sol_64_bintar.addStep(SetProperty(
+ property="bindistname",
+ command=["cat", "bindistname.txt"],
+ ))
+f_sol_64_bintar.addStep(getPackageArchiveStep("%(distdirname)s/%(bindistname)s", "%(tarbuildnum)s"))
+f_sol_64_bintar.addStep(getMTR(
+ test_type="nm",
+ test_info="Normal run, no --ps-protocol",
+ command=["sh", "-c", WithProperties("cd %(distdirname)s/mysql-test && exec perl mysql-test-run.pl --force --retry=3 --skip-ndb --parallel=2")],
+ mtr_subdir=WithProperties("%(distdirname)s/mysql-test")))
+
+bld_opensolaris_511_bintar = {'name': 'opensolaris-511-bintar',
+ 'slavename': 'opensolaris-x86',
+ 'builddir': 'opensolaris-511-bintar',
+ 'factory': f_sol_64_bintar,
+ "category": "package",
+ }
+
c['builders'] = [bld_fulltest, bld_fulltest2, bld_work_valgrind,
bld_valgrind_32, bld_rel_amd64, bld_dbg_x86,
bld_makedist,
bld_archivist_amd64, bld_archivist_cnc,
- bld_mac_g5,
bld_dbg2_x86, bld_psergey_win32box, bld_win32_tarball,
bld_win32_zip,
bld_win32_rel_nmake, bld_mac_x86,
@@ -1205,6 +1278,7 @@
bld_kvm_deb_karmic_amd64, bld_kvm_deb_karmic_x86,
bld_kvm_deb_lucid_amd64, bld_kvm_deb_lucid_x86,
bld_kvm_bintar_hardy_amd64, bld_kvm_bintar_hardy_x86,
+ bld_opensolaris_511_bintar,
bld_opensolaris_511_x86,
bld_sol_sparc_32,
]
=== modified file 'buildbot/runvm'
--- a/buildbot/runvm 2010-01-14 13:43:51 +0000
+++ b/buildbot/runvm 2010-02-01 12:44:19 +0000
@@ -52,6 +52,19 @@
my $opt_baseimage= undef;
my @user_cmd_opt;
+# Disable host key checking for ssh.
+# This is a bit convoluted due to OpenSSH's slight security-paranoia.
+# Without this, we would get a login failure if using another VM image
+# (with different host key) on the same port, which is annoying.
+# An alternative would be to use CheckHostIP=no and HostKeyAlias=<img.qcow2>
+# to get ssh to check a different key for each image. But that would still
+# cause an error if re-generating an image (with new ssh host key), and it
+# doesn't really give any additional security.
+my @ssh_cmd_prefix= ($ssh_exec,
+ '-o', 'UserKnownHostsFile=/dev/null',
+ '-o', 'StrictHostKeyChecking=no',
+ '-o', 'LogLevel=ERROR');
+
my $image;
my $pidfile;
@@ -177,17 +190,6 @@
return $res;
}
-sub exec_guest_cmd {
- my ($cmd_and_args)= @_;
- if (scalar(@$cmd_and_args) == 0 || $cmd_and_args->[0] eq '')
- {
- # Otherwise ssh will do an interactive login, which we do not want.
- exit 0;
- }
-
- exec_guest_cmd $ssh_exec, '-p', $opt_port, @user_cmd_opt, 'localhost', @$cmd_and_args;
-}
-
sub is_port_used {
socket(SOCK, PF_INET, SOCK_STREAM, getprotobyname('tcp'))
or die "socket() failed: $!\n";
@@ -348,7 +350,7 @@
die "Fatal error: Cannot fork(): $!\n";
} elsif (!$pid) {
# Child.
- exec_with_print($ssh_exec, '-o', 'ConnectTimeout=4', '-p', $opt_port,
+ exec_with_print(@ssh_cmd_prefix, '-o', 'ConnectTimeout=4', '-p', $opt_port,
@user_cmd_opt, 'localhost',
'sudo', '/sbin/shutdown', '-h', 'now');
} else {
@@ -410,7 +412,7 @@
die "Fatal error: Cannot fork(): $!\n";
} elsif (!$pid) {
# Child.
- exec_with_print($ssh_exec, '-o', 'ConnectTimeout=4', '-p', $opt_port,
+ exec_with_print(@ssh_cmd_prefix, '-o', 'ConnectTimeout=4', '-p', $opt_port,
@user_cmd_opt, 'localhost', '/bin/true');
} else {
# Parent.
@@ -506,7 +508,7 @@
$res= system(substr($arg, 1));
} else {
print STDERR "+ $arg\n";
- $res= system($ssh_exec, '-p', $opt_port, @user_cmd_opt, 'localhost', $arg);
+ $res= system(@ssh_cmd_prefix, '-p', $opt_port, @user_cmd_opt, 'localhost', $arg);
}
if ($res < 0) {
print STDERR "Could not spawn command: $!\n";

[Maria-developers] Updated (by Alexi): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 04 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
-=-=(Alexi - Sat, 19 Dec 2009, 15:41)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.14545 2009-12-19 15:41:21.000000000 +0200
+++ /tmp/wklog.47.new.14545 2009-12-19 15:41:21.000000000 +0200
@@ -1,122 +1,107 @@
-First suggestion:
-
-> I think for this we would actually need a new binlog event type
-> (Comment_log_event?). Unless we want to log an empty statement Query_log_event
-> containing only a comment (a bit of a hack).
-
-New server option
-~~~~~~~~~~~~~~~~~
- --binlog-annotate-rows-events
-
-Setting this option makes RBR (rows-) events in the binary log to be
-preceded by Annotate rows events (see below). The corresponding
-'binlog_annotate_rows_events' system variable is dynamic and has both
-global and session values. Default global value is OFF.
-
-Note. Session values are usefull to make it possible to annotate only
- some selected statements:
+Content
+~~~~~~~
+ 1. Annotate_rows_log_event
+ 2. Server option: --binlog-annotate-rows-events
+ 3. Server option: --replicate-annotate-rows-events
+ 4. mysqlbinlog option: --print-annotate-rows-events
+ 5. mysqlbinlog output
+
+1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Describes the query which caused the corresponding rows events. Has empty
+post-header and contains the query text in its data part. Example:
+
+ ************************
+ ANNOTATE_RBR_EVENT
+ ************************
+ 00000220 | B6 A0 2C 4B | time_when = 1261215926
+ 00000224 | 33 | event_type = 51
+ 00000225 | 64 00 00 00 | server_id = 100
+ 00000229 | 36 00 00 00 | event_len = 54
+ 0000022D | 56 02 00 00 | log_pos = 00000256
+ 00000231 | 00 00 | flags = <none>
+ ------------------------
+ 00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
+ 00000237 | 52 54 20 49 |
+ 0000023B | 4E 54 4F 20 |
+ 0000023F | 74 31 20 56 |
+ 00000243 | 41 4C 55 45 |
+ 00000247 | 53 20 28 31 |
+ 0000024B | 29 2C 20 28 |
+ 0000024F | 32 29 2C 20 |
+ 00000253 | 28 33 29 |
+ ************************
+
+In binary log, Annotate_rows event follows the (possible) 'BEGIN' Query event
+and precedes the first of Table map events which accompany the corresponding
+rows events. (See example in the "mysqlbinlog output" section below.)
+
+2. Server option: --binlog-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the master to write Annotate_rows events to the binary log.
+
+ * Variable Name: binlog_annotate_rows_events
+ * Scope: Global & Session
+ * Access Type: Dynamic
+ * Data Type: bool
+ * Default Value: OFF
+NOTE. Session values allows to annotate only some selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
-New binlog event type
-~~~~~~~~~~~~~~~~~~~~~
- Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
-
-Describes the query which caused the corresponding rows event. In binary log,
-precedes each Table_map_log_event. Contains empty post-header and the query
-text in its data part.
-
-The numeric code for this event must be assigned carefully. It should be
-coordinated with MySQL/Sun, otherwise we can get into a situation where MySQL
-uses the same numeric code for one event that MariaDB uses for
-ANNOTATE_ROWS_EVENT, which would make merging the two impossible.
-
-Example:
-
- ...
- ************************
- ANNOTATE_ROWS_EVENT [51]
- ************************
- 000000C7 | 54 1B 12 4B | time_when = 1259477844
- 000000CB | 33 | event_type = 51
- 000000CC | 64 00 00 00 | server_id = 100
- 000000D0 | 2C 00 00 00 | event_len = 44
- 000000D4 | F3 00 00 00 | log_pos = 000000F3
- 000000D8 | 00 00 | flags = <none>
- ------------------------
- 000000DA | 69 6E 73 65 | query = "insert into t1 values (1)"
- 000000DE | 72 74 20 69 |
- 000000E2 | 6E 74 6F 20 |
- 000000E6 | 74 31 20 76 |
- 000000EA | 61 6C 75 65 |
- 000000EE | 73 20 28 31 |
- 000000F2 | 29 |
- ************************
- TABLE_MAP_EVENT [19]
- ************************
- 000000F3 | 54 1B 12 4B | time_when = 1259477844
- 000000F7 | 13 | event_type = 19
- 000000F8 | 64 00 00 00 | server_id = 100
- 000000FC | 29 00 00 00 | event_len = 41
- 00000100 | 1C 01 00 00 | log_pos = 0000011C
- 00000104 | 00 00 | flags = <none>
- ------------------------
- ...
- ************************
- WRITE_ROWS_EVENT [23]
- ************************
- 0000011C | 54 1B 12 4B | time_when = 1259477844
- 00000120 | 17 | event_type = 23
- 00000121 | 64 00 00 00 | server_id = 100
- 00000125 | 22 00 00 00 | event_len = 34
- 00000129 | 3E 01 00 00 | log_pos = 0000013E
- 0000012D | 10 00 | flags = LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
- ------------------------
- 0000012F | 0F 00 00 00 | table_id = 15
- ...
-
-New mysqlbinlog option
-~~~~~~~~~~~~~~~~~~~~~~
- --print-annotate-rows-events
-
-With this option, mysqlbinlog prints the content of Annotate-rows
-events (if the binary log does contain them). Without this option
-(i.e. by default), mysqlbinlog skips Annotate rows events.
-
-
-mysqlbinlog output
-~~~~~~~~~~~~~~~~~~
-Something like this:
+3. Server option: --replicate-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the slave to reproduce Annotate_rows events recieved from the master
+in its own binary log (sensible only in pair with log-slave-updates option).
+
+ * Variable Name: replicate_annotate_rows_events
+ * Scope: Global
+ * Access Type: Read only
+ * Data Type: bool
+ * Default Value: OFF
+
+NOTE. Why do we additionally need this 'replicate' option? Why not to make
+the slave to reproduce this events when its binlog-annotate-rows-events
+global value is ON? Well, because, for example, we may want to configure
+the slave which should reproduce Annotate_rows events but has global
+binlog-annotate-rows-events = OFF meaning this to be the default value for
+the client threads (see also "How slave treats replicate-annotate-rows-events
+option" in LLD part).
+
+4. mysqlbinlog option: --print-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With this option, mysqlbinlog prints the content of Annotate_rows events (if
+the binary log does contain them). Without this option (i.e. by default),
+mysqlbinlog skips Annotate_rows events.
+5. mysqlbinlog output
+~~~~~~~~~~~~~~~~~~~~~
+With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
+in a form like this:
...
- # at 199
- # at 243
- # at 284
- #091129 9:57:24 server id 100 end_log_pos 243 Query: `insert into t1 values
-(1)`
- #091129 9:57:24 server id 100 end_log_pos 284 Table_map: `test`.`t1` mapped
-to number 15
- #091129 9:57:24 server id 100 end_log_pos 318 Write_rows: table id 15
+ # at 1646
+ #091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
+exec_time=0 error_code=0
+ SET TIMESTAMP=1261215926/*!*/;
+ BEGIN
+ /*!*/;
+ # at 1714
+ # at 1812
+ # at 1853
+ # at 1894
+ # at 1938
+ #091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
+t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
+ #091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
+mapped to number 16
+ #091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
+mapped to number 17
+ #091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
+ #091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
-
- BINLOG '
- VBsSSzNkAAAALAAAAPMAAAAAAGluc2VydCBpbnRvIHQxIHZhbHVlcyAoMSk=
- VBsSSxNkAAAAKQAAABwBAAAAAA8AAAAAAAAABHRlc3QAAnQxAAEDAAE=
- VBsSSxdkAAAAIgAAAD4BAAAQAA8AAAAAAAEAAf/+AQAAAA==
- '/*!*/;
- ### INSERT INTO test.t1
- ### SET
- ### @1=1 /* INT meta=0 nullable=1 is_null=0 */
...
-When master sends Annotate rows events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. Master always sends Annotate_rows events to mysqlbinlog (in
- remote case).
-2. Master sends Annotate_rows events to a slave only if the slave has
- both log-slave-updates and binlog-annotate-rows-events options set.
-
-=-=(Bothorsen - Fri, 18 Dec 2009, 16:22)=-=-
Add estimation time.
Worked 5 hours and estimate 35 hours remain (original estimate increased by 5 hours).
-=-=(Bothorsen - Fri, 18 Dec 2009, 16:16)=-=-
This is the work done on this patch so far. Most of it done by Alex.
Worked 15 hours and estimate 035 hours remain (original estimate increased by 50 hours).
------------------------------------------------------------
-=-=(View All Progress Notes, 20 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in binlog (and show in mysqlbinlog output) texts of statements that
caused RBR events
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this is that the binlog can grow to approximately 2x in size
(especially inserts of big blobs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has an empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
Query event and precedes the first of the Table map events which accompany
the corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. The session value makes it possible to annotate only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only in combination with the
log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not have the
slave reproduce these events whenever its global
binlog-annotate-rows-events value is ON? Because we may, for example, want
to configure a slave that reproduces Annotate_rows events while keeping
global binlog-annotate-rows-events = OFF as the default value for the
client threads (see also "How slave treats replicate-annotate-rows-events
option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log contains any). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
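An invocation might look like this (the binlog file name is only an
example):
  mysqlbinlog --print-annotate-rows-events master-bin.000001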
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect binary log
compatibility.) Here is what the Format description event looks like with
this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is
allocated in the Log_event object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
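For example, for the ANNOTATE_ROWS_EVENT dump shown in the high-level
specification (event_len = 54, common_header_len = 19), this gives
m_query_len = 54 - 19 = 35, which is exactly the length of the query
"INSERT INTO t1 VALUES (1), (2), (3)".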
The events are written to the binary log by the Log_event::write() member
which calls virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonym in replication terminology).
In our case, data header is empty and data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the
COM_BINLOG_DUMP command, informs the master whether it should send these
events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
conflicts with MySQL/Sun.
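The slave-side counterpart is not spelled out here. A minimal sketch,
assuming the flag value from the earlier BINLOG_SEND_ANNOTATE_ROWS_EVENT
design and the usual simple_command() client call, might be:
  int request_dump(MYSQL* mysql, ...)
  { ...
    uchar options_buf[1];
    options_buf[0]= 0;
    if (opt_log_slave_updates &&
        mi->io_thd->variables.binlog_annotate_rows_events)
      options_buf[0]|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
    /* inform the master of the extended dump options first ... */
    simple_command(mysql, COM_BINLOG_DUMP_OPTIONS_EXT, options_buf, 1, 0);
    ...
    /* ... then request the dump as before */
    simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
    ...
  }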
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
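NOTE. With the COM_BINLOG_DUMP_OPTIONS_EXT scheme from section 5, the flags
tested here would presumably be taken from thd->binlog_dump_flags_ext rather
than from the COM_BINLOG_DUMP packet.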
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific for each type of the
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means to set the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and restored
when the Annotate_rows_log_event object is deleted.
Is this really needed?
After calling this do_apply_event() function we may not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be coordinated
somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)

[Maria-developers] Updated (by Alexi): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 04 Feb '10
by worklog-noreply@askmonty.org 04 Feb '10
04 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. I am not sure, but possibly current values of thd->query and
+thd->query_length should be saved before calling set_query() and to be
+restored on the Annotate_rows_log_event object deletion.
+Is it really needed ?
+
+After calling this do_apply_event() function we may not delete the
+Annotate_rows_log_event object immediatedly (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When the saved Annotate_rows object may be deleted? When all corresponding
+Rows events will be processed, i.e. before processing the first non-Rows
+event (note that Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion is adjusted with the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing new log event type should be coordinated
+somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers not to have possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggesting to reserve the
+ event number.
+
+Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag taking into account that MySQL/Sun may also introduce a flag with the
+same value to be used in the request_dump-mysql_binlog_send interface.
+But this is mainly the question of merging: if a conflict concerning this
+flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
+(this does not require additional changes in the code).
-=-=(Alexi - Sat, 19 Dec 2009, 15:41)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.14545 2009-12-19 15:41:21.000000000 +0200
+++ /tmp/wklog.47.new.14545 2009-12-19 15:41:21.000000000 +0200
@@ -1,122 +1,107 @@
-First suggestion:
-
-> I think for this we would actually need a new binlog event type
-> (Comment_log_event?). Unless we want to log an empty statement Query_log_event
-> containing only a comment (a bit of a hack).
-
-New server option
-~~~~~~~~~~~~~~~~~
- --binlog-annotate-rows-events
-
-Setting this option makes RBR (rows-) events in the binary log to be
-preceded by Annotate rows events (see below). The corresponding
-'binlog_annotate_rows_events' system variable is dynamic and has both
-global and session values. Default global value is OFF.
-
-Note. Session values are usefull to make it possible to annotate only
- some selected statements:
+Content
+~~~~~~~
+ 1. Annotate_rows_log_event
+ 2. Server option: --binlog-annotate-rows-events
+ 3. Server option: --replicate-annotate-rows-events
+ 4. mysqlbinlog option: --print-annotate-rows-events
+ 5. mysqlbinlog output
+
+1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Describes the query which caused the corresponding rows events. Has empty
+post-header and contains the query text in its data part. Example:
+
+ ************************
+ ANNOTATE_RBR_EVENT
+ ************************
+ 00000220 | B6 A0 2C 4B | time_when = 1261215926
+ 00000224 | 33 | event_type = 51
+ 00000225 | 64 00 00 00 | server_id = 100
+ 00000229 | 36 00 00 00 | event_len = 54
+ 0000022D | 56 02 00 00 | log_pos = 00000256
+ 00000231 | 00 00 | flags = <none>
+ ------------------------
+ 00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
+ 00000237 | 52 54 20 49 |
+ 0000023B | 4E 54 4F 20 |
+ 0000023F | 74 31 20 56 |
+ 00000243 | 41 4C 55 45 |
+ 00000247 | 53 20 28 31 |
+ 0000024B | 29 2C 20 28 |
+ 0000024F | 32 29 2C 20 |
+ 00000253 | 28 33 29 |
+ ************************
+
+In binary log, Annotate_rows event follows the (possible) 'BEGIN' Query event
+and precedes the first of Table map events which accompany the corresponding
+rows events. (See example in the "mysqlbinlog output" section below.)
+
+2. Server option: --binlog-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the master to write Annotate_rows events to the binary log.
+
+ * Variable Name: binlog_annotate_rows_events
+ * Scope: Global & Session
+ * Access Type: Dynamic
+ * Data Type: bool
+ * Default Value: OFF
+NOTE. Session values allows to annotate only some selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
-New binlog event type
-~~~~~~~~~~~~~~~~~~~~~
- Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
-
-Describes the query which caused the corresponding rows event. In binary log,
-precedes each Table_map_log_event. Contains empty post-header and the query
-text in its data part.
-
-The numeric code for this event must be assigned carefully. It should be
-coordinated with MySQL/Sun, otherwise we can get into a situation where MySQL
-uses the same numeric code for one event that MariaDB uses for
-ANNOTATE_ROWS_EVENT, which would make merging the two impossible.
-
-Example:
-
- ...
- ************************
- ANNOTATE_ROWS_EVENT [51]
- ************************
- 000000C7 | 54 1B 12 4B | time_when = 1259477844
- 000000CB | 33 | event_type = 51
- 000000CC | 64 00 00 00 | server_id = 100
- 000000D0 | 2C 00 00 00 | event_len = 44
- 000000D4 | F3 00 00 00 | log_pos = 000000F3
- 000000D8 | 00 00 | flags = <none>
- ------------------------
- 000000DA | 69 6E 73 65 | query = "insert into t1 values (1)"
- 000000DE | 72 74 20 69 |
- 000000E2 | 6E 74 6F 20 |
- 000000E6 | 74 31 20 76 |
- 000000EA | 61 6C 75 65 |
- 000000EE | 73 20 28 31 |
- 000000F2 | 29 |
- ************************
- TABLE_MAP_EVENT [19]
- ************************
- 000000F3 | 54 1B 12 4B | time_when = 1259477844
- 000000F7 | 13 | event_type = 19
- 000000F8 | 64 00 00 00 | server_id = 100
- 000000FC | 29 00 00 00 | event_len = 41
- 00000100 | 1C 01 00 00 | log_pos = 0000011C
- 00000104 | 00 00 | flags = <none>
- ------------------------
- ...
- ************************
- WRITE_ROWS_EVENT [23]
- ************************
- 0000011C | 54 1B 12 4B | time_when = 1259477844
- 00000120 | 17 | event_type = 23
- 00000121 | 64 00 00 00 | server_id = 100
- 00000125 | 22 00 00 00 | event_len = 34
- 00000129 | 3E 01 00 00 | log_pos = 0000013E
- 0000012D | 10 00 | flags = LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
- ------------------------
- 0000012F | 0F 00 00 00 | table_id = 15
- ...
-
-New mysqlbinlog option
-~~~~~~~~~~~~~~~~~~~~~~
- --print-annotate-rows-events
-
-With this option, mysqlbinlog prints the content of Annotate-rows
-events (if the binary log does contain them). Without this option
-(i.e. by default), mysqlbinlog skips Annotate rows events.
-
-
-mysqlbinlog output
-~~~~~~~~~~~~~~~~~~
-Something like this:
+3. Server option: --replicate-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the slave to reproduce Annotate_rows events recieved from the master
+in its own binary log (sensible only in pair with log-slave-updates option).
+
+ * Variable Name: replicate_annotate_rows_events
+ * Scope: Global
+ * Access Type: Read only
+ * Data Type: bool
+ * Default Value: OFF
+
+NOTE. Why do we additionally need this 'replicate' option? Why not to make
+the slave to reproduce this events when its binlog-annotate-rows-events
+global value is ON? Well, because, for example, we may want to configure
+the slave which should reproduce Annotate_rows events but has global
+binlog-annotate-rows-events = OFF meaning this to be the default value for
+the client threads (see also "How slave treats replicate-annotate-rows-events
+option" in LLD part).
+
+4. mysqlbinlog option: --print-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With this option, mysqlbinlog prints the content of Annotate_rows events (if
+the binary log does contain them). Without this option (i.e. by default),
+mysqlbinlog skips Annotate_rows events.
+5. mysqlbinlog output
+~~~~~~~~~~~~~~~~~~~~~
+With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
+in a form like this:
...
- # at 199
- # at 243
- # at 284
- #091129 9:57:24 server id 100 end_log_pos 243 Query: `insert into t1 values
-(1)`
- #091129 9:57:24 server id 100 end_log_pos 284 Table_map: `test`.`t1` mapped
-to number 15
- #091129 9:57:24 server id 100 end_log_pos 318 Write_rows: table id 15
+ # at 1646
+ #091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
+exec_time=0 error_code=0
+ SET TIMESTAMP=1261215926/*!*/;
+ BEGIN
+ /*!*/;
+ # at 1714
+ # at 1812
+ # at 1853
+ # at 1894
+ # at 1938
+ #091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
+t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
+ #091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
+mapped to number 16
+ #091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
+mapped to number 17
+ #091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
+ #091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
-
- BINLOG '
- VBsSSzNkAAAALAAAAPMAAAAAAGluc2VydCBpbnRvIHQxIHZhbHVlcyAoMSk=
- VBsSSxNkAAAAKQAAABwBAAAAAA8AAAAAAAAABHRlc3QAAnQxAAEDAAE=
- VBsSSxdkAAAAIgAAAD4BAAAQAA8AAAAAAAEAAf/+AQAAAA==
- '/*!*/;
- ### INSERT INTO test.t1
- ### SET
- ### @1=1 /* INT meta=0 nullable=1 is_null=0 */
...
-When master sends Annotate rows events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. Master always sends Annotate_rows events to mysqlbinlog (in
- remote case).
-2. Master sends Annotate_rows events to a slave only if the slave has
- both log-slave-updates and binlog-annotate-rows-events options set.
-
-=-=(Bothorsen - Fri, 18 Dec 2009, 16:22)=-=-
Add estimation time.
Worked 5 hours and estimate 35 hours remain (original estimate increased by 5 hours).
-=-=(Bothorsen - Fri, 18 Dec 2009, 16:16)=-=-
This is the work done on this patch so far. Most of it done by Alex.
Worked 15 hours and estimate 035 hours remain (original estimate increased by 50 hours).
------------------------------------------------------------
-=-=(View All Progress Notes, 20 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in the binlog (and show in mysqlbinlog output) the texts of statements
that caused RBR events.
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost of this can be that the binlog will be approximately 2x in size
(especially inserts of big BLOBs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. Has an empty
post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, an Annotate_rows event follows the (possible) 'BEGIN' Query
event and precedes the first of the Table map events which accompany the
corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. Session values allow annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only in combination with the
log-slave-updates option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make the
slave reproduce these events whenever its global binlog-annotate-rows-events
value is ON? Because, for example, we may want to configure a slave that
reproduces Annotate_rows events while keeping global
binlog-annotate-rows-events = OFF as the default value for the client threads
(see also "How slave treats replicate-annotate-rows-events option" in the
LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log contains them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
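For example, the local and the remote forms of such a run would look like
this (the binlog file name and host are illustrative):

  mysqlbinlog --print-annotate-rows-events master-bin.000001
  mysqlbinlog --print-annotate-rows-events --read-from-remote-server \
              --host=master.example.com master-bin.000001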
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1 exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1` mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2` mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17 flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number
(numbers 27 through 50 remain reserved for future MySQL events):
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect the
compatibility of the binary log.) Here is what the Format description event
looks like with this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for brevity, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is
allocated in the Log_event object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member
which calls the virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonyms in replication terminology).
In our case, the data header is empty and the data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
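For orientation, the pieces above would assemble roughly into the following
class (a sketch only; the Log_event base-class details are simplified):

  class Annotate_rows_log_event: public Log_event
  {
  public:
    Annotate_rows_log_event(THD *thd);          // when writing the binlog
    Annotate_rows_log_event(char *buf, uint event_len,  // when reading it
                            Format_description_log_event *desc);
    Log_event_type get_type_code() { return ANNOTATE_ROWS_EVENT; }
    bool write_data_body(IO_CACHE *file);
    void print(FILE *file, PRINT_EVENT_INFO *pinfo);
    int do_apply_event(...);                    // see section 7
  private:
    char *m_query_txt;
    uint m_query_len;
  };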
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once, before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option simply supplies the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This value is set during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to that slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the COM_BINLOG_DUMP command, informs the master whether it should send
these events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command instead of reusing COM_BINLOG_DUMP to avoid
possible conflicts with MySQL/Sun.
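A minimal client-side sketch of issuing the new command (the
BINLOG_DUMP_ANNOTATE flag bit and its placement inside request_dump() are
illustrative assumptions, not part of the patch):

  #define BINLOG_DUMP_ANNOTATE 1  /* hypothetical bit in the options byte */

  int request_dump(MYSQL* mysql, ...)
  { ...
    uchar options_ext[1]= { 0 };
    if (mi->io_thd->variables.binlog_annotate_rows_events)
      options_ext[0]|= BINLOG_DUMP_ANNOTATE;
    /* tell the master which optional events we want to receive */
    simple_command(mysql, COM_BINLOG_DUMP_OPTIONS_EXT, options_ext, 1, 0);
    ...
    simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
    ...
  }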
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
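Note. The condition above still tests the BINLOG_SEND_ANNOTATE_ROWS_EVENT
flag taken from the COM_BINLOG_DUMP packet; with COM_BINLOG_DUMP_OPTIONS_EXT
(section 5) the master would presumably consult the saved
thd->binlog_dump_flags_ext instead, along the lines of:

  if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
      thd->binlog_dump_flags_ext & BINLOG_DUMP_ANNOTATE)

(BINLOG_DUMP_ANNOTATE being the hypothetical bit from the sketch in
section 5).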
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific to each type of
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means setting the
current thd query to the one described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. I am not sure, but possibly the current values of thd->query and
thd->query_length should be saved before calling set_query() and
restored when the Annotate_rows_log_event object is deleted.
Is this really needed?
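If it is, one possible shape would be the following (a sketch only; the
m_save_* members are assumptions, not part of the design):

  int Annotate_rows_log_event::do_apply_event(...)
  {
    /* hypothetical members remembering the previous query */
    m_save_query_txt= thd->query();
    m_save_query_len= thd->query_length();
    thd->set_query(m_query_txt, m_query_len);
    return 0;
  }

with the saved values restored in free_annotate_event() (introduced below)
just before the object is deleted.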
After calling this do_apply_event() function we may not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We can keep a pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted once all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be coordinated
somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers not to have possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
1
0

[Maria-developers] Updated (by Alexi): Store in binlog text of statements that caused RBR events (47)
by worklog-noreply@askmonty.org 04 Feb '10
by worklog-noreply@askmonty.org 04 Feb '10
04 Feb '10
-----------------------------------------------------------------------
WORKLOG TASK
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
TASK...........: Store in binlog text of statements that caused RBR events
CREATION DATE..: Sat, 15 Aug 2009, 23:48
SUPERVISOR.....: Monty
IMPLEMENTOR....:
COPIES TO......: Knielsen
CATEGORY.......: Server-RawIdeaBin
TASK ID........: 47 (http://askmonty.org/worklog/?tid=47)
VERSION........: Server-9.x
STATUS.........: Un-Assigned
PRIORITY.......: 60
WORKED HOURS...: 20
ESTIMATE.......: 35 (hours remain)
ORIG. ESTIMATE.: 35
PROGRESS NOTES:
-=-=(Alexi - Thu, 04 Feb 2010, 09:54)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16174 2010-02-04 09:54:13.000000000 +0200
+++ /tmp/wklog.47.new.16174 2010-02-04 09:54:13.000000000 +0200
@@ -171,35 +171,20 @@
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When requesting an event, the slave should inform the master whether
-it should send Annotate_rows events or not. To that end we add a new
-BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+If the replicate-annotate-rows-events option is not set on a slave, there
+is no need for master to send Annotate_rows events to this slave. The slave
+(or mysqlbinlog in remote case), before requesting binlog dump via the
+COM_BINLOG_DUMP command, informs the master whether it should send these
+events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT server
+command:
+
+ case COM_BINLOG_DUMP_OPTIONS_EXT:
+ thd->binlog_dump_flags_ext= packet[0];
+ my_ok(thd);
+ break;
- #define BINLOG_DUMP_NON_BLOCK 1
- #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
-
- pthread_handler_t handle_slave_io(void *arg)
- { ...
- request_dump(mysql, ...);
- ...
- }
-
- int request_dump(MYSQL* mysql, ...)
- { ...
- if (opt_log_slave_updates &&
- mi->io_thd->variables.binlog_annotate_rows_events)
- binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
- ...
- int2store(buf + 4, binlog_flags);
- ...
- simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
- ...
- }
-
-NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
-simple_command() function, should also use this flag if it wants (in case
-of the --print-annotate-rows-events option set) to recieve Annotate_rows
-events.
+Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
+conflicts with MySQL/Sun.
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -338,10 +323,4 @@
to internals(a)lists.mysql.com about this and suggesting to reserve the
event number.
-Also we should notice the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
-flag taking into account that MySQL/Sun may also introduce a flag with the
-same value to be used in the request_dump-mysql_binlog_send interface.
-But this is mainly the question of merging: if a conflict concerning this
-flag occur, we may simply change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value
-(this does not require additional changes in the code).
-=-=(Alexi - Sun, 20 Dec 2009, 16:00)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.19667 2009-12-20 14:00:56.000000000 +0000
+++ /tmp/wklog.47.new.19667 2009-12-20 14:00:56.000000000 +0000
@@ -196,6 +196,11 @@
...
}
+NOTE. mysqlbinlog, when remotely requesting BINLOG_DUMP by calling the
+simple_command() function, should also use this flag if it wants (in case
+of the --print-annotate-rows-events option set) to recieve Annotate_rows
+events.
+
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -212,8 +217,7 @@
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
- flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
- thd->server_id == 0 /* slave == mysqlbinlog */ )
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
-=-=(Alexi - Sun, 20 Dec 2009, 13:14)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.11350 2009-12-20 13:14:04.000000000 +0200
+++ /tmp/wklog.47.new.11350 2009-12-20 13:14:04.000000000 +0200
@@ -282,23 +282,18 @@
Annotate_rows_log_event* m_annotate_event;
};
-When the saved Annotate_rows object may be deleted? When all corresponding
-Rows events will be processed, i.e. before processing the first non-Rows
-event (note that Annotate_rows object resides in the binary log *after*
-the (possible) 'BEGIN' Query event which accompanies the rows events; note
-also that this deletion is adjusted with the case when some or all
-corresponding Rows events are filtered out by replicate filter rules):
+The saved Annotate_rows object should be deleted when all corresponding
+Rows events will be processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
- if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
- rli->free_annotate_event();
-
apply_event_and_update_pos(ev, ...);
- if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ if (rli->get_annotate_event() && is_last_rows_event(ev))
+ rli->free_annotate_event();
+ else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
@@ -307,10 +302,21 @@
where
- #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
- (type) == WRITE_ROWS_EVENT || \
+ bool is_last_rows_event(Log_event* ev)
+ {
+ Log_event_type type= ev->get_type_code();
+ if (IS_ROWS_EVENT_TYPE(type))
+ {
+ Rows_log_event* rows= (Rows_log_event*)ev;
+ return rows->get_flags(Rows_log_event::STMT_END_F);
+ }
+
+ return 0;
+ }
+
+ #define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
- (type) == DELETE_ROWS_EVENT )
+ (type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
-=-=(Alexi - Sun, 20 Dec 2009, 09:29)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.32726 2009-12-20 07:29:56.000000000 +0000
+++ /tmp/wklog.47.new.32726 2009-12-20 07:29:56.000000000 +0000
@@ -56,7 +56,7 @@
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
- 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ 00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
2. Outline of Annotate_rows event behavior
-=-=(Alexi - Sat, 19 Dec 2009, 16:10)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.16051 2009-12-19 16:10:48.000000000 +0200
+++ /tmp/wklog.47.new.16051 2009-12-19 16:10:48.000000000 +0200
@@ -253,7 +253,7 @@
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
-Rows events is applied):
+Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
-=-=(Alexi - Sat, 19 Dec 2009, 16:02)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.15695 2009-12-19 16:02:33.000000000 +0200
+++ /tmp/wklog.47.new.15695 2009-12-19 16:02:33.000000000 +0200
@@ -12,7 +12,7 @@
post-header and contains the query text in its data part. Example:
************************
- ANNOTATE_RBR_EVENT
+ ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
-=-=(Alexi - Sat, 19 Dec 2009, 15:58)=-=-
Low Level Design modified.
--- /tmp/wklog.47.old.15437 2009-12-19 13:58:12.000000000 +0000
+++ /tmp/wklog.47.new.15437 2009-12-19 13:58:12.000000000 +0000
@@ -1 +1,337 @@
+Content
+~~~~~~~
+ 1. Annotate_rows event number
+ 2. Outline of Annotate_rows event behavior
+ 3. How Master writes Annotate_rows events to the binary log
+ 4. How slave treats replicate-annotate-rows-events option
+ 5. How slave IO thread requests Annotate_rows events
+ 6. How master executes the request
+ 7. How slave SQL thread processes Annotate_rows events
+ 8. General remarks
+
+1. Annotate_rows event number
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+To avoid possible event numbers conflict with MySQL/Sun, we leave a gap
+between the last MySQL event number and the Annotate_rows event number:
+
+ enum Log_event_type
+ { ...
+ INCIDENT_EVENT= 26,
+ // New MySQL event numbers are to be added here
+ MYSQL_EVENTS_END,
+
+ MARIA_EVENTS_BEGIN= 51,
+ // New Maria event numbers start from here
+ ANNOTATE_ROWS_EVENT= 51,
+
+ ENUM_END_EVENT
+ };
+
+together with the corresponding extension of 'post_header_len' array in the
+Format description event. (This extension does not affect the compatibility
+of the binary log). Here is how Format description event looks like with
+this extension:
+
+ ************************
+ FORMAT_DESCRIPTION_EVENT
+ ************************
+ 00000004 | A1 A0 2C 4B | time_when = 1261215905
+ 00000008 | 0F | event_type = 15
+ 00000009 | 64 00 00 00 | server_id = 100
+ 0000000D | 7F 00 00 00 | event_len = 127
+ 00000011 | 83 00 00 00 | log_pos = 00000083
+ 00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
+ ------------------------
+ 00000017 | 04 00 | binlog_ver = 4
+ 00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
+ ..... ...
+ 0000004B | A1 A0 2C 4B | time_created = 1261215905
+ 0000004F | 13 | common_header_len = 19
+ ------------------------
+ post_header_len
+ ------------------------
+ 00000050 | 38 | 56 - START_EVENT_V3 [1]
+ ..... ...
+ 00000069 | 02 | 2 - INCIDENT_EVENT [26]
+ 0000006A | 00 | 0 - RESERVED [27]
+ ..... ...
+ 00000081 | 00 | 0 - RESERVED [50]
+ 00000082 | 00 | 0 - ANNOTATE_RBR_EVENT [51]
+ ************************
+
+2. Outline of Annotate_rows event behavior
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Each Annotate_rows_log_event object has two private members describing the
+corresponding query:
+
+ char *m_query_txt;
+ uint m_query_len;
+
+When the object is created for writing to a binary log, this query is taken
+from 'thd' (for short, below we omit the 'Annotate_rows_log_event::' prefix
+as well as other implementation details):
+
+ Annotate_rows_log_event(THD *thd)
+ {
+ m_query_txt = thd->query();
+ m_query_len = thd->query_length();
+ }
+
+When the object is read from a binary log, the query is taken from the buffer
+containing the binary log representation of the event (this buffer is allocated
+in Log_event object from which all Log events are derived):
+
+ Annotate_rows_log_event(char *buf, uint event_len,
+ Format_description_log_event *desc)
+ {
+ m_query_len = event_len - desc->common_header_len;
+ m_query_txt = buf + desc->common_header_len;
+ }
+
+The events are written to the binary log by the Log_event::write() member
+which calls virtual write_data_header() and write_data_body() members
+("data header" and "post header" are synonym in replication terminology).
+In our case, data header is empty and data body is just the query:
+
+ bool write_data_body(IO_CACHE *file)
+ {
+ return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
+ }
+
+Printing the event is just printing the query:
+
+ void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
+ {
+ my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
+ }
+
+3. How Master writes Annotate_rows events to the binary log
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The event is written to the binary log just before the group of Table_map
+events which precede corresponding Rows events (one query may generate
+several Table map events in the binary log, but the corresponding
+Annotate_rows event must be written only once before the first Table map
+event; hence the boolean variable 'with_annotate' below):
+
+ int write_locked_table_maps(THD *thd)
+ { ...
+ bool with_annotate= thd->variables.binlog_annotate_rows_events;
+ ...
+ for (uint i= 0; i < ... <number of tables> ...; ++i)
+ { ...
+ thd->binlog_write_table_map(table, ..., with_annotate);
+ with_annotate= 0; // write Annotate_event not more than once
+ ...
+ }
+ ...
+ }
+
+ int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
+ { ...
+ Table_map_log_event the_event(...);
+ ...
+ if (with_annotate)
+ {
+ Annotate_rows_log_event anno(this);
+ mysql_bin_log.write(&anno);
+ }
+
+ mysql_bin_log.write(&the_event);
+ ...
+ }
+
+4. How slave treats replicate-annotate-rows-events option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The replicate-annotate-rows-events option is treated just as the session
+value of the binlog_annotate_rows_events variable for the slave IO and
+SQL threads. This setting is done during initialization of these threads:
+
+ pthread_handler_t handle_slave_io(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_IO);
+ ...
+ }
+
+ pthread_handler_t handle_slave_sql(void *arg)
+ {
+ THD *thd= new THD;
+ ...
+ init_slave_thread(thd, SLAVE_THD_SQL);
+ ...
+ }
+
+ int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
+ { ...
+ thd->variables.binlog_annotate_rows_events=
+ opt_replicate_annotate_rows_events;
+ ...
+ }
+
+5. How slave IO thread requests Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When requesting an event, the slave should inform the master whether
+it should send Annotate_rows events or not. To that end we add a new
+BINLOG_SEND_ANNOTATE_ROWS_EVENT flag used when requesting an event:
+
+ #define BINLOG_DUMP_NON_BLOCK 1
+ #define BINLOG_SEND_ANNOTATE_ROWS_EVENT 2
+
+ pthread_handler_t handle_slave_io(void *arg)
+ { ...
+ request_dump(mysql, ...);
+ ...
+ }
+
+ int request_dump(MYSQL* mysql, ...)
+ { ...
+ if (opt_log_slave_updates &&
+ mi->io_thd->variables.binlog_annotate_rows_events)
+ binlog_flags|= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
+ ...
+ int2store(buf + 4, binlog_flags);
+ ...
+ simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
+ ...
+ }
+
+6. How master executes the request
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ case COM_BINLOG_DUMP:
+ { ...
+ flags= uint2korr(packet + 4);
+ ...
+ mysql_binlog_send(thd, ..., flags);
+ ...
+ }
+
+ void mysql_binlog_send(THD* thd, ..., ushort flags)
+ { ...
+ Log_event::read_log_event(&log, packet, ...);
+ ...
+ if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
+ flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT ||
+ thd->server_id == 0 /* slave == mysqlbinlog */ )
+ {
+ my_net_write(net, packet->ptr(), packet->length());
+ }
+ ...
+ }
+
+7. How slave SQL thread processes Annotate_rows events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The slave processes each recieved event by "applying" it, i.e. by
+calling the Log_event::apply_event() function which in turn calls
+the virtual do_apply_event() member specific for each type of the
+event.
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev = next_event(rli);
+ ...
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+ int apply_event_and_update_pos(Log_event *ev, ...)
+ { ...
+ ev->apply_event(...);
+ ...
+ }
+
+ int Log_event::apply_event(...)
+ {
+ return do_apply_event(...);
+ }
+
+What does it mean to "apply" an Annotate_rows event? It means to set current
+thd query to that of the described by the event, i.e. to the query which
+caused the subsequent Rows events (see "How Master writes Annotate_rows
+events to the binary log" to follow what happens further when the subsequent
+Rows events is applied):
+
+ int Annotate_rows_log_event::do_apply_event(...)
+ {
+ thd->set_query(m_query_txt, m_query_len);
+ }
+
+NOTE. It may be necessary to save the current values of thd->query and
+thd->query_length before calling set_query() and to restore them when
+the Annotate_rows_log_event object is deleted. Whether this is really
+needed is still an open question.
+
+After calling this do_apply_event() function we must not delete the
+Annotate_rows_log_event object immediately (see exec_relay_log_event()
+above) because thd->query now points to the string inside this object.
+We may keep the pointer to this object in the Relay_log_info:
+
+ class Relay_log_info
+ {
+ public:
+ ...
+ void set_annotate_event(Annotate_rows_log_event*);
+ Annotate_rows_log_event* get_annotate_event();
+ void free_annotate_event();
+ ...
+ private:
+ Annotate_rows_log_event* m_annotate_event;
+ };
+
+When may the saved Annotate_rows object be deleted? When all corresponding
+Rows events have been processed, i.e. before processing the first non-Rows
+event (note that the Annotate_rows object resides in the binary log *after*
+the (possible) 'BEGIN' Query event which accompanies the rows events; note
+also that this deletion scheme covers the case when some or all
+corresponding Rows events are filtered out by replicate filter rules):
+
+ int exec_relay_log_event(THD* thd, Relay_log_info* rli)
+ { ...
+ Log_event *ev= next_event(rli);
+ ...
+ if (rli->get_annotate_event() && !IS_RBR_EVENT_TYPE(ev->get_type_code()))
+ rli->free_annotate_event();
+
+ apply_event_and_update_pos(ev, ...);
+
+ if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
+ rli->set_annotate_event((Annotate_rows_log_event*) ev);
+ else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
+ delete ev;
+ ...
+ }
+
+where
+
+ #define IS_RBR_EVENT_TYPE(type) ( (type) == TABLE_MAP_EVENT || \
+ (type) == WRITE_ROWS_EVENT || \
+ (type) == UPDATE_ROWS_EVENT || \
+ (type) == DELETE_ROWS_EVENT )
+
+8. General remarks
+~~~~~~~~~~~~~~~~~~
+Kristian noticed that introducing a new log event type should be
+coordinated somehow with MySQL/Sun:
+
+ Kristian: The numeric code for this event must be assigned carefully.
+ It should be coordinated with MySQL/Sun, otherwise we can get into a
+ situation where MySQL uses the same numeric code for one event that
+ MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
+ impossible.
+ Alex: I reserved about 20 numbers to avoid possible conflicts
+ with MySQL.
+ Kristian: Still, I think it would be appropriate to send a polite email
+ to internals(a)lists.mysql.com about this and suggest reserving the
+ event number.
+
+The same applies to the introduction of the BINLOG_SEND_ANNOTATE_ROWS_EVENT
+flag: MySQL/Sun may also introduce a flag with the same value to be used
+in the request_dump-mysql_binlog_send interface. But this is mainly a
+merging question: if a conflict concerning this flag occurs, we can simply
+change the BINLOG_SEND_ANNOTATE_ROWS_EVENT value (this does not require
+additional changes in the code).
-=-=(Alexi - Sat, 19 Dec 2009, 15:41)=-=-
High-Level Specification modified.
--- /tmp/wklog.47.old.14545 2009-12-19 15:41:21.000000000 +0200
+++ /tmp/wklog.47.new.14545 2009-12-19 15:41:21.000000000 +0200
@@ -1,122 +1,107 @@
-First suggestion:
-
-> I think for this we would actually need a new binlog event type
-> (Comment_log_event?). Unless we want to log an empty statement Query_log_event
-> containing only a comment (a bit of a hack).
-
-New server option
-~~~~~~~~~~~~~~~~~
- --binlog-annotate-rows-events
-
-Setting this option makes RBR (rows-) events in the binary log to be
-preceded by Annotate rows events (see below). The corresponding
-'binlog_annotate_rows_events' system variable is dynamic and has both
-global and session values. Default global value is OFF.
-
-Note. Session values are usefull to make it possible to annotate only
- some selected statements:
+Content
+~~~~~~~
+ 1. Annotate_rows_log_event
+ 2. Server option: --binlog-annotate-rows-events
+ 3. Server option: --replicate-annotate-rows-events
+ 4. mysqlbinlog option: --print-annotate-rows-events
+ 5. mysqlbinlog output
+
+1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Describes the query which caused the corresponding rows events. It has an
+empty post-header and contains the query text in its data part. Example:
+
+ ************************
+ ANNOTATE_ROWS_EVENT
+ ************************
+ 00000220 | B6 A0 2C 4B | time_when = 1261215926
+ 00000224 | 33 | event_type = 51
+ 00000225 | 64 00 00 00 | server_id = 100
+ 00000229 | 36 00 00 00 | event_len = 54
+ 0000022D | 56 02 00 00 | log_pos = 00000256
+ 00000231 | 00 00 | flags = <none>
+ ------------------------
+ 00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
+ 00000237 | 52 54 20 49 |
+ 0000023B | 4E 54 4F 20 |
+ 0000023F | 74 31 20 56 |
+ 00000243 | 41 4C 55 45 |
+ 00000247 | 53 20 28 31 |
+ 0000024B | 29 2C 20 28 |
+ 0000024F | 32 29 2C 20 |
+ 00000253 | 28 33 29 |
+ ************************
+
+In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
+Query event and precedes the first of the Table map events which accompany
+the corresponding rows events. (See the example in the "mysqlbinlog output"
+section below.)
+
+2. Server option: --binlog-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the master to write Annotate_rows events to the binary log.
+
+ * Variable Name: binlog_annotate_rows_events
+ * Scope: Global & Session
+ * Access Type: Dynamic
+ * Data Type: bool
+ * Default Value: OFF
+NOTE. Session values allow annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
-New binlog event type
-~~~~~~~~~~~~~~~~~~~~~
- Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
-
-Describes the query which caused the corresponding rows event. In binary log,
-precedes each Table_map_log_event. Contains empty post-header and the query
-text in its data part.
-
-The numeric code for this event must be assigned carefully. It should be
-coordinated with MySQL/Sun, otherwise we can get into a situation where MySQL
-uses the same numeric code for one event that MariaDB uses for
-ANNOTATE_ROWS_EVENT, which would make merging the two impossible.
-
-Example:
-
- ...
- ************************
- ANNOTATE_ROWS_EVENT [51]
- ************************
- 000000C7 | 54 1B 12 4B | time_when = 1259477844
- 000000CB | 33 | event_type = 51
- 000000CC | 64 00 00 00 | server_id = 100
- 000000D0 | 2C 00 00 00 | event_len = 44
- 000000D4 | F3 00 00 00 | log_pos = 000000F3
- 000000D8 | 00 00 | flags = <none>
- ------------------------
- 000000DA | 69 6E 73 65 | query = "insert into t1 values (1)"
- 000000DE | 72 74 20 69 |
- 000000E2 | 6E 74 6F 20 |
- 000000E6 | 74 31 20 76 |
- 000000EA | 61 6C 75 65 |
- 000000EE | 73 20 28 31 |
- 000000F2 | 29 |
- ************************
- TABLE_MAP_EVENT [19]
- ************************
- 000000F3 | 54 1B 12 4B | time_when = 1259477844
- 000000F7 | 13 | event_type = 19
- 000000F8 | 64 00 00 00 | server_id = 100
- 000000FC | 29 00 00 00 | event_len = 41
- 00000100 | 1C 01 00 00 | log_pos = 0000011C
- 00000104 | 00 00 | flags = <none>
- ------------------------
- ...
- ************************
- WRITE_ROWS_EVENT [23]
- ************************
- 0000011C | 54 1B 12 4B | time_when = 1259477844
- 00000120 | 17 | event_type = 23
- 00000121 | 64 00 00 00 | server_id = 100
- 00000125 | 22 00 00 00 | event_len = 34
- 00000129 | 3E 01 00 00 | log_pos = 0000013E
- 0000012D | 10 00 | flags = LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
- ------------------------
- 0000012F | 0F 00 00 00 | table_id = 15
- ...
-
-New mysqlbinlog option
-~~~~~~~~~~~~~~~~~~~~~~
- --print-annotate-rows-events
-
-With this option, mysqlbinlog prints the content of Annotate-rows
-events (if the binary log does contain them). Without this option
-(i.e. by default), mysqlbinlog skips Annotate rows events.
-
-
-mysqlbinlog output
-~~~~~~~~~~~~~~~~~~
-Something like this:
+3. Server option: --replicate-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tells the slave to reproduce Annotate_rows events received from the master
+in its own binary log (sensible only together with the log-slave-updates
+option).
+
+ * Variable Name: replicate_annotate_rows_events
+ * Scope: Global
+ * Access Type: Read only
+ * Data Type: bool
+ * Default Value: OFF
+
+NOTE. Why do we additionally need this 'replicate' option? Why not make
+the slave reproduce these events whenever its binlog-annotate-rows-events
+global value is ON? Because, for example, we may want to configure a slave
+which reproduces Annotate_rows events but has the global
+binlog-annotate-rows-events = OFF, that being the default value for the
+client threads (see also "How slave treats replicate-annotate-rows-events
+option" in the LLD part).
+
+4. mysqlbinlog option: --print-annotate-rows-events
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+With this option, mysqlbinlog prints the content of Annotate_rows events (if
+the binary log does contain them). Without this option (i.e. by default),
+mysqlbinlog skips Annotate_rows events.
+5. mysqlbinlog output
+~~~~~~~~~~~~~~~~~~~~~
+With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
+in a form like this:
...
- # at 199
- # at 243
- # at 284
- #091129 9:57:24 server id 100 end_log_pos 243 Query: `insert into t1 values
-(1)`
- #091129 9:57:24 server id 100 end_log_pos 284 Table_map: `test`.`t1` mapped
-to number 15
- #091129 9:57:24 server id 100 end_log_pos 318 Write_rows: table id 15
+ # at 1646
+ #091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
+exec_time=0 error_code=0
+ SET TIMESTAMP=1261215926/*!*/;
+ BEGIN
+ /*!*/;
+ # at 1714
+ # at 1812
+ # at 1853
+ # at 1894
+ # at 1938
+ #091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
+t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
+ #091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
+mapped to number 16
+ #091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
+mapped to number 17
+ #091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
+ #091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
-
- BINLOG '
- VBsSSzNkAAAALAAAAPMAAAAAAGluc2VydCBpbnRvIHQxIHZhbHVlcyAoMSk=
- VBsSSxNkAAAAKQAAABwBAAAAAA8AAAAAAAAABHRlc3QAAnQxAAEDAAE=
- VBsSSxdkAAAAIgAAAD4BAAAQAA8AAAAAAAEAAf/+AQAAAA==
- '/*!*/;
- ### INSERT INTO test.t1
- ### SET
- ### @1=1 /* INT meta=0 nullable=1 is_null=0 */
...
-When master sends Annotate rows events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-1. Master always sends Annotate_rows events to mysqlbinlog (in
- remote case).
-2. Master sends Annotate_rows events to a slave only if the slave has
- both log-slave-updates and binlog-annotate-rows-events options set.
-
-=-=(Bothorsen - Fri, 18 Dec 2009, 16:22)=-=-
Add estimation time.
Worked 5 hours and estimate 35 hours remain (original estimate increased by 5 hours).
-=-=(Bothorsen - Fri, 18 Dec 2009, 16:16)=-=-
This is the work done on this patch so far. Most of it was done by Alex.
Worked 15 hours and estimate 35 hours remain (original estimate increased by 50 hours).
------------------------------------------------------------
-=-=(View All Progress Notes, 20 total)=-=-
http://askmonty.org/worklog/index.pl?tid=47&nolimit=1
DESCRIPTION:
Store in the binlog (and show in mysqlbinlog output) the texts of statements
that caused RBR events.
This is needed for (list from Monty):
- Easier to understand why updates happened
- Would make it easier to find out where in the application things went
wrong (as you can search for exact strings)
- Allow one to filter things based on comments in the statement.
The cost is that the binlog may grow to approximately 2x in size
(especially inserts of big BLOBs would be a bit painful), so this should
be an optional feature.
HIGH-LEVEL SPECIFICATION:
Content
~~~~~~~
1. Annotate_rows_log_event
2. Server option: --binlog-annotate-rows-events
3. Server option: --replicate-annotate-rows-events
4. mysqlbinlog option: --print-annotate-rows-events
5. mysqlbinlog output
1. Annotate_rows_log_event [ ANNOTATE_ROWS_EVENT ]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Describes the query which caused the corresponding rows events. It has an
empty post-header and contains the query text in its data part. Example:
************************
ANNOTATE_ROWS_EVENT
************************
00000220 | B6 A0 2C 4B | time_when = 1261215926
00000224 | 33 | event_type = 51
00000225 | 64 00 00 00 | server_id = 100
00000229 | 36 00 00 00 | event_len = 54
0000022D | 56 02 00 00 | log_pos = 00000256
00000231 | 00 00 | flags = <none>
------------------------
00000233 | 49 4E 53 45 | query = "INSERT INTO t1 VALUES (1), (2), (3)"
00000237 | 52 54 20 49 |
0000023B | 4E 54 4F 20 |
0000023F | 74 31 20 56 |
00000243 | 41 4C 55 45 |
00000247 | 53 20 28 31 |
0000024B | 29 2C 20 28 |
0000024F | 32 29 2C 20 |
00000253 | 28 33 29 |
************************
In the binary log, the Annotate_rows event follows the (possible) 'BEGIN'
Query event and precedes the first of the Table map events which accompany
the corresponding rows events. (See the example in the "mysqlbinlog output"
section below.)
2. Server option: --binlog-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the master to write Annotate_rows events to the binary log.
* Variable Name: binlog_annotate_rows_events
* Scope: Global & Session
* Access Type: Dynamic
* Data Type: bool
* Default Value: OFF
NOTE. Session values allow annotating only selected statements:
...
SET SESSION binlog_annotate_rows_events=ON;
... statements to be annotated ...
SET SESSION binlog_annotate_rows_events=OFF;
... statements not to be annotated ...
3. Server option: --replicate-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tells the slave to reproduce Annotate_rows events received from the master
in its own binary log (sensible only together with the log-slave-updates
option).
* Variable Name: replicate_annotate_rows_events
* Scope: Global
* Access Type: Read only
* Data Type: bool
* Default Value: OFF
NOTE. Why do we additionally need this 'replicate' option? Why not make
the slave reproduce these events whenever its binlog-annotate-rows-events
global value is ON? Because, for example, we may want to configure a slave
which reproduces Annotate_rows events but has the global
binlog-annotate-rows-events = OFF, that being the default value for the
client threads (see also "How slave treats replicate-annotate-rows-events
option" in the LLD part).
4. mysqlbinlog option: --print-annotate-rows-events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
With this option, mysqlbinlog prints the content of Annotate_rows events (if
the binary log does contain them). Without this option (i.e. by default),
mysqlbinlog skips Annotate_rows events.
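For example, to inspect the annotations in an existing binary log one
would run something like this (the log file name is just a placeholder):
  mysqlbinlog --print-annotate-rows-events master-bin.000001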
5. mysqlbinlog output
~~~~~~~~~~~~~~~~~~~~~
With --print-annotate-rows-events, mysqlbinlog outputs Annotate_rows events
in a form like this:
...
# at 1646
#091219 12:45:26 server id 100 end_log_pos 1714 Query thread_id=1
exec_time=0 error_code=0
SET TIMESTAMP=1261215926/*!*/;
BEGIN
/*!*/;
# at 1714
# at 1812
# at 1853
# at 1894
# at 1938
#091219 12:45:26 server id 100 end_log_pos 1812 Query: `DELETE t1, t2 FROM
t1 INNER JOIN t2 INNER JOIN t3 WHERE t1.a=t2.a AND t2.a=t3.a`
#091219 12:45:26 server id 100 end_log_pos 1853 Table_map: `test`.`t1`
mapped to number 16
#091219 12:45:26 server id 100 end_log_pos 1894 Table_map: `test`.`t2`
mapped to number 17
#091219 12:45:26 server id 100 end_log_pos 1938 Delete_rows: table id 16
#091219 12:45:26 server id 100 end_log_pos 1982 Delete_rows: table id 17
flags: STMT_END_F
...
LOW-LEVEL DESIGN:
Content
~~~~~~~
1. Annotate_rows event number
2. Outline of Annotate_rows event behavior
3. How Master writes Annotate_rows events to the binary log
4. How slave treats replicate-annotate-rows-events option
5. How slave IO thread requests Annotate_rows events
6. How master executes the request
7. How slave SQL thread processes Annotate_rows events
8. General remarks
1. Annotate_rows event number
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To avoid possible event number conflicts with MySQL/Sun, we leave a gap
between the last MySQL event number and the Annotate_rows event number:
enum Log_event_type
{ ...
INCIDENT_EVENT= 26,
// New MySQL event numbers are to be added here
MYSQL_EVENTS_END,
MARIA_EVENTS_BEGIN= 51,
// New Maria event numbers start from here
ANNOTATE_ROWS_EVENT= 51,
ENUM_END_EVENT
};
together with the corresponding extension of the 'post_header_len' array in
the Format description event. (This extension does not affect the
compatibility of the binary log.) Here is what the Format description event
looks like with this extension:
************************
FORMAT_DESCRIPTION_EVENT
************************
00000004 | A1 A0 2C 4B | time_when = 1261215905
00000008 | 0F | event_type = 15
00000009 | 64 00 00 00 | server_id = 100
0000000D | 7F 00 00 00 | event_len = 127
00000011 | 83 00 00 00 | log_pos = 00000083
00000015 | 01 00 | flags = LOG_EVENT_BINLOG_IN_USE_F
------------------------
00000017 | 04 00 | binlog_ver = 4
00000019 | 35 2E 32 2E | server_ver = 5.2.0-MariaDB-alpha-debug-log
..... ...
0000004B | A1 A0 2C 4B | time_created = 1261215905
0000004F | 13 | common_header_len = 19
------------------------
post_header_len
------------------------
00000050 | 38 | 56 - START_EVENT_V3 [1]
..... ...
00000069 | 02 | 2 - INCIDENT_EVENT [26]
0000006A | 00 | 0 - RESERVED [27]
..... ...
00000081 | 00 | 0 - RESERVED [50]
00000082 | 00 | 0 - ANNOTATE_ROWS_EVENT [51]
************************
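The extension itself amounts to one more entry in that array; a minimal
sketch (the array is indexed by event type minus one, and the entry is 0
because, as shown above, Annotate_rows has an empty post-header):
  Format_description_log_event::Format_description_log_event(...)
  { ...
    post_header_len[ANNOTATE_ROWS_EVENT - 1]= 0; // empty post-header
    ...
  }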
2. Outline of Annotate_rows event behavior
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Each Annotate_rows_log_event object has two private members describing the
corresponding query:
char *m_query_txt;
uint m_query_len;
When the object is created for writing to a binary log, this query is taken
from 'thd' (for brevity, below we omit the 'Annotate_rows_log_event::' prefix
as well as other implementation details):
Annotate_rows_log_event(THD *thd)
{
m_query_txt = thd->query();
m_query_len = thd->query_length();
}
When the object is read from a binary log, the query is taken from the buffer
containing the binary log representation of the event (this buffer is
allocated in the Log_event object from which all log events are derived):
Annotate_rows_log_event(char *buf, uint event_len,
Format_description_log_event *desc)
{
m_query_len = event_len - desc->common_header_len;
m_query_txt = buf + desc->common_header_len;
}
The events are written to the binary log by the Log_event::write() member,
which calls the virtual write_data_header() and write_data_body() members
("data header" and "post header" are synonyms in replication terminology).
In our case, the data header is empty and the data body is just the query:
bool write_data_body(IO_CACHE *file)
{
return my_b_safe_write(file, (uchar*) m_query_txt, m_query_len);
}
Printing the event is just printing the query:
void Annotate_rows_log_event::print(FILE *file, PRINT_EVENT_INFO *pinfo)
{
my_b_printf(&pinfo->head_cache, "\tQuery: `%s`\n", m_query_txt);
}
3. How Master writes Annotate_rows events to the binary log
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The event is written to the binary log just before the group of Table_map
events which precede the corresponding Rows events (one query may generate
several Table map events in the binary log, but the corresponding
Annotate_rows event must be written only once before the first Table map
event; hence the boolean variable 'with_annotate' below):
int write_locked_table_maps(THD *thd)
{ ...
bool with_annotate= thd->variables.binlog_annotate_rows_events;
...
for (uint i= 0; i < ... <number of tables> ...; ++i)
{ ...
thd->binlog_write_table_map(table, ..., with_annotate);
with_annotate= 0; // write Annotate_event not more than once
...
}
...
}
int THD::binlog_write_table_map(TABLE *table, ..., bool with_annotate)
{ ...
Table_map_log_event the_event(...);
...
if (with_annotate)
{
Annotate_rows_log_event anno(this);
mysql_bin_log.write(&anno);
}
mysql_bin_log.write(&the_event);
...
}
4. How slave treats replicate-annotate-rows-events option
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The replicate-annotate-rows-events option is treated just as the session
value of the binlog_annotate_rows_events variable for the slave IO and
SQL threads. This setting is done during initialization of these threads:
pthread_handler_t handle_slave_io(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_IO);
...
}
pthread_handler_t handle_slave_sql(void *arg)
{
THD *thd= new THD;
...
init_slave_thread(thd, SLAVE_THD_SQL);
...
}
int init_slave_thread(THD* thd, SLAVE_THD_TYPE thd_type)
{ ...
thd->variables.binlog_annotate_rows_events=
opt_replicate_annotate_rows_events;
...
}
5. How slave IO thread requests Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If the replicate-annotate-rows-events option is not set on a slave, there
is no need for the master to send Annotate_rows events to this slave. The
slave (or mysqlbinlog in the remote case), before requesting a binlog dump
via the COM_BINLOG_DUMP command, informs the master whether it should send
these events by executing the newly added COM_BINLOG_DUMP_OPTIONS_EXT
server command:
case COM_BINLOG_DUMP_OPTIONS_EXT:
thd->binlog_dump_flags_ext= packet[0];
my_ok(thd);
break;
Note. We add this new command and don't use COM_BINLOG_DUMP to avoid possible
conflicts with MySQL/Sun.
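On the slave side, the IO thread could then issue this command just before
requesting the dump itself; a hypothetical sketch (the one-byte packet
mirrors the packet[0] read in the handler above, and the flag value is the
BINLOG_SEND_ANNOTATE_ROWS_EVENT constant introduced earlier):
  int request_dump(MYSQL* mysql, ...)
  { ...
    if (mi->io_thd->variables.binlog_annotate_rows_events)
    {
      uchar options_buf[1];
      options_buf[0]= BINLOG_SEND_ANNOTATE_ROWS_EVENT;
      // ask the master to send Annotate_rows events as well
      simple_command(mysql, COM_BINLOG_DUMP_OPTIONS_EXT, options_buf, 1, 0);
    }
    ...
    simple_command(mysql, COM_BINLOG_DUMP, buf, ...);
    ...
  }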
6. How master executes the request
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
case COM_BINLOG_DUMP:
{ ...
flags= uint2korr(packet + 4);
...
mysql_binlog_send(thd, ..., flags);
...
}
void mysql_binlog_send(THD* thd, ..., ushort flags)
{ ...
Log_event::read_log_event(&log, packet, ...);
...
if ((*packet)[EVENT_TYPE_OFFSET + 1] != ANNOTATE_ROWS_EVENT ||
flags & BINLOG_SEND_ANNOTATE_ROWS_EVENT)
{
my_net_write(net, packet->ptr(), packet->length());
}
...
}
7. How slave SQL thread processes Annotate_rows events
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The slave processes each received event by "applying" it, i.e. by
calling the Log_event::apply_event() function which in turn calls
the virtual do_apply_event() member specific for each type of the
event.
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev = next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
int apply_event_and_update_pos(Log_event *ev, ...)
{ ...
ev->apply_event(...);
...
}
int Log_event::apply_event(...)
{
return do_apply_event(...);
}
What does it mean to "apply" an Annotate_rows event? It means to set current
thd query to that of the described by the event, i.e. to the query which
caused the subsequent Rows events (see "How Master writes Annotate_rows
events to the binary log" to follow what happens further when the subsequent
Rows events are applied):
int Annotate_rows_log_event::do_apply_event(...)
{
thd->set_query(m_query_txt, m_query_len);
}
NOTE. It may be necessary to save the current values of thd->query and
thd->query_length before calling set_query() and to restore them when the
Annotate_rows_log_event object is deleted. Whether this is really needed
is still an open question.
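If such a save/restore turned out to be necessary, it might look like this
(a hypothetical sketch: the saved_* members and the way the object reaches
thd at deletion time are illustrative assumptions, not part of this design):
  int Annotate_rows_log_event::do_apply_event(...)
  {
    saved_query_txt= thd->query();         // remember the previous query
    saved_query_len= thd->query_length();
    thd->set_query(m_query_txt, m_query_len);
  }
  void Annotate_rows_log_event::restore_thd_query(THD *thd)
  {
    // to be called just before the object is deleted
    thd->set_query(saved_query_txt, saved_query_len);
  }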
After calling this do_apply_event() function we must not delete the
Annotate_rows_log_event object immediately (see exec_relay_log_event()
above) because thd->query now points to the string inside this object.
We may keep the pointer to this object in the Relay_log_info:
class Relay_log_info
{
public:
...
void set_annotate_event(Annotate_rows_log_event*);
Annotate_rows_log_event* get_annotate_event();
void free_annotate_event();
...
private:
Annotate_rows_log_event* m_annotate_event;
};
The saved Annotate_rows object should be deleted when all corresponding
Rows events have been processed:
int exec_relay_log_event(THD* thd, Relay_log_info* rli)
{ ...
Log_event *ev= next_event(rli);
...
apply_event_and_update_pos(ev, ...);
if (rli->get_annotate_event() && is_last_rows_event(ev))
rli->free_annotate_event();
else if (ev->get_type_code() == ANNOTATE_ROWS_EVENT)
rli->set_annotate_event((Annotate_rows_log_event*) ev);
else if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
delete ev;
...
}
where
bool is_last_rows_event(Log_event* ev)
{
Log_event_type type= ev->get_type_code();
if (IS_ROWS_EVENT_TYPE(type))
{
Rows_log_event* rows= (Rows_log_event*)ev;
return rows->get_flags(Rows_log_event::STMT_END_F);
}
return 0;
}
#define IS_ROWS_EVENT_TYPE(type) ((type) == WRITE_ROWS_EVENT || \
(type) == UPDATE_ROWS_EVENT || \
(type) == DELETE_ROWS_EVENT)
8. General remarks
~~~~~~~~~~~~~~~~~~
Kristian noticed that introducing a new log event type should be
coordinated somehow with MySQL/Sun:
Kristian: The numeric code for this event must be assigned carefully.
It should be coordinated with MySQL/Sun, otherwise we can get into a
situation where MySQL uses the same numeric code for one event that
MariaDB uses for ANNOTATE_ROWS_EVENT, which would make merging the two
impossible.
Alex: I reserved about 20 numbers to avoid possible conflicts
with MySQL.
Kristian: Still, I think it would be appropriate to send a polite email
to internals(a)lists.mysql.com about this and suggest reserving the
event number.
ESTIMATED WORK TIME
ESTIMATED COMPLETION DATE
-----------------------------------------------------------------------
WorkLog (v3.5.9)
Hi,
I've implemented a few things for the Windows port of SHOW PROFILE
(IO read/writes, user/kernel times, page faults) which you may want to
consider looking at. There are also a few miscellaneous fixes in there.
The commit log has the details.
See: https://launchpad.net/~abudovski/maria/robust
Thanks.