Hi Varun, Please find my feedback below. As disucssed before: please also run the MDEV's test case with this patch and post a comment with the result. On Fri, Mar 23, 2018 at 05:17:57PM +0530, Varun wrote:
revision-id: 2950feb5c09033843a6a32c7431421b45d5812dd (mariadb-10.3.0-646-g2950feb5c09) parent(s): 9ef7ab221d70630155206910de8a6053db097164 author: Varun Gupta committer: Varun Gupta timestamp: 2018-03-23 17:17:09 +0530 message:
MDEV-9959: A serious MariaDB server performance bug
step#1: if a derived table has SELECT DISTINCT, provide index statistics for it so that the join optimizer in the upper select knows that ref access to the table will produce one row.
Added handling for multiple selects in the derived table
--- mysql-test/r/mdev9959.result | 64 ++++++++++++++++++++++++++++++++++++++++++++ mysql-test/t/mdev9959.test | 28 +++++++++++++++++++ sql/sql_lex.h | 2 ++ sql/sql_union.cc | 36 +++++++++++++++++++++++++ sql/table.cc | 26 +++++++++++++++--- 5 files changed, 153 insertions(+), 3 deletions(-)
diff --git a/mysql-test/r/mdev9959.result b/mysql-test/r/mdev9959.result new file mode 100644 index 00000000000..5269ba9b734 --- /dev/null +++ b/mysql-test/r/mdev9959.result @@ -0,0 +1,64 @@ +create table t1(a int); +insert into t1 values (1),(2),(3),(4),(5),(6); +create table t2(a int, b int,c int); +insert into t2(a,b,c) values (1,1,2),(2,2,3),(3,1,4),(4,2,2),(5,1,1),(6,2,5); +create table t3(a int, b int); +insert into t3(a,b) values (1,1),(2,2),(2,1),(1,2),(5,1),(9,2); +analyze select * from t1 , ( (select distinct t2.a from t2 group by c))q where t1.a=q.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using where +1 PRIMARY <derived2> ref key0 key0 5 test.t1.a 1 0.83 100.00 100.00 +2 DERIVED t2 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort
It is not at all clear which parts of the above output matter and which don't, for example, does #rows matter for table t1? Please add a comment (into .test file with --echo # ) describing that table "<derived2>" should have type=ref and rows=1
+select * from t1 , ( (select distinct t2.a from t2 group by c))q where t1.a=q.a; +a a +1 1 +2 2 +3 3 +5 5 +6 6 +analyze select * from t1 , ( (select t2.a from t2 group by c))q where t1.a=q.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using where +1 PRIMARY <derived2> ref key0 key0 5 test.t1.a 2 0.83 100.00 100.00 +2 DERIVED t2 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort +select * from t1 , ( (select t2.a from t2 group by c))q where t1.a=q.a; +a a +1 1 +2 2 +3 3 +5 5 +6 6 +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q where t1.a=q.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using where +1 PRIMARY <derived2> ref key0 key0 5 test.t1.a 2 1.67 100.00 100.00 +2 DERIVED t2 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort +3 UNION t2 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort
What does the above test show? We've got type=ref, #rows = 2 for the derived table. The estimate is correct but where does it come from? As far as I understand this patch, it only sets the estimate rows=1 for derived tables. How do we arrive at the value of 2? (If we would have gotten rows=2 without this patch, what's the point of having the testcase as part of this patch?)
+select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q where t1.a=q.a; +a a +1 1 +1 1 +2 2 +2 2 +3 3 +3 3 +5 5 +5 5 +6 6 +6 6 +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q where t1.a=q.a; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra +1 PRIMARY t1 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using where +1 PRIMARY <derived2> ref key0 key0 5 test.t1.a 2 0.83 100.00 100.00 +2 DERIVED t2 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort +3 UNION t2 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort +4 UNION t3 ALL NULL NULL NULL NULL 6 6.00 100.00 100.00 Using temporary; Using filesort +NULL UNION RESULT <union2,3,4> ALL NULL NULL NULL NULL NULL 5.00 NULL NULL +select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q where t1.a=q.a; +a a +1 1 +2 2 +3 3 +5 5 +6 6 +drop table t1,t2,t3; diff --git a/mysql-test/t/mdev9959.test b/mysql-test/t/mdev9959.test new file mode 100644 index 00000000000..77970f363ac --- /dev/null +++ b/mysql-test/t/mdev9959.test @@ -0,0 +1,28 @@ +create table t1(a int); +insert into t1 values (1),(2),(3),(4),(5),(6); +create table t2(a int, b int,c int); +insert into t2(a,b,c) values (1,1,2),(2,2,3),(3,1,4),(4,2,2),(5,1,1),(6,2,5); +create table t3(a int, b int); +insert into t3(a,b) values (1,1),(2,2),(2,1),(1,2),(5,1),(9,2); + +# one select in derived table + +#with distinct +analyze select * from t1 , ( (select distinct t2.a from t2 group by c))q where t1.a=q.a; +select * from t1 , ( (select distinct t2.a from t2 group by c))q where t1.a=q.a; +# without distinct +analyze select * from t1 , ( (select t2.a from t2 group by c))q where t1.a=q.a; +select * from t1 , ( (select t2.a from t2 group by c))q where t1.a=q.a; + +# multiple selects in derived table + +#UNION ALL +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q where t1.a=q.a; +select * from t1 , ( (select distinct t2.a from t2 group by c) union all (select distinct t2.a from t2 group by c))q where t1.a=q.a; + +#UNION +analyze select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q where t1.a=q.a; +select * from t1 , ( (select distinct t2.a from t2 group by c) union (select distinct t2.a from t2 group by c) union (select t3.a from t3 group by b))q where t1.a=q.a; + +drop table t1,t2,t3; + diff --git a/sql/sql_lex.h b/sql/sql_lex.h index f0241a32acf..4128c1dd300 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -761,6 +761,7 @@ class st_select_lex_unit: public st_select_lex_node { Procedure *last_procedure; /* Pointer to procedure, if such exists */
bool columns_are_renamed; + bool union_all; /* TRUE if we have UNION ALL operation */
This variable is checked but it is never set?
void init_query(); st_select_lex* outer_select(); @@ -800,6 +801,7 @@ class st_select_lex_unit: public st_select_lex_node { bool union_needs_tmp_table();
void set_unique_exclude(); + bool check_distinct_in_union();
friend struct LEX; friend int subselect_union_engine::exec(); diff --git a/sql/sql_union.cc b/sql/sql_union.cc index 857c9a117f5..43ce0418fe5 100644 --- a/sql/sql_union.cc +++ b/sql/sql_union.cc @@ -1960,3 +1960,39 @@ void st_select_lex_unit::set_unique_exclude() } } } + +/* + Check if the selects in the derived table can give distinct rows
I think "can" is the wrong word here. UNION/INTERSECT/EXCEPT output "can" be distinct if the underlying selects produce distinct sets of rows. What we are checking is whether the output will be distinct regardless of what data is there in the queried tables.
+ + for example: + select * from + ((select t1.a from t1) op (select t2.a from t2) op (select t3.a from t3)); + the op here being UNION/INTERSECT/EXCEPT + + so this function would check if the derived table like the case above + can give distinct rows or not.
Please describe the criteria and provide a couple of examples (either here or in the comments inside the function).
+ + @retval false Distinct rows are not guarenteed + @retval true Distinct rows are guanrenteed Typos in the above lines.
+ +*/ + +bool st_select_lex_unit::check_distinct_in_union() +{ + bool is_intersect_present=FALSE; + st_select_lex* first= first_select(); + for(st_select_lex *sl=first; sl ; sl=sl->next_select()) + is_intersect_present|= (sl->linkage == INTERSECT_TYPE); + + if (!union_all) + return true; + else + { + if (union_select) + { + if (!is_intersect_present && !union_select->next_select()) + return true; + } + } + return false; +} diff --git a/sql/table.cc b/sql/table.cc index f42455a8413..c474d2ed4d7 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -7113,9 +7113,29 @@ bool TABLE::add_tmp_key(uint key, uint key_parts, if (!keyinfo->rec_per_key) return TRUE; bzero(keyinfo->rec_per_key, sizeof(ulong)*key_parts); - st_select_lex* lex= pos_in_table_list ? pos_in_table_list->derived->first_select(): NULL; - if (lex && (lex->options & SELECT_DISTINCT)) - keyinfo->rec_per_key[key_parts-1]=1; + + /* + For the case when there is a derived table that would give distinct rows, + the index statistics are passed to the join optimizer to tell that + a ref access to the derived table will produce only one row. + */ + + st_select_lex_unit* derived= pos_in_table_list ? pos_in_table_list->derived: NULL; + if (derived) + { + /* + This handles the case when we have a single select in the derived table + */ + st_select_lex* first= derived->first_select(); + if (first && !first->is_part_of_union() && + first->options & SELECT_DISTINCT) + keyinfo->rec_per_key[key_parts-1]=1; + else + { + if (check_distinct_in_union()) + keyinfo->rec_per_key[key_parts-1]=1; + }
I think it would be better if there was a single "if", like so: if ((this is a non-union select with SELECT_DISTINCT || check_distinct_in_union()) { keyinfo->rec_per_key[key_parts-1]=1; }
+ } keyinfo->read_stats= NULL; keyinfo->collected_stats= NULL;
BR Sergei -- Sergei Petrunia, Software Developer MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog