Re: [Maria-developers] [Commits] 3f4dab80912: MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY
Hi Varun, On Mon, Nov 12, 2018 at 02:41:51PM +0530, Varun wrote:
revision-id: 3f4dab80912682325b61b288178febe9ee55de49 (mariadb-10.0.36-78-g3f4dab80912) parent(s): 6cecb10a2f8b6536bed78ab6d3791d8befc9d732 author: Varun Gupta committer: Varun Gupta timestamp: 2018-11-12 14:40:07 +0530 message:
MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY
The problem here is EITS statistics does not calculate statistics for the partitions of the table. So a temporary solution would be to not read EITS statistics for partitioned tables.
Also disabling reading of EITS for columns that participate in the partition list of a table.
--- mysql-test/r/partition.result | 37 +++++++++++++++++++++++++++++++++++++ mysql-test/t/partition.test | 35 +++++++++++++++++++++++++++++++++++ sql/opt_range.cc | 12 ++++++++++-- sql/partition_info.cc | 21 +++++++++++++++++++++ sql/partition_info.h | 1 + sql/sql_statistics.cc | 6 ++++++ 6 files changed, 110 insertions(+), 2 deletions(-)
diff --git a/mysql-test/r/partition.result b/mysql-test/r/partition.result index c6669176b3d..abfbc21dccb 100644 --- a/mysql-test/r/partition.result +++ b/mysql-test/r/partition.result @@ -2645,3 +2645,40 @@ Warnings: Note 1517 Duplicate partition name p2 DEALLOCATE PREPARE stmt; DROP TABLE t1; +# +# MDEV-17032: Estimates are higher for partitions of a table with @@use_stat_tables= PREFERABLY +# +create table ten(a int); +insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9); +create table one_k(a int); +insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
Please rename the tables to follow the t0,t1,t2... convention.
+create table t1 ( +part_key int, +a int, +b int +) partition by list(part_key) ( +partition p0 values in (0), +partition p1 values in (1), +partition p2 values in (2), +partition p3 values in (3), +partition p4 values in (4) +); +insert into t1 +select mod(a,5), a/100, a from one_k; +set @save_use_stat_tables= @@use_stat_tables; +set @save_optimizer_use_condition_selectivity=@@optimizer_use_condition_selectivity; +set @@use_stat_tables= PREFERABLY; +set @@optimizer_use_condition_selectivity=4; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK
After some time passes, it gets hard to understand what exactly is being checked. Then, there's some change in the EXPLAIN (typically for an unrelated reason) and one is left wondering, whether the new EXPLAIN output is ok or not. * please use EXPLAIN PARTITIONS. * please add something like "--echo # rows should be X, not Y" before the query.
+explain select * from t1 where part_key in (1,2); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 400 Using where +explain select * from t1 where part_key > 1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 600 Using where
This does not provide sufficient coverage. I get this result if I just apply the change in sql_statistics.cc. mysql> explain select * from t1 where part_key in (1,2); +------+-------------+-------+------+---------------+------+---------+------+------+-------------+ | id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra | +------+-------------+-------+------+---------------+------+---------+------+------+-------------+ | 1 | SIMPLE | t1 | ALL | NULL | NULL | NULL | NULL | 400 | Using where | +------+-------------+-------+------+---------------+------+---------+------+------+-------------+ 1 row in set (0.01 sec) mysql> explain select * from t1 where part_key > 1; +------+-------------+-------+------+---------------+------+---------+------+------+-------------+ | id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra | +------+-------------+-------+------+---------------+------+---------+------+------+-------------+ | 1 | SIMPLE | t1 | ALL | NULL | NULL | NULL | NULL | 600 | Using where | +------+-------------+-------+------+---------------+------+---------+------+------+-------------+ 1 row in set (0.00 sec) Please add testcases for other changes.
+set @@optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity; +set @@use_stat_tables= @save_use_stat_tables; +drop table t1, one_k, ten; diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 3bcaa72e32f..993b7d57e0a 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -3322,6 +3322,10 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param, { Field **field_ptr; TABLE *table= param->table; + partition_info *part_info= NULL; + #ifdef WITH_PARTITION_STORAGE_ENGINE + part_info= table->part_info; + #endif uint parts= 0;
for (field_ptr= table->field; *field_ptr; field_ptr++) @@ -3329,7 +3333,9 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param, Column_statistics* col_stats= (*field_ptr)->read_stats; if (bitmap_is_set(used_fields, (*field_ptr)->field_index) && col_stats && !col_stats->no_stat_values_provided() - && !((*field_ptr)->type() == MYSQL_TYPE_GEOMETRY)) + && !((*field_ptr)->type() == MYSQL_TYPE_GEOMETRY) + && (!part_info || + part_info->disable_eits_for_partitioning_columns(*field_ptr))) parts++; }
@@ -3350,7 +3356,9 @@ bool create_key_parts_for_pseudo_indexes(RANGE_OPT_PARAM *param, if (bitmap_is_set(used_fields, (*field_ptr)->field_index)) { Field *field= *field_ptr; - if (field->type() == MYSQL_TYPE_GEOMETRY) + if (field->type() == MYSQL_TYPE_GEOMETRY + && (!part_info || + part_info->disable_eits_for_partitioning_columns(*field_ptr))) continue; Why use 'field_ptr' if you have 'field' ?
The above condition seems to be wrong. Suppose that the following holds: field->type()==MYSQL_TYPE_GEOMETRY, part_info!=NULL part_info->disable_eits_for_partitioning_columns(field) == FALSE // makes // sense as one typically does not do partitioning on geometry columns. if-condition evaluates to: (TRUE && (FALSE || FALSE)) -> (TRUE && FALSE) -> FALSE. So we will not execute 'continue;' for a geometry column. This seems incorrect.
uint16 store_length; diff --git a/sql/partition_info.cc b/sql/partition_info.cc index 52bda560c1c..91d74b28d0e 100644 --- a/sql/partition_info.cc +++ b/sql/partition_info.cc @@ -3164,6 +3164,27 @@ void partition_info::print_debug(const char *str, uint *value) DBUG_PRINT("info", ("parser: %s", str)); DBUG_VOID_RETURN; } + +/* + + Disabling reading EITS statistics for columns involved in the + partition list of a table. + We assume the selecticivity for such columns would be handled + during partition pruning. + +*/ + +bool partition_info::disable_eits_for_partitioning_columns(Field *field)
The function name is misleading. If a function is named disable_XYZ() I would have expected the code inside to actually take the steps to prevent XYZ from happening. I think it would be more straightforward if it was called bool partition_info::field_is_used_in_partition_expr() const or something like that. Please also declare the function as 'const' and maybe its parameter too. (As discussed on Slack, the function should also search for the field in the list of sub-partitioning columns)
+{ + uint i; + for (i= 0; i < num_part_fields; i++) + { + if (field->eq(part_field_array[i])) + return FALSE; + } + return TRUE; +} + #else /* WITH_PARTITION_STORAGE_ENGINE */ /* For builds without partitioning we need to define these functions diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index cb75a5c2176..33a12588434 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -3589,6 +3589,12 @@ void set_statistics_for_table(THD *thd, TABLE *table) (use_stat_table_mode <= COMPLEMENTARY || !table->stats_is_read || read_stats->cardinality_is_null) ? table->file->stats.records : read_stats->cardinality; + + #ifdef WITH_PARTITION_STORAGE_ENGINE + if (table->part_info) + table->used_stat_records= table->file->stats.records; + #endif
This looks cryptic. Please add a comment, something like: /* For partitioned table, EITS statistics is based on data from all partitions. On the other hand, Partition Pruning figures which partitions will be accessed and then computes the estimate of rows in used_partitions. Use the estimate from Partition Pruning as it is typically more precise. Ideally, EITS should provide per-partition statistics but this is not implemented currently. */
KEY *key_info, *key_info_end; for (key_info= table->key_info, key_info_end= key_info+table->s->keys; key_info < key_info_end; key_info++)
BR Sergei -- Sergei Petrunia, Software Developer MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog
participants (1)
-
Sergey Petrunia