revision-id: 53730224efd987f97a6cc968ff5214ee499d84e0 (mariadb-10.4.1-163-g53730224efd)
parent(s): 3c305d3f1951f1667f84e48ddd98674c6318c39d
author: Vicențiu Ciorbaru
committer: Vicențiu Ciorbaru
timestamp: 2019-02-10 19:54:50 +0200
message:

Improve histogram collection performance by sampling

Histogram collection is now done by sampling a percentage of the rows in
the table instead of reading every individual one. The default
implementation, which keeps the server's Engine Independent Statistics
component working, uses Bernoulli sampling: it still does a simple table
scan, but only accepts rows that pass a dice roll. The sampling is
implemented as a storage engine interface method, so that storage engines
can provide faster and/or more efficient implementations internally.

The number of rows collected has a lower bound of 50000 and increases
logarithmically with the table size, with a coefficient of 4096. The
coefficient is chosen so that we expect an estimation error of less than
3%, according to the paper "Random Sampling for Histogram Construction:
How much is enough?" by Surajit Chaudhuri, Rajeev Motwani and Vivek
Narasayya, ACM SIGMOD 1998.

This interface is also a precursor to allowing SELECT ... FROM table with
sampling to work.

Performance-wise, for a table of 10 million rows and 13 columns (6 ints,
6 doubles, one string), the previous ANALYZE TABLE statistics collection
took 1 minute 20 seconds to gather all the data. With this patch the time
is less than 6 seconds. The test was run on an InnoDB table on an NVMe SSD
with approximately 2GB/s read speed and 500MB/s write speed.
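For illustration only, here is a small self-contained sketch of the sampling
logic this patch adds. The real code lives in handler::random_sample() and
collect_statistics_for_table() in the diff below; the names and structure of
this snippet are made up:

  // Sketch of the sampling math used for histogram collection: the expected
  // number of sampled rows has a floor of 50000 and grows logarithmically
  // with the table size, scaled by a coefficient of 4096.
  #include <algorithm>
  #include <cmath>
  #include <cstdint>
  #include <random>

  // Fraction of rows to accept for a table with `records` rows.
  double sample_fraction(uint64_t records)
  {
    const double min_rows= 50000, coefficient= 4096;
    if (records < min_rows)
      return 1.0;                          // small tables are read in full
    return std::min((min_rows + coefficient * std::log(200.0 * records)) /
                    records, 1.0);
  }

  // Bernoulli sampling: every row read by the scan passes an independent
  // dice roll and is kept with probability equal to the sample fraction.
  bool accept_row(std::mt19937_64 &rng, double fraction)
  {
    return std::uniform_real_distribution<double>(0.0, 1.0)(rng) <= fraction;
  }

For the 10-million-row table from the benchmark above this gives a fraction
of roughly (50000 + 4096 * ln(2e9)) / 1e7 ≈ 0.014, i.e. about 140 thousand
rows are expected to be passed to the statistics collectors. Note that the
default Bernoulli implementation still reads every row from the engine; the
speedup comes from skipping the per-column statistics work for rows that
fail the dice roll.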
---
 mysql-test/main/selectivity.result        |  8 +++----
 mysql-test/main/selectivity_innodb.result |  8 +++----
 sql/handler.cc                            | 14 +++++++++++
 sql/handler.h                             | 40 ++++++++++++++++++++++++++++++-
 sql/sql_statistics.cc                     | 32 +++++++++++++++++++------
 5 files changed, 86 insertions(+), 16 deletions(-)

diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result
index 00907235ecc..6d09c1c9b62 100644
--- a/mysql-test/main/selectivity.result
+++ b/mysql-test/main/selectivity.result
@@ -1290,9 +1290,9 @@ explain extended select * from t1, t2, t1 as t3
 where t1.b=t2.c and t2.d=t3.a and t3.b<5 and t1.a < 2000;
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 262144 100.00 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 262117 100.00 Using where
 1 SIMPLE t2 ref c,d c 5 test.t1.b 5 100.00
-1 SIMPLE t3 ALL NULL NULL NULL NULL 262144 100.00 Using where; Using join buffer (flat, BNL join)
+1 SIMPLE t3 ALL NULL NULL NULL NULL 262117 100.00 Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`c` AS `c`,`test`.`t2`.`d` AS `d`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t2` join `test`.`t1` `t3` where `test`.`t2`.`c` = `test`.`t1`.`b` and `test`.`t3`.`a` = `test`.`t2`.`d` and `test`.`t3`.`b` < 5 and `test`.`t1`.`a` < 2000
 select * from t1, t2, t1 as t3
@@ -1307,9 +1307,9 @@ explain extended select * from t1, t2, t1 as t3
 where t1.b=t2.c and t2.d=t3.a and t3.b<5 and t1.a < 2000;
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t3 ALL NULL NULL NULL NULL 262144 0.00 Using where
+1 SIMPLE t3 ALL NULL NULL NULL NULL 262117 0.00 Using where
 1 SIMPLE t2 ref c,d d 5 test.t3.a 7 100.00
-1 SIMPLE t1 ALL NULL NULL NULL NULL 262144 2.00 Using where; Using join buffer (flat, BNL join)
+1 SIMPLE t1 ALL NULL NULL NULL NULL 262117 2.00 Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`c` AS `c`,`test`.`t2`.`d` AS `d`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t2` join `test`.`t1` `t3` where `test`.`t1`.`b` = `test`.`t2`.`c` and `test`.`t2`.`d` = `test`.`t3`.`a` and `test`.`t3`.`b` < 5 and `test`.`t1`.`a` < 2000
 select * from t1, t2, t1 as t3
diff --git a/mysql-test/main/selectivity_innodb.result b/mysql-test/main/selectivity_innodb.result
index 93917065722..0b20a40f69f 100644
--- a/mysql-test/main/selectivity_innodb.result
+++ b/mysql-test/main/selectivity_innodb.result
@@ -1300,9 +1300,9 @@ explain extended select * from t1, t2, t1 as t3
 where t1.b=t2.c and t2.d=t3.a and t3.b<5 and t1.a < 2000;
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 262144 100.00 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 262117 100.00 Using where
 1 SIMPLE t2 ref c,d c 5 test.t1.b 5 100.00
-1 SIMPLE t3 ALL NULL NULL NULL NULL 262144 100.00 Using where; Using join buffer (flat, BNL join)
+1 SIMPLE t3 ALL NULL NULL NULL NULL 262117 100.00 Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`c` AS `c`,`test`.`t2`.`d` AS `d`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t2` join `test`.`t1` `t3` where `test`.`t2`.`c` = `test`.`t1`.`b` and `test`.`t3`.`a` = `test`.`t2`.`d` and `test`.`t3`.`b` < 5 and `test`.`t1`.`a` < 2000
 select * from t1, t2, t1 as t3
@@ -1317,9 +1317,9 @@ explain extended select * from t1, t2, t1 as t3
 where t1.b=t2.c and t2.d=t3.a and t3.b<5 and t1.a < 2000;
 id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t3 ALL NULL NULL NULL NULL 262144 0.00 Using where
+1 SIMPLE t3 ALL NULL NULL NULL NULL 262117 0.00 Using where
 1 SIMPLE t2 ref c,d d 5 test.t3.a 7 100.00
-1 SIMPLE t1 ALL NULL NULL NULL NULL 262144 2.00 Using where; Using join buffer (flat, BNL join)
+1 SIMPLE t1 ALL NULL NULL NULL NULL 262117 2.00 Using where; Using join buffer (flat, BNL join)
 Warnings:
 Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t2`.`c` AS `c`,`test`.`t2`.`d` AS `d`,`test`.`t3`.`a` AS `a`,`test`.`t3`.`b` AS `b` from `test`.`t1` join `test`.`t2` join `test`.`t1` `t3` where `test`.`t1`.`b` = `test`.`t2`.`c` and `test`.`t2`.`d` = `test`.`t3`.`a` and `test`.`t3`.`b` < 5 and `test`.`t1`.`a` < 2000
 select * from t1, t2, t1 as t3
diff --git a/sql/handler.cc b/sql/handler.cc
index 2e45b5883ea..16fb126918e 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -7547,3 +7547,17 @@ bool Vers_parse_info::check_sys_fields(const Lex_table_name &table_name,
            "ROW END" : found_flag ? "ROW START" : "ROW START/END");
   return true;
 }
+
+int handler::random_sample(uchar *buf)
+{
+  int rc;
+  THD *thd= ha_thd();
+  do
+  {
+    if (table->in_use->check_killed(1))
+      return HA_ERR_ABORTED_BY_USER;
+    rc= rnd_next(buf);
+  } while (rc == HA_ERR_RECORD_DELETED || thd_rnd(thd) > sample_fraction);
+
+  return rc;
+}
diff --git a/sql/handler.h b/sql/handler.h
index dfb2333b24e..fcdadbeb42b 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1913,6 +1913,11 @@ enum enum_stats_auto_recalc { HA_STATS_AUTO_RECALC_DEFAULT= 0,
                               HA_STATS_AUTO_RECALC_ON,
                               HA_STATS_AUTO_RECALC_OFF };
 
+enum sample_mode {
+  HA_SAMPLE_BERNOULLI= 0,
+  HA_SAMPLE_SYSTEM
+};
+
 /**
   A helper struct for schema DDL statements:
     CREATE SCHEMA [IF NOT EXISTS] name [ schema_specification... ]
@@ -2947,9 +2952,11 @@ class handler :public Sql_alloc
   /** Length of ref (1-8 or the clustered key length) */
   uint ref_length;
   FT_INFO *ft_handler;
-  enum init_stat { NONE=0, INDEX, RND };
+  enum init_stat { NONE=0, INDEX, RND, SAMPLE };
   init_stat inited, pre_inited;
 
+  double sample_fraction= 0;
+  enum sample_mode sample_mode;
   const COND *pushed_cond;
   /**
     next_insert_id is the next value which should be inserted into the
@@ -3112,6 +3119,31 @@ class handler :public Sql_alloc
   virtual int prepare_range_scan(const key_range *start_key,
                                  const key_range *end_key)
   { return 0; }
+  int ha_random_sample_init(THD *thd, enum sample_mode mode, double fraction)
+    __attribute__((warn_unused_result))
+  {
+    DBUG_ENTER("ha_random_sample_init");
+    DBUG_ASSERT(inited==NONE);
+    int result;
+    sample_mode= mode;
+    sample_fraction= fraction;
+    inited= (result= random_sample_init(mode, fraction)) ? NONE : SAMPLE;
+    DBUG_RETURN(result);
+  }
+  int ha_random_sample(uchar *buf)
+    __attribute__((warn_unused_result))
+  {
+    DBUG_ENTER("ha_random_sample");
+    DBUG_ASSERT(inited == SAMPLE);
+    DBUG_RETURN(random_sample(buf));
+  }
+  int ha_random_sample_end()
+  {
+    DBUG_ENTER("ha_random_sample_end");
+    inited= NONE;
+    DBUG_RETURN(random_sample_end());
+  }
+
   int ha_rnd_init(bool scan) __attribute__ ((warn_unused_result))
   {
     DBUG_EXECUTE_IF("ha_rnd_init_fail", return HA_ERR_TABLE_DEF_CHANGED;);
@@ -4425,6 +4457,12 @@ class handler :public Sql_alloc
   /* Note: ha_index_read_idx_map() may bypass index_init() */
   virtual int index_init(uint idx, bool sorted) { return 0; }
   virtual int index_end() { return 0; }
+  virtual int random_sample_init(enum sample_mode mode, double fraction)
+  {
+    return rnd_init(TRUE);
+  }
+  virtual int random_sample(uchar *buf);
+  virtual int random_sample_end() { return rnd_end(); }
   /**
     rnd_init() can be called two times without rnd_end() in between
     (it only makes sense if scan=1).
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index db214a1fe28..22a015821de 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -2727,12 +2727,16 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
   Field *table_field;
   ha_rows rows= 0;
   handler *file=table->file;
+  double sample_fraction;
+  const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
 
   DBUG_ENTER("collect_statistics_for_table");
 
   table->collected_stats->cardinality_is_null= TRUE;
   table->collected_stats->cardinality= 0;
 
+  table->file->info(HA_STATUS_VARIABLE);
+
   for (field_ptr= table->field; *field_ptr; field_ptr++)
   {
     table_field= *field_ptr;
@@ -2743,12 +2747,27 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
 
   restore_record(table, s->default_values);
 
-  /* Perform a full table scan to collect statistics on 'table's columns */
-  if (!(rc= file->ha_rnd_init(TRUE)))
-  {
+
+  if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
+  {
+    sample_fraction= 1;
+  }
+  else
+  {
+    sample_fraction= std::fmin(
+        (MIN_THRESHOLD_FOR_SAMPLING + 4096 *
+         log(200 * file->records())) / file->records(), 1);
+  }
+
+
+  /* Fetch samples from the table to collect statistics on table's columns */
+
+  if (!(rc= file->ha_random_sample_init(thd, HA_SAMPLE_BERNOULLI,
+                                        sample_fraction)))
+  {
     DEBUG_SYNC(table->in_use, "statistics_collection_start");
 
-    while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
+    while ((rc= file->ha_random_sample(table->record[0])) != HA_ERR_END_OF_FILE)
     {
       if (thd->killed)
         break;
@@ -2768,10 +2787,9 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
         break;
       rows++;
     }
-    file->ha_rnd_end();
+    file->ha_random_sample_end();
   }
   rc= (rc == HA_ERR_END_OF_FILE && !thd->killed) ? 0 : 1;
-
  /*
    Calculate values for all statistical characteristics on columns and
    and for each field f of 'table' save them in the write_stat structure
@@ -2780,7 +2798,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
   if (!rc)
   {
     table->collected_stats->cardinality_is_null= FALSE;
-    table->collected_stats->cardinality= rows;
+    table->collected_stats->cardinality= rows / sample_fraction;
   }
 
   bitmap_clear_all(table->write_set);
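As a closing illustration of the cardinality change above (cardinality =
rows / sample_fraction), here is a small standalone simulation, separate
from the server code and with made-up numbers, showing that scaling the
accepted row count back by the sample fraction recovers the table size:

  // Toy check: accept each of N rows with probability `fraction`, then
  // estimate N as accepted / fraction, mirroring the cardinality update
  // in collect_statistics_for_table() above.
  #include <cstdint>
  #include <cstdio>
  #include <random>

  int main()
  {
    const uint64_t table_rows= 10000000;   // 10M rows, as in the benchmark
    const double fraction= 0.0138;         // roughly the fraction for that size

    std::mt19937_64 rng(42);
    std::uniform_real_distribution<double> dice(0.0, 1.0);

    uint64_t sampled= 0;
    for (uint64_t i= 0; i < table_rows; i++)
      if (dice(rng) <= fraction)           // per-row Bernoulli dice roll
        sampled++;

    std::printf("sampled rows: %llu, estimated cardinality: %.0f\n",
                (unsigned long long) sampled, sampled / fraction);
    return 0;
  }

With roughly 1.4% of the 10 million rows accepted, the estimate lands close
to 10 million even though only the accepted rows were processed by the
statistics collectors.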