[Commits] fd66f77c856: Introduce analyze_sample_percentage variable
revision-id: fd66f77c856c0e9ef25e1c677c4cc2ea0efec8c7 (mariadb-10.4.1-164-gfd66f77c856) parent(s): 88b311e62ff0f977943984fd1bf6e33421267550 author: Vicențiu Ciorbaru committer: Vicențiu Ciorbaru timestamp: 2019-02-15 01:23:00 +0200 message: Introduce analyze_sample_percentage variable The variable controls the amount of sampling analyze table performs. If ANALYZE table with histogram collection is too slow, one can reduce the time taken by setting analyze_sample_percentage to a lower value of the total number of rows. Setting it to 0 will use a formula to compute how many rows to sample: The number of rows collected is capped to a minimum of 50000 and increases logarithmically with a coffecient of 4096. The coffecient is chosen so that we expect an error of less than 3% in our estimations according to the paper: "Random Sampling for Histogram Construction: How much is enough?” – Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998. The drawback of sampling is that avg_frequency number is computed imprecisely and will yeild a smaller number than the real one. --- mysql-test/main/statistics.result | 104 +++++++++++++++++++++ mysql-test/main/statistics.test | 88 +++++++++++++++++ .../sys_vars/r/sysvars_server_embedded.result | 14 +++ .../sys_vars/r/sysvars_server_notembedded.result | 14 +++ sql/sql_class.h | 1 + sql/sql_statistics.cc | 40 ++++++-- sql/sys_vars.cc | 9 ++ 7 files changed, 260 insertions(+), 10 deletions(-) diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result index 34a17cf049c..01176b8d6cf 100644 --- a/mysql-test/main/statistics.result +++ b/mysql-test/main/statistics.result @@ -1757,3 +1757,107 @@ DROP TABLE t1; # # End of 10.2 tests # +# +# Start of 10.4 tests +# +# +# Test analyze_sample_percentage system variable. +# +set @save_use_stat_tables=@@use_stat_tables; +set @save_analyze_sample_percentage=@@analyze_sample_percentage; +set session rand_seed1=42; +set session rand_seed2=62; +set use_stat_tables=PREFERABLY; +set histogram_size=10; +CREATE TABLE t1 (id int); +INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1); +INSERT INTO t1 (id) SELECT id FROM t1; +INSERT INTO t1 SELECT id+1 FROM t1; +INSERT INTO t1 SELECT id+2 FROM t1; +INSERT INTO t1 SELECT id+4 FROM t1; +INSERT INTO t1 SELECT id+8 FROM t1; +INSERT INTO t1 SELECT id+16 FROM t1; +INSERT INTO t1 SELECT id+32 FROM t1; +INSERT INTO t1 SELECT id+64 FROM t1; +INSERT INTO t1 SELECT id+128 FROM t1; +INSERT INTO t1 SELECT id+256 FROM t1; +INSERT INTO t1 SELECT id+512 FROM t1; +INSERT INTO t1 SELECT id+1024 FROM t1; +INSERT INTO t1 SELECT id+2048 FROM t1; +INSERT INTO t1 SELECT id+4096 FROM t1; +INSERT INTO t1 SELECT id+9192 FROM t1; +# +# This query will should show a full table scan analysis. +# +ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, +DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) +t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086 +set analyze_sample_percentage=0.1; +# +# This query will show an innacurate avg_frequency value. +# +ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status Table is already up to date +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, +DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) +t1 id 111 17026 0.0000 4.0000 1.0047 0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098 +# +# This query will show a better avg_frequency value. +# +set analyze_sample_percentage=25; +ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status Table is already up to date +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, +DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) +t1 id 1 17384 0.0000 4.0000 3.5736 0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090 +set analyze_sample_percentage=0; +# +# Test self adjusting sampling level. +# +ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status Table is already up to date +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, +DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) +t1 id 1 17384 0.0000 4.0000 7.4523 0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086 +# +# Test record estimation is working properly. +# +select count(*) from t1; +count(*) +229376 +explain select * from t1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 229060 +set analyze_sample_percentage=100; +ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status Table is already up to date +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, +DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +table_name column_name min_value max_value nulls_ratio avg_length avg_frequency DECODE_HISTOGRAM(hist_type, histogram) +t1 id 1 17384 0.0000 4.0000 14.0000 0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086 +explain select * from t1; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 229376 +set use_stat_tables=@save_use_stat_tables; +drop table t1; diff --git a/mysql-test/main/statistics.test b/mysql-test/main/statistics.test index b2e544064b0..2b799c9c4b9 100644 --- a/mysql-test/main/statistics.test +++ b/mysql-test/main/statistics.test @@ -898,3 +898,91 @@ DROP TABLE t1; --echo # --echo # End of 10.2 tests --echo # + + +--echo # +--echo # Start of 10.4 tests +--echo # + +--echo # +--echo # Test analyze_sample_percentage system variable. +--echo # +set @save_use_stat_tables=@@use_stat_tables; +set @save_analyze_sample_percentage=@@analyze_sample_percentage; + +set session rand_seed1=42; +set session rand_seed2=62; + +set use_stat_tables=PREFERABLY; +set histogram_size=10; + +CREATE TABLE t1 (id int); +INSERT INTO t1 (id) VALUES (1), (1), (1), (1), (1), (1), (1); +INSERT INTO t1 (id) SELECT id FROM t1; +INSERT INTO t1 SELECT id+1 FROM t1; +INSERT INTO t1 SELECT id+2 FROM t1; +INSERT INTO t1 SELECT id+4 FROM t1; +INSERT INTO t1 SELECT id+8 FROM t1; +INSERT INTO t1 SELECT id+16 FROM t1; +INSERT INTO t1 SELECT id+32 FROM t1; +INSERT INTO t1 SELECT id+64 FROM t1; +INSERT INTO t1 SELECT id+128 FROM t1; +INSERT INTO t1 SELECT id+256 FROM t1; +INSERT INTO t1 SELECT id+512 FROM t1; +INSERT INTO t1 SELECT id+1024 FROM t1; +INSERT INTO t1 SELECT id+2048 FROM t1; +INSERT INTO t1 SELECT id+4096 FROM t1; +INSERT INTO t1 SELECT id+9192 FROM t1; + +--echo # +--echo # This query will should show a full table scan analysis. +--echo # +ANALYZE TABLE t1; +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, + DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; + +set analyze_sample_percentage=0.1; + +--echo # +--echo # This query will show an innacurate avg_frequency value. +--echo # +ANALYZE TABLE t1; +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, + DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; + +--echo # +--echo # This query will show a better avg_frequency value. +--echo # +set analyze_sample_percentage=25; +ANALYZE TABLE t1; +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, + DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; + + +set analyze_sample_percentage=0; +--echo # +--echo # Test self adjusting sampling level. +--echo # +ANALYZE TABLE t1; +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, + DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +-- echo # +-- echo # Test record estimation is working properly. +-- echo # +select count(*) from t1; +explain select * from t1; + +set analyze_sample_percentage=100; +ANALYZE TABLE t1; +select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, avg_frequency, + DECODE_HISTOGRAM(hist_type, histogram) +from mysql.column_stats; +explain select * from t1; + +set use_stat_tables=@save_use_stat_tables; + +drop table t1; diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result index 926ac8b2668..08b3f7ae938 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result @@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT READ_ONLY NO COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE +SESSION_VALUE 100.000000 +GLOBAL_VALUE 100.000000 +GLOBAL_VALUE_ORIGIN COMPILE-TIME +DEFAULT_VALUE 100.000000 +VARIABLE_SCOPE SESSION +VARIABLE_TYPE DOUBLE +VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample. +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 100 +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME AUTOCOMMIT SESSION_VALUE ON GLOBAL_VALUE ON diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index 219b550d83e..2f70f26ab71 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -40,6 +40,20 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST DEFAULT,COPY,INPLACE,NOCOPY,INSTANT READ_ONLY NO COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME ANALYZE_SAMPLE_PERCENTAGE +SESSION_VALUE 100.000000 +GLOBAL_VALUE 100.000000 +GLOBAL_VALUE_ORIGIN COMPILE-TIME +DEFAULT_VALUE 100.000000 +VARIABLE_SCOPE SESSION +VARIABLE_TYPE DOUBLE +VARIABLE_COMMENT Percentage of rows from the table ANALYZE TABLE will sample to collect table statistics. Set to 0 to let MariaDB decide what percentage of rows to sample. +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 100 +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME AUTOCOMMIT SESSION_VALUE ON GLOBAL_VALUE ON diff --git a/sql/sql_class.h b/sql/sql_class.h index 402a114aadd..a16e078edc7 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -619,6 +619,7 @@ typedef struct system_variables ulong optimizer_selectivity_sampling_limit; ulong optimizer_use_condition_selectivity; ulong use_stat_tables; + double sample_percentage; ulong histogram_size; ulong histogram_type; ulong preload_buff_size; diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index f903ce143a4..27fab974441 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table) Field *table_field; ha_rows rows= 0; handler *file=table->file; + double sample_fraction= thd->variables.sample_percentage / 100; + const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000; DBUG_ENTER("collect_statistics_for_table"); table->collected_stats->cardinality_is_null= TRUE; table->collected_stats->cardinality= 0; + if (thd->variables.sample_percentage == 0) + { + if (file->records() < MIN_THRESHOLD_FOR_SAMPLING) + { + sample_fraction= 1; + } + else + { + sample_fraction= std::fmin( + (MIN_THRESHOLD_FOR_SAMPLING + 4096 * + log(200 * file->records())) / file->records(), 1); + } + } + for (field_ptr= table->field; *field_ptr; field_ptr++) { table_field= *field_ptr; @@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table) /* Perform a full table scan to collect statistics on 'table's columns */ if (!(rc= file->ha_rnd_init(TRUE))) - { + { DEBUG_SYNC(table->in_use, "statistics_collection_start"); while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE) @@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table) if (rc) break; - for (field_ptr= table->field; *field_ptr; field_ptr++) + if (thd_rnd(thd) <= sample_fraction) { - table_field= *field_ptr; - if (!bitmap_is_set(table->read_set, table_field->field_index)) - continue; - if ((rc= table_field->collected_stats->add())) + for (field_ptr= table->field; *field_ptr; field_ptr++) + { + table_field= *field_ptr; + if (!bitmap_is_set(table->read_set, table_field->field_index)) + continue; + if ((rc= table_field->collected_stats->add())) + break; + } + if (rc) break; + rows++; } - if (rc) - break; - rows++; } file->ha_rnd_end(); } @@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table) if (!rc) { table->collected_stats->cardinality_is_null= FALSE; - table->collected_stats->cardinality= rows; + table->collected_stats->cardinality= + static_cast<ha_rows>(rows / sample_fraction); } bitmap_clear_all(table->write_set); diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 7241685fb61..99009146cb8 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -349,6 +349,15 @@ static Sys_var_long Sys_pfs_connect_attrs_size( #endif /* WITH_PERFSCHEMA_STORAGE_ENGINE */ +static Sys_var_double Sys_analyze_sample_percentage( + "analyze_sample_percentage", + "Percentage of rows from the table ANALYZE TABLE will sample " + "to collect table statistics. Set to 0 to let MariaDB decide " + "what percentage of rows to sample.", + SESSION_VAR(sample_percentage), + CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 100), + DEFAULT(100)); + static Sys_var_ulong Sys_auto_increment_increment( "auto_increment_increment", "Auto-increment columns are incremented by this",
participants (1)
-
vicentiu@mariadb.org