[Commits] 3ac32917ab6: MDEV-21130: Histograms: use JSON as on-disk format
revision-id: 3ac32917ab6c42a5a0f9ed817dd8d3c7e20ce34d (mariadb-10.6.2-68-g3ac32917ab6) parent(s): 5ddb8069145b518426be7fd31881d1d3fa5f53b4 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-07-23 01:26:50 +0300 message: MDEV-21130: Histograms: use JSON as on-disk format Preparation for handling different kinds of histograms: - In Column_statistics, change "Histogram histogram" into "Histogram *histogram_". This allows for different kinds of Histogram classes with virtual functions. - [Almost] remove the usage of Histogram->set_values and Histogram->set_size. The code outside the histogram should not make any assumptions about what/how is stored in the Histogram. - Introduce drafts of methods to read/save histograms to/from disk. --- sql/sql_statistics.cc | 214 ++++++++++++++++++++++++++++++++++++-------------- sql/sql_statistics.h | 90 ++++++++++++++------- sql/table.h | 10 ++- 3 files changed, 226 insertions(+), 88 deletions(-) diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index 56e20ecf48e..2ec9aaa5965 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -311,7 +311,7 @@ class Column_statistics_collected :public Column_statistics inline void init(THD *thd, Field * table_field); inline bool add(); - inline void finish(ha_rows rows, double sample_fraction); + inline void finish(MEM_ROOT *mem_root, ha_rows rows, double sample_fraction); inline void cleanup(); }; @@ -1068,21 +1068,22 @@ class Column_stat: public Stat_table stat_field->store(stats->get_avg_frequency()); break; case COLUMN_STAT_HIST_SIZE: - stat_field->store(stats->histogram.get_size()); + // Note: this is dumb. the histogram size is stored with the + // histogram! + stat_field->store(stats->histogram_? 
+ stats->histogram_->get_size() : 0); break; case COLUMN_STAT_HIST_TYPE: - stat_field->store(stats->histogram.get_type() + 1); + if (stats->histogram_) + stat_field->store(stats->histogram_->get_type() + 1); + else + stat_field->set_null(); break; case COLUMN_STAT_HISTOGRAM: - if (stats->histogram.get_type() == JSON) - { - stat_field->store((char *) stats->histogram.get_values(), - strlen((char *) stats->histogram.get_values()), &my_charset_bin); - } else - { - stat_field->store((char *) stats->histogram.get_values(), - stats->histogram.get_size(), &my_charset_bin); - } + if (stats->histogram_) + stats->histogram_->serialize(stat_field); + else + stat_field->set_null(); break; } } @@ -1111,6 +1112,7 @@ class Column_stat: public Stat_table void get_stat_values() { table_field->read_stats->set_all_nulls(); + table_field->read_stats->histogram_type_on_disk= INVALID_HISTOGRAM; if (table_field->read_stats->min_value) table_field->read_stats->min_value->set_null(); @@ -1122,7 +1124,7 @@ class Column_stat: public Stat_table char buff[MAX_FIELD_WIDTH]; String val(buff, sizeof(buff), &my_charset_bin); - for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_HIST_TYPE; i++) + for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_HISTOGRAM; i++) { Field *stat_field= stat_table->field[i]; @@ -1166,13 +1168,22 @@ class Column_stat: public Stat_table table_field->read_stats->set_avg_frequency(stat_field->val_real()); break; case COLUMN_STAT_HIST_SIZE: - table_field->read_stats->histogram.set_size(stat_field->val_int()); + //TODO: ignore this. The size is a part of histogram! + //table_field->read_stats->histogram.set_size(stat_field->val_int()); break; case COLUMN_STAT_HIST_TYPE: - Histogram_type hist_type= (Histogram_type) (stat_field->val_int() - - 1); - table_field->read_stats->histogram.set_type(hist_type); - break; + // TODO: save this next to histogram. 
+ // For some reason, the histogram itself is read in + // read_histograms_for_table + { + Histogram_type hist_type= (Histogram_type) (stat_field->val_int() - + 1); + table_field->read_stats->histogram_type_on_disk= hist_type; + break; + } + case COLUMN_STAT_HISTOGRAM: + //TODO: if stat_field->length() == 0 then histogram_type_on_disk is set to INVALID_HISTOGRAM + break; } } } @@ -1195,7 +1206,7 @@ class Column_stat: public Stat_table of read_stats->histogram. */ - void get_histogram_value() + Histogram * load_histogram(MEM_ROOT *mem_root) { if (find_stat()) { @@ -1205,13 +1216,54 @@ class Column_stat: public Stat_table Field *stat_field= stat_table->field[fldno]; table_field->read_stats->set_not_null(fldno); stat_field->val_str(&val); - memcpy(table_field->read_stats->histogram.get_values(), - val.ptr(), table_field->read_stats->histogram.get_size()); + // histogram-todo: here, create the histogram of appropriate type. + Histogram *hist= new (mem_root) Histogram(); + if (!hist->parse(mem_root, table_field->read_stats->histogram_type_on_disk, + (const uchar*)val.ptr(), val.length())) + { + table_field->read_stats->histogram_= hist; + return hist; + } + //memcpy(table_field->read_stats->histogram_.get_values(), + // val.ptr(), table_field->read_stats->histogram.get_size()); } + return NULL; } - }; +bool Histogram::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg) +{ + // Just copy the data + size = (uint8) size_arg; + type = type_arg; + values = (uchar*)alloc_root(mem_root, size_arg); + memcpy(values, ptr_arg, size_arg); + return false; +} + + +/* + Save the histogram data into a table field. 
+*/ +void Histogram::serialize(Field *field) +{ + if (get_type() == JSON) + { + field->store((char*)get_values(), strlen((char*)get_values()), + &my_charset_bin); + } + else + field->store((char*)get_values(), get_size(), &my_charset_bin); +} + +void Histogram::init_for_collection(MEM_ROOT *mem_root, + Histogram_type htype_arg, + ulonglong size_arg) +{ + type= htype_arg; + values = (uchar*)alloc_root(mem_root, size_arg); + size= (uint8) size_arg; +} /* An object of the class Index_stat is created to read statistical @@ -1552,7 +1604,7 @@ class Histogram_builder Column_statistics *col_stats= col->collected_stats; min_value= col_stats->min_value; max_value= col_stats->max_value; - histogram= &col_stats->histogram; + histogram= col_stats->histogram_; hist_width= histogram->get_width(); bucket_capacity= (double) records / (hist_width + 1); curr_bucket= 0; @@ -1605,7 +1657,7 @@ std::vector<std::string> bucket_bounds = {}; Column_statistics *col_stats= col->collected_stats; min_value= col_stats->min_value; max_value= col_stats->max_value; - histogram= &col_stats->histogram; + histogram= col_stats->histogram_; hist_width= histogram->get_width(); bucket_capacity= (double) records / (hist_width + 1); curr_bucket= 0; @@ -1765,9 +1817,9 @@ class Count_distinct_field: public Sql_alloc @brief Calculate a histogram of the tree */ - void walk_tree_with_histogram(ha_rows rows) + void walk_tree_with_histogram(ha_rows rows) { - if(table_field->collected_stats->histogram.get_type() == JSON) + if (table_field->collected_stats->histogram_->get_type() == JSON) { Histogram_builder_json hist_builder(table_field, tree_key_length, rows); tree->walk(table_field->table, json_histogram_build_walk, @@ -1775,7 +1827,8 @@ class Count_distinct_field: public Sql_alloc hist_builder.build(); distincts= hist_builder.get_count_distinct(); distincts_single_occurence= hist_builder.get_count_single_occurence(); - } else + } + else { Histogram_builder hist_builder(table_field, tree_key_length, rows); 
tree->walk(table_field->table, histogram_build_walk, @@ -1799,18 +1852,19 @@ class Count_distinct_field: public Sql_alloc @brief Get the size of the histogram in bytes built for table_field */ + /* uint get_hist_size() { return table_field->collected_stats->histogram.get_size(); - } + }*/ /* @brief Get the pointer to the histogram built for table_field */ - uchar *get_histogram() + Histogram *get_histogram() { - return table_field->collected_stats->histogram.get_values(); + return table_field->collected_stats->histogram_; } }; @@ -2209,7 +2263,7 @@ int alloc_statistics_for_table(THD* thd, TABLE *table) uint key_parts= table->s->ext_key_parts; ulonglong *idx_avg_frequency= (ulonglong*) alloc_root(&table->mem_root, sizeof(ulonglong) * key_parts); - +/* uint hist_size= thd->variables.histogram_size; Histogram_type hist_type= (Histogram_type) (thd->variables.histogram_type); uchar *histogram= NULL; @@ -2220,16 +2274,16 @@ int alloc_statistics_for_table(THD* thd, TABLE *table) bzero(histogram, hist_size * columns); } - - if (!table_stats || !column_stats || !index_stats || !idx_avg_frequency || - (hist_size && !histogram)) +*/ + if (!table_stats || !column_stats || !index_stats || !idx_avg_frequency) + //|| (hist_size && !histogram)) DBUG_RETURN(1); table->collected_stats= table_stats; table_stats->column_stats= column_stats; table_stats->index_stats= index_stats; table_stats->idx_avg_frequency= idx_avg_frequency; - table_stats->histograms= histogram; + //table_stats->histograms= histogram; memset(column_stats, 0, sizeof(Column_statistics) * columns); @@ -2237,10 +2291,12 @@ int alloc_statistics_for_table(THD* thd, TABLE *table) { if (bitmap_is_set(table->read_set, (*field_ptr)->field_index)) { + column_stats->histogram_ = NULL; + /* column_stats->histogram.set_size(hist_size); column_stats->histogram.set_type(hist_type); column_stats->histogram.set_values(histogram); - histogram+= hist_size; + histogram+= hist_size;*/ (*field_ptr)->collected_stats= column_stats++; } } 
@@ -2459,6 +2515,25 @@ bool Column_statistics_collected::add() } +/* + Create an empty Histogram object from histogram_type. + + Note: it is not yet clear whether collection-time histogram should be the same + as lookup-time histogram. At the moment, they are. +*/ + +Histogram* get_histogram_by_type(MEM_ROOT *mem_root, Histogram_type hist_type) { + switch (hist_type) { + case SINGLE_PREC_HB: + case DOUBLE_PREC_HB: + case JSON: + return new Histogram(); + default: + DBUG_ASSERT(0); + } + return NULL; +}; + /** @brief Get the results of aggregation when collecting the statistics on a column @@ -2468,7 +2543,7 @@ bool Column_statistics_collected::add() */ inline -void Column_statistics_collected::finish(ha_rows rows, double sample_fraction) +void Column_statistics_collected::finish(MEM_ROOT *mem_root, ha_rows rows, double sample_fraction) { double val; @@ -2486,10 +2561,19 @@ void Column_statistics_collected::finish(ha_rows rows, double sample_fraction) } if (count_distinct) { - uint hist_size= count_distinct->get_hist_size(); + //uint hist_size= count_distinct->get_hist_size(); + uint hist_size= current_thd->variables.histogram_size; + Histogram_type hist_type= (Histogram_type) (current_thd->variables.histogram_type); + bool have_histogram= false; + if (hist_size != 0 && hist_type != INVALID_HISTOGRAM) + { + have_histogram= true; + histogram_= new Histogram; + histogram_->init_for_collection(mem_root, hist_type, hist_size); + } /* Compute cardinality statistics and optionally histogram. */ - if (hist_size == 0) + if (!have_histogram) count_distinct->walk_tree(); else count_distinct->walk_tree_with_histogram(rows - nulls); @@ -2527,13 +2611,14 @@ void Column_statistics_collected::finish(ha_rows rows, double sample_fraction) set_not_null(COLUMN_STAT_AVG_FREQUENCY); } else - hist_size= 0; - histogram.set_size(hist_size); + have_histogram= false ; // TODO: need this? 
+ //histogram.set_size(hist_size); set_not_null(COLUMN_STAT_HIST_SIZE); - if (hist_size && distincts) + if (have_histogram && distincts) { set_not_null(COLUMN_STAT_HIST_TYPE); - histogram.set_values(count_distinct->get_histogram()); + //histogram.set_values(count_distinct->get_histogram()); + histogram_= count_distinct->get_histogram(); set_not_null(COLUMN_STAT_HISTOGRAM); } delete count_distinct; @@ -2795,7 +2880,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table) continue; bitmap_set_bit(table->write_set, table_field->field_index); if (!rc) - table_field->collected_stats->finish(rows, sample_fraction); + table_field->collected_stats->finish(&table->mem_root, rows, sample_fraction); else table_field->collected_stats->cleanup(); } @@ -3001,16 +3086,19 @@ int read_statistics_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables) /* Read statistics from the statistical table column_stats */ stat_table= stat_tables[COLUMN_STAT].table; - ulong total_hist_size= 0; + //ulong total_hist_size= 0; + bool have_histograms= false; Column_stat column_stat(stat_table, table); for (field_ptr= table_share->field; *field_ptr; field_ptr++) { table_field= *field_ptr; column_stat.set_key_fields(table_field); column_stat.get_stat_values(); - total_hist_size+= table_field->read_stats->histogram.get_size(); + //total_hist_size+= table_field->read_stats->histogram.get_size(); + if (table_field->read_stats->histogram_type_on_disk != INVALID_HISTOGRAM) + have_histograms= true; } - table_share->stats_cb.total_hist_size= total_hist_size; + table_share->stats_cb.total_hist_size= have_histograms? 
1:0; // total_hist_size /* Read statistics from the statistical table index_stats */ stat_table= stat_tables[INDEX_STAT].table; @@ -3147,28 +3235,36 @@ int read_histograms_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables) { TABLE_STATISTICS_CB *stats_cb= &table->s->stats_cb; DBUG_ENTER("read_histograms_for_table"); - + + // histograms-todo: why do we use synchronization here, when we load + // histogram for the TABLE object, not TABLE_SHARE? + // is it because of the use of stats_cb->mem_root? if (stats_cb->start_histograms_load()) { - uchar *histogram= (uchar *) alloc_root(&stats_cb->mem_root, - stats_cb->total_hist_size); + //uchar *histogram= (uchar *) alloc_root(&stats_cb->mem_root, + // stats_cb->total_hist_size); + /* if (!histogram) { stats_cb->abort_histograms_load(); DBUG_RETURN(1); } - memset(histogram, 0, stats_cb->total_hist_size); + */ + //memset(histogram, 0, stats_cb->total_hist_size); Column_stat column_stat(stat_tables[COLUMN_STAT].table, table); for (Field **field_ptr= table->s->field; *field_ptr; field_ptr++) { Field *table_field= *field_ptr; - if (uint hist_size= table_field->read_stats->histogram.get_size()) + //if (uint hist_size= table_field->read_stats->histogram.get_size()) + if (table_field->read_stats->histogram_type_on_disk != INVALID_HISTOGRAM) { column_stat.set_key_fields(table_field); - table_field->read_stats->histogram.set_values(histogram); - column_stat.get_histogram_value(); - histogram+= hist_size; + //table_field->read_stats->histogram.set_values(histogram); + + table_field->read_stats->histogram_= + column_stat.load_histogram(&stats_cb->mem_root); + //histogram+= hist_size; } } stats_cb->end_histograms_load(); @@ -3860,8 +3956,8 @@ double get_column_range_cardinality(Field *field, if (avg_frequency > 1.0 + 0.000001 && col_stats->min_max_values_are_provided()) { - Histogram *hist= &col_stats->histogram; - if (hist->is_usable(thd)) + Histogram *hist= col_stats->histogram_; + if (hist && hist->is_usable(thd)) { 
store_key_image_to_rec(field, (uchar *) min_endp->key, field->key_length()); @@ -3904,8 +4000,8 @@ double get_column_range_cardinality(Field *field, else max_mp_pos= 1.0; - Histogram *hist= &col_stats->histogram; - if (hist->is_usable(thd)) + Histogram *hist= col_stats->histogram_; + if (hist && hist->is_usable(thd)) sel= hist->range_selectivity(min_mp_pos, max_mp_pos); else sel= (max_mp_pos - min_mp_pos); diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index a554721d50b..178bc11a278 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -43,7 +43,8 @@ enum enum_histogram_type { SINGLE_PREC_HB, DOUBLE_PREC_HB, - JSON + JSON, + INVALID_HISTOGRAM } Histogram_type; enum enum_stat_tables @@ -141,40 +142,70 @@ double get_column_range_cardinality(Field *field, bool is_stat_table(const LEX_CSTRING *db, LEX_CSTRING *table); bool is_eits_usable(Field* field); -class Histogram +/* + Common base for all histograms +*/ +class Histogram_base : public Sql_alloc { +public: + virtual bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, + const uchar *ptr, uint size)= 0; + virtual void serialize(Field *to_field)= 0; -private: - Histogram_type type; - uint8 size; /* Size of values array, in bytes */ - uchar *values; + virtual Histogram_type get_type()=0; + + // Legacy: return the size of the histogram on disk. + // This will be stored in mysql.column_stats.hist_size column. + // Newer, JSON-based histograms may return 0. 
+ virtual uint get_size()=0; - uint prec_factor() + virtual ~Histogram_base(){} +}; + +class Histogram : public Histogram_base +{ +public: + bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, + const uchar *ptr_arg, uint size_arg) override; + void serialize(Field *to_field) override; + Histogram_type get_type() override { return type; } + + uint get_size() override { return (uint) size; } + + // returns number of buckets in the histogram + uint get_width() { switch (type) { case SINGLE_PREC_HB: case JSON: - return ((uint) (1 << 8) - 1); + return size; case DOUBLE_PREC_HB: - return ((uint) (1 << 16) - 1); + return size / 2; + default: + DBUG_ASSERT(0); } - return 1; + return 0; } -public: - uint get_width() +private: + Histogram_type type; + uint8 size; /* Size of values array, in bytes */ + uchar *values; + + uint prec_factor() { switch (type) { case SINGLE_PREC_HB: case JSON: - return size; + return ((uint) (1 << 8) - 1); case DOUBLE_PREC_HB: - return size / 2; + return ((uint) (1 << 16) - 1); + default: + DBUG_ASSERT(0); } - return 0; + return 1; } -private: uint get_value(uint i) { DBUG_ASSERT(i < get_width()); @@ -184,6 +215,8 @@ class Histogram return (uint) (((uint8 *) values)[i]); case DOUBLE_PREC_HB: return (uint) uint2korr(values + i * 2); + default: + DBUG_ASSERT(0); } return 0; } @@ -227,19 +260,13 @@ class Histogram return i; } -public: - - uint get_size() { return (uint) size; } - - Histogram_type get_type() { return type; } - uchar *get_values() { return (uchar *) values; } +public: + void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size); - void set_size (ulonglong sz) { size= (uint8) sz; } - - void set_type (Histogram_type t) { type= t; } - + // Note: these two are used only for saving the JSON text: void set_values (uchar *vals) { values= (uchar *) vals; } + void set_size (ulonglong sz) { size= (uint8) sz; } bool is_available() { return get_size() > 0 && get_values(); } @@ -264,6 +291,9 @@ class Histogram case 
DOUBLE_PREC_HB: int2store(values + i * 2, val * prec_factor()); return; + default: + DBUG_ASSERT(0); + return; } } @@ -277,6 +307,9 @@ class Histogram case DOUBLE_PREC_HB: int2store(values + i * 2, uint2korr(values + i * 2 - 2)); return; + default: + DBUG_ASSERT(0); + return; } } @@ -314,7 +347,7 @@ class Table_statistics /* Array of records per key for index prefixes */ ulonglong *idx_avg_frequency; - uchar *histograms; /* Sequence of histograms */ + //uchar *histograms; /* Sequence of histograms */ }; @@ -377,7 +410,8 @@ class Column_statistics public: - Histogram histogram; + Histogram_type histogram_type_on_disk; + Histogram *histogram_; uint32 no_values_provided_bitmap() { diff --git a/sql/table.h b/sql/table.h index 2e074abcea0..f557f4ca59e 100644 --- a/sql/table.h +++ b/sql/table.h @@ -679,7 +679,15 @@ class TABLE_STATISTICS_CB public: MEM_ROOT mem_root; /* MEM_ROOT to allocate statistical data for the table */ Table_statistics *table_stats; /* Structure to access the statistical data */ - ulong total_hist_size; /* Total size of all histograms */ + + /* + Total size of all histograms. A value of 0 means histograms are not present, + and histograms_are_ready() can finish sooner. + + Currently we just set it to 1 when we expect to load histograms. + histogram-todo: rename this or even remove? + */ + ulong total_hist_size; bool histograms_are_ready() const {
participants (1)
-
Sergei Petrunia