[Commits] 034f232e182: Move JSON histograms code into its own files

4 Sep 2021

revision-id: 034f232e182a172847d534554cb8ed56b56c8222 (mariadb-10.6.1-114-g034f232e182)
parent(s): b7b3b6b44a5048ae26599131a7608f9f6579eb87
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2021-09-04 17:11:16 +0300
message:

Move JSON histograms code into its own files

---
 sql/CMakeLists.txt        |   1 +
 sql/opt_histogram_json.cc | 391 +++++++++++++++++++++++++++++++++++++++++
 sql/opt_histogram_json.h  |  95 ++++++++++
 sql/sql_statistics.cc     | 435 +---------------------------------------------
 sql/sql_statistics.h      | 127 +++++++-------
 5 files changed, 559 insertions(+), 490 deletions(-)

diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 906c9d30bc9..ff05a8b2eae 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -147,6 +147,7 @@ SET (SQL_SOURCE
                sql_analyze_stmt.cc
                sql_join_cache.cc
                create_options.cc multi_range_read.cc
+               opt_histogram_json.cc
                opt_index_cond_pushdown.cc opt_subselect.cc
                opt_table_elimination.cc sql_expression_cache.cc
                gcalc_slicescan.cc gcalc_tools.cc
diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc
new file mode 100644
index 00000000000..196ee6f2737
--- /dev/null
+++ b/sql/opt_histogram_json.cc
@@ -0,0 +1,391 @@
+/*
+   Copyright (c) 2021, MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include "mariadb.h"
+#include "sql_base.h"
+#include "my_json_writer.h"
+#include "sql_statistics.h"
+#include "opt_histogram_json.h"
+/* Collects the bucket bounds of a JSON_HB histogram from a value stream. */
+class Histogram_json_builder : public Histogram_builder
+{
+  Histogram_json_hb *histogram;
+  uint hist_width;         /* the number of buckets in the histogram       */
+  double bucket_capacity;  /* number of rows in a bucket of the histogram  */
+  uint curr_bucket;        /* number of the current bucket to be built     */
+  /* Bucket bounds collected so far, in their textual (val_str) form */
+  std::vector<std::string> bucket_bounds;
+  bool first_value= true;  /* no value has been passed to next() yet */
+public:
+
+  Histogram_json_builder(Histogram_json_hb *hist, Field *col, uint col_len,
+                         ha_rows rows)
+    : Histogram_builder(col, col_len, rows), histogram(hist)
+  {
+    bucket_capacity= (double)records / histogram->get_width();
+    hist_width= histogram->get_width();
+    curr_bucket= 0;
+  }
+
+  ~Histogram_json_builder() override = default;
+
+  /*
+    @brief
+      Add data to the histogram. This call adds elem_cnt rows, each
+      of which has value of *elem.
+    @return 0 (always)
+    @detail
+      Subsequent next() calls will add values that are greater than *elem.
+  */
+  int next(void *elem, element_count elem_cnt) override
+  {
+    counters.next(elem, elem_cnt);
+    ulonglong count= counters.get_count();
+    // Once every bucket is closed only the counters above are maintained.
+    if (curr_bucket == hist_width)
+      return 0;
+    if (first_value)  // the first value opens bucket #0
+    {
+      first_value= false;
+      column->store_field_value((uchar*) elem, col_length);
+      StringBuffer<MAX_FIELD_WIDTH> val;
+      column->val_str(&val);
+      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
+    }
+    // The running row count went past the end of the current bucket.
+    if (count > bucket_capacity * (curr_bucket + 1))
+    {
+      column->store_field_value((uchar*) elem, col_length);
+      StringBuffer<MAX_FIELD_WIDTH> val;
+      column->val_str(&val);
+      bucket_bounds.emplace_back(val.ptr(), val.length());
+      // Repeat the bound for every further bucket this value fills up.
+      curr_bucket++;
+      while (curr_bucket != hist_width &&
+             count > bucket_capacity * (curr_bucket + 1))
+      {
+        bucket_bounds.push_back(std::string(val.ptr(), val.length()));
+        curr_bucket++;
+      }
+    }
+    // All rows seen but one bound is still missing: add the closing bound.
+    if (records == count && bucket_bounds.size() == hist_width)
+    {
+      column->store_field_value((uchar*) elem, col_length);
+      StringBuffer<MAX_FIELD_WIDTH> val;
+      column->val_str(&val);
+      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
+    }
+    return 0;
+  }
+
+  /*
+    @brief  Finalize the creation of histogram: write the collected bounds
+            as a JSON object and hand its text over to the histogram object
+  */
+  void finalize() override
+  {
+    Json_writer writer;
+    writer.start_object();
+    writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
+    // c_str() on purpose: set_json_text() measures the text with strlen().
+    for(auto& value: bucket_bounds) {
+      writer.add_str(value.c_str());
+    }
+    writer.end_array();
+    writer.end_object();
+    Binary_string *json_string= (Binary_string *) writer.output.get_string();
+    // No bounds (next() never called) means 0 buckets; size()-1 would wrap.
+    histogram->set_json_text(bucket_bounds.empty()? 0: bucket_bounds.size()-1,
+                             (uchar *) json_string->c_ptr());
+  }
+};
+
+/* Allocate (with new) a builder that collects data for this histogram. */
+Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
+                                                     ha_rows rows)
+{
+  return new Histogram_json_builder(this, col, col_len, rows);
+}
+
+/* Collection-time setup: just record the requested number of buckets. */
+void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
+                                            Histogram_type htype_arg,
+                                            ulonglong size_arg)
+{
+  DBUG_ASSERT(htype_arg == JSON_HB);
+  size= (size_t) size_arg;  // size is a size_t: do not narrow to 8 bits
+}
+
+
+/*
+  @brief
+    Parse the histogram from its on-disk representation, which must be
+    {"histogram_hb_v1": [bound, ...]} holding at least two scalar bounds.
+  @return
+     false  OK
+     true   Error (already reported with my_error())
+*/
+
+bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
+                              Histogram_type type_arg, const char *hist_data,
+                              size_t hist_data_len)
+{
+  DBUG_ENTER("Histogram_json_hb::parse");
+  DBUG_ASSERT(type_arg == JSON_HB);
+  const char *err;
+  json_engine_t je;
+  json_string_t key_name;
+
+  json_scan_start(&je, &my_charset_utf8mb4_bin,
+                  (const uchar*)hist_data,
+                  (const uchar*)hist_data+hist_data_len);
+
+  if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
+  {
+    err= "Root JSON element must be a JSON object";
+    goto error;
+  }
+
+  json_string_set_str(&key_name, (const uchar*)JSON_NAME,
+                      (const uchar*)JSON_NAME + strlen(JSON_NAME));
+  json_string_set_cs(&key_name, system_charset_info);
+
+  if (json_scan_next(&je) || je.state != JST_KEY ||
+      !json_key_matches(&je, &key_name))
+  {
+    err= "The first key in the object must be histogram_hb_v1";
+    goto error;
+  }
+
+  // The value must be a JSON array
+  if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
+  {
+    err= "A JSON array expected";
+    goto error;
+  }
+
+  // Read the array
+  while (!json_scan_next(&je))
+  {
+    if (je.state == JST_ARRAY_END)
+      break;     // stop here: later scalars in the object are not bounds
+    if (je.state != JST_VALUE)
+      continue;
+    const char *val;
+    int val_len;
+    json_smart_read_value(&je, &val, &val_len);
+    if (je.value_type != JSON_VALUE_STRING &&
+        je.value_type != JSON_VALUE_NUMBER &&
+        je.value_type != JSON_VALUE_TRUE &&
+        je.value_type != JSON_VALUE_FALSE)
+    {
+      err= "Scalar value expected";
+      goto error;
+    }
+    uchar buf[MAX_KEY_LENGTH];
+    uint len_to_copy= field->key_length();
+    field->store_text(val, val_len, &my_charset_bin);
+    uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
+    histogram_bounds.push_back(std::string((char*)buf, bytes));
+    // TODO: Should we also compare this endpoint with the previous
+    // to verify that the ordering is right?
+  }
+  if (histogram_bounds.size() < 2)  // a bucket needs a left and a right bound
+  {
+    err= "Histogram must have at least one bucket";
+    goto error;
+  }
+  // n_buckets = n_bounds - 1 :
+  size= histogram_bounds.size()-1;
+  DBUG_RETURN(false);
+
+error:
+  my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
+           je.s.c_str - (const uchar*)hist_data);
+  DBUG_RETURN(true);
+}
+
+/* Store the key image at ptr into field; does no NULL-flag handling itself. */
+static
+void store_key_image_to_rec_no_null(Field *field, const uchar *ptr)
+{
+  // NOTE(review): presumably lifts write_set checks in debug builds - confirm
+  MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
+                                    &field->table->write_set);
+  field->set_key_image(ptr, field->key_length());
+  dbug_tmp_restore_column_map(&field->table->write_set, old_map);
+}
+
+/* Where key lies between bounds left and right, per pos_in_interval_*(). */
+static
+double position_in_interval(Field *field, const  uchar *key,
+                            const std::string& left, const std::string& right)
+{
+  double res;
+  if (field->pos_through_val_str())
+  {
+    uint32 min_len= uint2korr(left.data());   // 2-byte length prefix
+    uint32 max_len= uint2korr(right.data());  // 2-byte length prefix
+    uint32 midp_len= uint2korr(key);          // 2-byte length prefix
+
+    res= pos_in_interval_for_string(field->charset(),
+           key + HA_KEY_BLOB_LENGTH,
+           midp_len,
+           (const uchar*)left.data() + HA_KEY_BLOB_LENGTH,
+           min_len,
+           (const uchar*)right.data() + HA_KEY_BLOB_LENGTH,
+           max_len);
+  }
+  else
+  {
+    // Load each image into the field in turn and read it back as a double.
+    store_key_image_to_rec_no_null(field, (const uchar*)left.data());
+    double min_val_real= field->val_real();
+    // (every store here overwrites whatever value the field held before)
+    store_key_image_to_rec_no_null(field, (const uchar*)right.data());
+    double max_val_real= field->val_real();
+
+    store_key_image_to_rec_no_null(field, key);
+    double midp_val_real= field->val_real();
+
+    res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
+  }
+  return res;
+}
+
+/* Estimate the selectivity of a single lookup value given in endpoint->key. */
+double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
+                                            double avg_sel)
+{
+  double sel;
+  store_key_image_to_rec(field, (uchar *) endpoint->key,
+                         field->key_length());
+  const uchar *min_key = endpoint->key;
+  if (field->real_maybe_null())
+    min_key++;                  // step over the key's leading NULL byte
+  uint min_idx= find_bucket(field, min_key, false);
+  // Same lookup, but a bound equal to the value now counts as "less".
+  uint max_idx= find_bucket(field, min_key, true);
+#if 0
+  // find how many buckets this value occupies
+  while ((max_idx + 1 < get_width() ) &&
+         (field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
+    max_idx++;
+  }
+#endif
+  if (max_idx > min_idx)
+  {
+    // value spans multiple buckets. NOTE(review): "+ 1" here, none below?
+    double bucket_sel= 1.0/(get_width() + 1);
+    sel= bucket_sel * (max_idx - min_idx + 1);
+  }
+  else
+  {
+    // the value fits within a single bucket
+    sel = MY_MIN(avg_sel, 1.0/get_width());
+  }
+  return sel;
+}
+
+
+/*
+  @brief  Estimate the fraction of rows that fall between the two endpoints.
+  @param field    The table field histogram is for.  We don't care about the
+                  field's current value, we only need its virtual functions to
+                  perform various operations
+  @param min_endp Left endpoint, or NULL if there is none
+  @param max_endp Right endpoint, or NULL if there is none
+*/
+
+double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
+                                            key_range *max_endp)
+{
+  // Fewer than two bounds means there is no bucket to look into; report
+  // "no filtering" rather than indexing past the end of histogram_bounds.
+  if (histogram_bounds.size() < 2)
+    return 1.0;
+  double min= 0.0, max= 1.0;  // values used when an endpoint is absent
+  // n bounds delimit n-1 equi-height buckets, each holding 1/(n-1) of rows.
+  double width= 1.0 / (histogram_bounds.size() - 1);
+
+  if (min_endp && !(field->null_ptr && min_endp->key[0]))
+  {
+    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
+    const uchar *min_key= min_endp->key;
+    if (field->real_maybe_null())
+      min_key++;
+
+    // Find the leftmost bucket that contains the lookup value.
+    // (If the lookup value is to the left of all buckets, find bucket #0)
+    int idx= find_bucket(field, min_key, exclusive_endp);
+    double min_sel= position_in_interval(field, (const uchar*)min_key,
+                                         histogram_bounds[idx],
+                                         histogram_bounds[idx+1]);
+    min= idx*width + min_sel*width;
+  }
+
+  if (max_endp)
+  {
+    // The right endpoint cannot be NULL
+    DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
+    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
+    const uchar *max_key= max_endp->key;
+    if (field->real_maybe_null())
+      max_key++;
+
+    int idx= find_bucket(field, max_key, inclusive_endp);
+    double max_sel= position_in_interval(field, (const uchar*)max_key,
+                                         histogram_bounds[idx],
+                                         histogram_bounds[idx+1]);
+    max= idx*width + max_sel*width;
+  }
+
+  return max - min;
+}
+
+/* Store the collected JSON text (json_text) into the given field. */
+void Histogram_json_hb::serialize(Field *field)
+{
+  field->store(json_text.data(), json_text.size(), &my_charset_bin);
+}
+
+
+/*
+  Find the histogram bucket that contains the value (binary search over
+  histogram_bounds). Returns the index of the bucket's left bound.
+  @param equal_is_less Controls what to do if a histogram bound is equal to the
+                       lookup_val.
+*/
+
+int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
+                                   bool equal_is_less)
+{
+  int low= 0;
+  int high= histogram_bounds.size() - 1;
+  int middle;
+  // Narrow [low, high] until the two are adjacent; low is then the answer.
+  while (low + 1 < high)
+  {
+    middle= (low + high) / 2;
+    int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
+    if (!res)
+      res= equal_is_less? -1: 1;  // tie: side chosen by the caller's flag
+    if (res < 0)
+      low= middle;
+    else //res > 0
+      high= middle;
+  }
+
+  return low;
+}
diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h
new file mode 100644
index 00000000000..c5b31c273ad
--- /dev/null
+++ b/sql/opt_histogram_json.h
@@ -0,0 +1,95 @@
+/*
+   Copyright (c) 2021, MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include "sql_statistics.h"
+
+/*
+  An equi-height histogram which stores real values for bucket bounds.
+
+  Handles @@histogram_type=JSON_HB
+*/
+
+class Histogram_json_hb : public Histogram_base
+{
+  size_t size= 0; /* Number of buckets in the histogram */
+
+  /* Collection-time only: collected histogram in the JSON form. */
+  std::string json_text;
+
+  // Array of histogram bucket endpoints in KeyTupleFormat.
+  std::vector<std::string> histogram_bounds;
+
+public:
+  static constexpr const char* JSON_NAME="histogram_hb_v1";
+  /* Fill histogram_bounds from the stored JSON text; returns true on error */
+  bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
+             const char *hist_data, size_t hist_data_len) override;
+  /* Store json_text into the given field */
+  void serialize(Field *field) override;
+
+  Histogram_builder *create_builder(Field *col, uint col_len,
+                                    ha_rows rows) override;
+
+  // returns number of buckets in the histogram
+  uint get_width() override
+  {
+    return (uint)size;
+  }
+
+  Histogram_type get_type() override
+  {
+    return JSON_HB;
+  }
+
+  /*
+    @brief
+      Legacy: this returns the size of the histogram on disk.
+
+    @detail
+      This is only called at collection time when json_text is non-empty.
+  */
+  uint get_size() override
+  {
+    return json_text.size();
+  }
+
+  void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
+                           ulonglong size) override;
+
+  bool is_available() override {return true; }
+
+  bool is_usable(THD *thd) override
+  {
+    return thd->variables.optimizer_use_condition_selectivity > 3 &&
+           is_available();
+  }
+
+  double point_selectivity(Field *field, key_range *endpoint,
+                           double avg_selection) override;
+  double range_selectivity(Field *field, key_range *min_endp,
+                           key_range *max_endp) override;
+  /* Collection-time: record the bucket count and the NUL-terminated JSON */
+  void set_json_text(ulonglong sz, uchar *json_text_arg)
+  {
+    size= (size_t) sz;  // size is a size_t: do not narrow to 8 bits
+    json_text.assign((const char*)json_text_arg,
+                     strlen((const char*)json_text_arg));
+  }
+
+private:
+  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
+};
+
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index e6988150304..dd79cc16d59 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -28,11 +28,11 @@
 #include "sql_base.h"
 #include "key.h"
 #include "sql_statistics.h"
+#include "opt_histogram_json.h"
 #include "opt_range.h"
 #include "uniques.h"
 #include "sql_show.h"
 #include "sql_partition.h"
-#include "my_json_writer.h"
 
 #include <vector>
 #include <string>
@@ -1267,8 +1267,8 @@ void Histogram_binary::serialize(Field *field)
 }
 
 void Histogram_binary::init_for_collection(MEM_ROOT *mem_root,
-                                    Histogram_type htype_arg, 
-                                    ulonglong size_arg)
+                                           Histogram_type htype_arg,
+                                           ulonglong size_arg)
 {
   type= htype_arg;
   values = (uchar*)alloc_root(mem_root, size_arg);
@@ -1276,273 +1276,6 @@ void Histogram_binary::init_for_collection(MEM_ROOT *mem_root,
 }
 
 
-void Histogram_json_hb::init_for_collection(MEM_ROOT *mem_root,
-                                            Histogram_type htype_arg,
-                                            ulonglong size_arg)
-{
-  DBUG_ASSERT(htype_arg == JSON_HB);
-  size= (uint8) size_arg;
-}
-
-
-/*
-  @brief
-    Parse the histogram from its on-disk representation
-
-  @return
-     false  OK
-     True   Error
-*/
-
-bool Histogram_json_hb::parse(MEM_ROOT *mem_root, Field *field,
-                              Histogram_type type_arg, const char *hist_data,
-                              size_t hist_data_len)
-{
-  DBUG_ENTER("Histogram_json_hb::parse");
-  DBUG_ASSERT(type_arg == JSON_HB);
-  const char *err;
-  json_engine_t je;
-  json_string_t key_name;
-
-  json_scan_start(&je, &my_charset_utf8mb4_bin,
-                  (const uchar*)hist_data,
-                  (const uchar*)hist_data+hist_data_len);
-
-  if (json_read_value(&je) || je.value_type != JSON_VALUE_OBJECT)
-  {
-    err= "Root JSON element must be a JSON object";
-    goto error;
-  }
-
-  json_string_set_str(&key_name, (const uchar*)JSON_NAME,
-                      (const uchar*)JSON_NAME + strlen(JSON_NAME));
-  json_string_set_cs(&key_name, system_charset_info);
-
-  if (json_scan_next(&je) || je.state != JST_KEY ||
-      !json_key_matches(&je, &key_name))
-  {
-    err= "The first key in the object must be histogram_hb_v1";
-    goto error;
-  }
-
-  // The value must be a JSON array
-  if (json_read_value(&je) || (je.value_type != JSON_VALUE_ARRAY))
-  {
-    err= "A JSON array expected";
-    goto error;
-  }
-
-  // Read the array
-  while (!json_scan_next(&je))
-  {
-    switch(je.state)
-    {
-      case JST_VALUE:
-      {
-        const char *val;
-        int val_len;
-        json_smart_read_value(&je, &val, &val_len);
-        if (je.value_type != JSON_VALUE_STRING &&
-            je.value_type != JSON_VALUE_NUMBER &&
-            je.value_type != JSON_VALUE_TRUE &&
-            je.value_type != JSON_VALUE_FALSE)
-        {
-          err= "Scalar value expected";
-          goto error;
-        }
-        uchar buf[MAX_KEY_LENGTH];
-        uint len_to_copy= field->key_length();
-        field->store_text(val, val_len, &my_charset_bin);
-        uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
-        histogram_bounds.push_back(std::string((char*)buf, bytes));
-        // TODO: Should we also compare this endpoint with the previous
-        // to verify that the ordering is right?
-        break;
-      }
-      case JST_ARRAY_END:
-        break;
-    }
-  }
-  // n_buckets = n_bounds - 1 :
-  size= histogram_bounds.size()-1;
-  DBUG_RETURN(false);
-
-error:
-  my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), err,
-           je.s.c_str - (const uchar*)hist_data);
-  DBUG_RETURN(true);
-}
-
-
-double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,
-                                            double avg_sel)
-{
-  double sel;
-  store_key_image_to_rec(field, (uchar *) endpoint->key,
-                         field->key_length());
-  const uchar *min_key = endpoint->key;
-  if (field->real_maybe_null())
-    min_key++;
-  uint min_idx= find_bucket(field, min_key, false);
-
-  uint max_idx= find_bucket(field, min_key, true);
-#if 0
-  // find how many buckets this value occupies
-  while ((max_idx + 1 < get_width() ) &&
-         (field->key_cmp((uchar *)histogram_bounds[max_idx + 1].data(), min_key) == 0)) {
-    max_idx++;
-  }
-#endif
-  if (max_idx > min_idx)
-  {
-    // value spans multiple buckets
-    double bucket_sel= 1.0/(get_width() + 1);
-    sel= bucket_sel * (max_idx - min_idx + 1);
-  }
-  else
-  {
-    // the value fits within a single bucket
-    sel = MY_MIN(avg_sel, 1.0/get_width());
-  }
-  return sel;
-}
-
-
-static
-void store_key_image_to_rec_no_null(Field *field, const uchar *ptr)
-{
-  MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
-                                    &field->table->write_set);
-  field->set_key_image(ptr, field->key_length());
-  dbug_tmp_restore_column_map(&field->table->write_set, old_map);
-}
-
-
-static
-double position_in_interval(Field *field, const  uchar *key,
-                            const std::string& left, const std::string& right)
-{
-  double res;
-  if (field->pos_through_val_str())
-  {
-    uint32 min_len= uint2korr(left.data());
-    uint32 max_len= uint2korr(right.data());
-    uint32 midp_len= uint2korr(key);
-
-    res= pos_in_interval_for_string(field->charset(),
-           key + HA_KEY_BLOB_LENGTH,
-           midp_len,
-           (const uchar*)left.data() + HA_KEY_BLOB_LENGTH,
-           min_len,
-           (const uchar*)right.data() + HA_KEY_BLOB_LENGTH,
-           max_len);
-  }
-  else
-  {
-    store_key_image_to_rec_no_null(field, (const uchar*)left.data());
-    double min_val_real= field->val_real();
-    
-    store_key_image_to_rec_no_null(field, (const uchar*)right.data());
-    double max_val_real= field->val_real();
-
-    store_key_image_to_rec_no_null(field, key);
-    double midp_val_real= field->val_real();
-
-    res= pos_in_interval_for_double(midp_val_real, min_val_real, max_val_real);
-  }
-  return res;
-}
-
-/*
-  @param field    The table field histogram is for.  We don't care about the
-                  field's current value, we only need its virtual functions to
-                  perform various operations
-
-  @param min_endp Left endpoint, or NULL if there is none
-  @param max_endp Right endpoint, or NULL if there is none
-*/
-double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
-                                            key_range *max_endp)
-{
-  double min, max;
-  double width= 1.0 / histogram_bounds.size();
-
-  if (min_endp && !(field->null_ptr && min_endp->key[0]))
-  {
-    bool exclusive_endp= (min_endp->flag == HA_READ_AFTER_KEY)? true: false;
-    const uchar *min_key= min_endp->key;
-    if (field->real_maybe_null())
-      min_key++;
-
-    // Find the leftmost bucket that contains the lookup value.
-    // (If the lookup value is to the left of all buckets, find bucket #0)
-    int idx= find_bucket(field, min_key, exclusive_endp);
-    double min_sel= position_in_interval(field, (const uchar*)min_key,
-                                         histogram_bounds[idx],
-                                         histogram_bounds[idx+1]);
-    min= idx*width + min_sel*width;
-  }
-  else
-    min= 0.0;
-
-  if (max_endp)
-  {
-    // The right endpoint cannot be NULL
-    DBUG_ASSERT(!(field->null_ptr && max_endp->key[0]));
-    bool inclusive_endp= (max_endp->flag == HA_READ_AFTER_KEY)? true: false;
-    const uchar *max_key= max_endp->key;
-    if (field->real_maybe_null())
-      max_key++;
-
-    int idx= find_bucket(field, max_key, inclusive_endp);
-    double max_sel= position_in_interval(field, (const uchar*)max_key,
-                                         histogram_bounds[idx],
-                                         histogram_bounds[idx+1]);
-    max= idx*width + max_sel*width;
-  }
-  else
-    max= 1.0;
-
-  double sel = max - min;
-  return sel;
-}
-
-
-void Histogram_json_hb::serialize(Field *field)
-{
-  field->store(json_text.data(), json_text.size(), &my_charset_bin);
-}
-
-
-/*
-  Find the histogram bucket that contains the value.
-
-  @param equal_is_less Controls what to do if a histogram bound is equal to the
-                       lookup_val.
-*/
-
-int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
-                                   bool equal_is_less)
-{
-  int low= 0;
-  int high= histogram_bounds.size() - 1;
-  int middle;
-
-  while (low + 1 < high)
-  {
-    middle= (low + high) / 2;
-    int res= field->key_cmp((uchar*)histogram_bounds[middle].data(), lookup_val);
-    if (!res)
-      res= equal_is_less? -1: 1;
-    if (res < 0)
-      low= middle;
-    else //res > 0
-      high= middle;
-  }
-
-  return low;
-}
-
 /*
   An object of the class Index_stat is created to read statistical
   data on tables from the statistical table table_stat, to update
@@ -1853,73 +1586,6 @@ class Stat_table_write_iter
   }
 };
 
-
-/*
-  This is used to collect the the basic statistics from a Unique object:
-   - count of values
-   - count of distinct values
-   - count of distinct values that have occurred only once
-*/
-
-class Basic_stats_collector
-{
-  ulonglong count;         /* number of values retrieved                   */
-  ulonglong count_distinct;    /* number of distinct values retrieved      */
-  /* number of distinct values that occured only once  */
-  ulonglong count_distinct_single_occurence;
-
-public:
-  Basic_stats_collector()
-  {
-    count= 0;
-    count_distinct= 0;
-    count_distinct_single_occurence= 0;
-  }
-
-  ulonglong get_count_distinct() const { return count_distinct; }
-  ulonglong get_count_single_occurence() const
-  {
-    return count_distinct_single_occurence;
-  }
-  ulonglong get_count() const { return count; }
-
-  void next(void *elem, element_count elem_cnt)
-  {
-    count_distinct++;
-    if (elem_cnt == 1)
-      count_distinct_single_occurence++;
-    count+= elem_cnt;
-  }
-};
-
-/*
-  Histogram_builder is a helper class that is used to build histograms
-  for columns.
-
-  Do not create directly, call Histogram->get_builder(...);
-*/
-
-class Histogram_builder
-{
-protected:
-  Field *column;           /* table field for which the histogram is built */
-  uint col_length;         /* size of this field                           */
-  ha_rows records;         /* number of records the histogram is built for */
-
-  Histogram_builder(Field *col, uint col_len, ha_rows rows) :
-    column(col), col_length(col_len), records(rows)
-  {}
-
-public:
-  // A histogram builder will also collect the counters
-  Basic_stats_collector counters;
-
-  virtual int next(void *elem, element_count elem_cnt)=0;
-  virtual void finalize()=0;
-  virtual ~Histogram_builder(){}
-};
-
-
 class Histogram_binary_builder : public Histogram_builder
 {
   Field *min_value;        /* pointer to the minimal value for the field   */
@@ -1974,101 +1640,6 @@ Histogram_builder *Histogram_binary::create_builder(Field *col, uint col_len,
 }
 
 
-class Histogram_json_builder : public Histogram_builder
-{
-  Histogram_json_hb *histogram;
-  uint hist_width;         /* the number of points in the histogram        */
-  double bucket_capacity;  /* number of rows in a bucket of the histogram  */
-  uint curr_bucket;        /* number of the current bucket to be built     */
-
-  std::vector<std::string> bucket_bounds;
-  bool first_value= true;
-public:
-  Histogram_json_builder(Field *col, uint col_len, ha_rows rows)
-    : Histogram_builder(col, col_len, rows)
-  {
-    histogram= (Histogram_json_hb*)col->collected_stats->histogram;
-    bucket_capacity= (double)records / histogram->get_width();
-    hist_width= histogram->get_width();
-    curr_bucket= 0;
-  }
-
-  ~Histogram_json_builder() override = default;
-
-  /*
-    Add data to the histogram. Adding Element elem which encountered elem_cnt
-    times.
-  */
-  int next(void *elem, element_count elem_cnt) override
-  {
-    counters.next(elem, elem_cnt);
-    ulonglong count= counters.get_count();
-
-    if (curr_bucket == hist_width)
-      return 0;
-    if (first_value)
-    {
-      first_value= false;
-      column->store_field_value((uchar*) elem, col_length);
-      StringBuffer<MAX_FIELD_WIDTH> val;
-      column->val_str(&val);
-      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
-    }
-
-    if (count > bucket_capacity * (curr_bucket + 1))
-    {
-      column->store_field_value((uchar*) elem, col_length);
-      StringBuffer<MAX_FIELD_WIDTH> val;
-      column->val_str(&val);
-      bucket_bounds.emplace_back(val.ptr(), val.length());
-
-      curr_bucket++;
-      while (curr_bucket != hist_width &&
-             count > bucket_capacity * (curr_bucket + 1))
-      {
-        bucket_bounds.push_back(std::string(val.ptr(), val.length()));
-        curr_bucket++;
-      }
-    }
-
-    if (records == count && bucket_bounds.size() == hist_width)
-    {
-      column->store_field_value((uchar*) elem, col_length);
-      StringBuffer<MAX_FIELD_WIDTH> val;
-      column->val_str(&val);
-      bucket_bounds.push_back(std::string(val.ptr(), val.length()));
-    }
-    return 0;
-  }
-
-  /*
-    Finalize the creation of histogram
-  */
-  void finalize() override
-  {
-    Json_writer writer;
-    writer.start_object();
-    writer.add_member(Histogram_json_hb::JSON_NAME).start_array();
-
-    for(auto& value: bucket_bounds) {
-      writer.add_str(value.c_str());
-    }
-    writer.end_array();
-    writer.end_object();
-    Binary_string *json_string = (Binary_string *) writer.output.get_string();
-    histogram->set_json_text(bucket_bounds.size()-1,
-                             (uchar *) json_string->c_ptr());
-  }
-};
-
-
-Histogram_builder *Histogram_json_hb::create_builder(Field *col, uint col_len,
-                                                     ha_rows rows)
-{
-  return new Histogram_json_builder(col, col_len, rows);
-}
-
-
 Histogram_base *create_histogram(MEM_ROOT *mem_root, Histogram_type hist_type,
                                  THD *owner)
 {
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 0b1b310941f..7ac315f48e2 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -162,11 +162,18 @@ class Histogram_base : public Sql_alloc
 
   virtual uint get_width()=0;
 
-  virtual Histogram_builder *create_builder(Field *col, uint col_len,
-                                            ha_rows rows)=0;
-
+  /*
+    The creation-time workflow is:
+     * create a histogram
+     * init_for_collection()
+     * create_builder()
+     * feed the data to the builder
+     * serialize();
+  */
   virtual void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
                                    ulonglong size)=0;
+  virtual Histogram_builder *create_builder(Field *col, uint col_len,
+                                            ha_rows rows)=0;
 
   virtual bool is_available()=0;
 
@@ -177,19 +184,26 @@ class Histogram_base : public Sql_alloc
   virtual double range_selectivity(Field *field, key_range *min_endp,
                                    key_range *max_endp)=0;
 
-  // Legacy: return the size of the histogram on disk.
-  // This will be stored in mysql.column_stats.hist_size column.
-  // Newer, JSON-based histograms may return 0.
+  /*
+    Legacy: return the size of the histogram on disk.
+
+    This will be stored in mysql.column_stats.hist_size column.
+    The value is not really needed as one can look at
+    LENGTH(mysql.column_stats.histogram) directly.
+  */
   virtual uint get_size()=0;
   virtual ~Histogram_base()= default;
 
-
   Histogram_base() : owner(NULL) {}
+
+  /*
+    Memory management: a histogram may be (exclusively) "owned" by a particular
+    thread (done for histograms that are being collected).  By default, a
+    histogram has owner==NULL and is not owned by any particular thread.
+  */
   THD *get_owner() { return owner; }
   void set_owner(THD *thd) { owner=thd; }
 private:
-  // Owner is a thread that *exclusively* owns this histogram (and so can
-  // delete it at any time)
   THD *owner;
 };
 
@@ -353,75 +367,72 @@ class Histogram_binary : public Histogram_base
 
 
 /*
-  An equi-height histogram which stores real values for bucket bounds.
-
-  Handles @@histogram_type=JSON_HB
+  This is used to collect the basic statistics from a Unique object:
+   - count of values
+   - count of distinct values
+   - count of distinct values that have occurred only once
 */
 
-class Histogram_json_hb : public Histogram_base
+class Basic_stats_collector
 {
-private:
-  size_t size; /* Number of elements in the histogram */
-  
-  /* Collection-time only: collected histogram in the JSON form. */
-  std::string json_text;
-
-  // Array of histogram bucket endpoints in KeyTupleFormat.
-  std::vector<std::string> histogram_bounds;
+  ulonglong count;         /* number of values retrieved                   */
+  ulonglong count_distinct;    /* number of distinct values retrieved      */
+  /* number of distinct values that occurred only once */
+  ulonglong count_distinct_single_occurence;
 
 public:
-  static constexpr const char* JSON_NAME="histogram_hb_v1";
-
-  bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
-             const char *hist_data, size_t hist_data_len) override;
-
-  void serialize(Field *field) override;
-
-  Histogram_builder *create_builder(Field *col, uint col_len,
-                                    ha_rows rows) override;
-
-  // returns number of buckets in the histogram
-  uint get_width() override
+  Basic_stats_collector()
   {
-    return (uint)size;
+    count= 0;
+    count_distinct= 0;
+    count_distinct_single_occurence= 0;
   }
 
-  Histogram_type get_type() override
+  ulonglong get_count_distinct() const { return count_distinct; }
+  ulonglong get_count_single_occurence() const
   {
-    return JSON_HB;
+    return count_distinct_single_occurence;
   }
+  ulonglong get_count() const { return count; }
 
-  void set_json_text(ulonglong sz, uchar *json_text_arg)
+  void next(void *elem, element_count elem_cnt)
   {
-    size = (uint8) sz;
-    json_text.assign((const char*)json_text_arg, 
-                     strlen((const char*)json_text_arg));
+    count_distinct++;
+    if (elem_cnt == 1)
+      count_distinct_single_occurence++;
+    count+= elem_cnt;
   }
+};
 
-  uint get_size() override
-  {
-    return size;
-  }
 
-  void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg,
-                           ulonglong size) override;
+/*
+  Histogram_builder is a helper class that is used to build histograms
+  for columns.
 
-  bool is_available() override {return true; }
+  Do not create directly, call Histogram_base::create_builder(...);
+*/
 
-  bool is_usable(THD *thd) override
-  {
-    return thd->variables.optimizer_use_condition_selectivity > 3 &&
-           is_available();
-  }
+class Histogram_builder
+{
+protected:
+  Field *column;           /* table field for which the histogram is built */
+  uint col_length;         /* size of this field                           */
+  ha_rows records;         /* number of records the histogram is built for */
 
-  double point_selectivity(Field *field, key_range *endpoint,
-                           double avg_selection) override;
-  double range_selectivity(Field *field, key_range *min_endp,
-                           key_range *max_endp) override;
-private:
-  int find_bucket(Field *field, const uchar *lookup_val, bool equal_is_less);
+  Histogram_builder(Field *col, uint col_len, ha_rows rows) :
+    column(col), col_length(col_len), records(rows)
+  {}
+
+public:
+  // A histogram builder will also collect the counters
+  Basic_stats_collector counters;
+
+  virtual int next(void *elem, element_count elem_cnt)=0;
+  virtual void finalize()=0;
+  virtual ~Histogram_builder(){}
 };
 
+
 class Columns_statistics;
 class Index_statistics;

    

[Commits] 034f232e182: Move JSON histograms code into its own files

psergey