When sampling data through ANALYZE TABLE, use an estimator (the unsmoothed
first-order jackknife estimator) to get a better estimate of avg_frequency
instead of computing it directly from the raw sampled data.
---
 mysql-test/main/statistics.result |   6 +-
 sql/sql_statistics.cc             | 106 +++++++++++++++++++++++-------
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result
index 01176b8d6cf..ed31ce94936 100644
--- a/mysql-test/main/statistics.result
+++ b/mysql-test/main/statistics.result
@@ -1810,7 +1810,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	111	17026	0.0000	4.0000	1.0047	0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
+t1	id	111	17026	0.0000	4.0000	1004.7393	0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098
 #
 # This query will show a better avg_frequency value.
 #
@@ -1823,7 +1823,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	3.5736	0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
+t1	id	1	17384	0.0000	4.0000	14.2943	0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090
 set analyze_sample_percentage=0;
 #
 # Test self adjusting sampling level.
 #
@@ -1836,7 +1836,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a
 DECODE_HISTOGRAM(hist_type, histogram) from mysql.column_stats;
 table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram)
-t1	id	1	17384	0.0000	4.0000	7.4523	0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
+t1	id	1	17384	0.0000	4.0000	13.9816	0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086
 #
 # Test record estimation is working properly.
 #
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 27fab974441..8018c41a192 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -325,7 +325,7 @@ class Column_statistics_collected :public Column_statistics
 
   inline void init(THD *thd, Field * table_field);
   inline bool add();
-  inline void finish(ha_rows rows);
+  inline void finish(ha_rows rows, double sample_fraction);
   inline void cleanup();
 };
 
@@ -1540,6 +1540,8 @@ class Histogram_builder
   uint curr_bucket;        /* number of the current bucket to be built */
   ulonglong count;         /* number of values retrieved */
   ulonglong count_distinct;    /* number of distinct values retrieved */
+  /* number of distinct values that occurred only once */
+  ulonglong count_distinct_single_occurence;
 
 public:
   Histogram_builder(Field *col, uint col_len, ha_rows rows)
@@ -1553,14 +1555,21 @@ class Histogram_builder
     bucket_capacity= (double) records / (hist_width + 1);
     curr_bucket= 0;
     count= 0;
-    count_distinct= 0;
+    count_distinct= 0;
+    count_distinct_single_occurence= 0;
   }
 
-  ulonglong get_count_distinct() { return count_distinct; }
+  ulonglong get_count_distinct() const { return count_distinct; }
+  ulonglong get_count_single_occurence() const
+  {
+    return count_distinct_single_occurence;
+  }
 
   int next(void *elem, element_count elem_cnt)
   {
     count_distinct++;
+    if (elem_cnt == 1)
+      count_distinct_single_occurence++;
     count+= elem_cnt;
     if (curr_bucket == hist_width)
       return 0;
@@ -1574,7 +1583,7 @@ class Histogram_builder
            count > bucket_capacity * (curr_bucket + 1))
       {
         histogram->set_prev_value(curr_bucket);
-        curr_bucket++;
+        curr_bucket++;
       }
     }
     return 0;
@@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
   return hist_builder->next(elem, elem_cnt);
 }
 
-C_MODE_END
+static int count_distinct_single_occurence_walk(void *elem,
+                                                element_count count, void *arg)
+{
+  ((ulonglong*)arg)[0]+= 1;
+  if (count == 1)
+    ((ulonglong*)arg)[1]+= 1;
+  return 0;
+}
+
+C_MODE_END
 
 
 /*
   The class Count_distinct_field is a helper class used to calculate
   the number of distinct values for a column. The class employs the
@@ -1611,6 +1629,9 @@ class Count_distinct_field: public Sql_alloc
   Unique *tree;       /* The helper object to contain distinct values */
   uint tree_key_length; /* The length of the keys for the elements of 'tree */
 
+  ulonglong distincts;
+  ulonglong distincts_single_occurence;
+
 public:
 
   Count_distinct_field() {}
@@ -1662,30 +1683,40 @@ class Count_distinct_field: public Sql_alloc
   {
     return tree->unique_add(table_field->ptr);
   }
-
+
   /*
     @brief
     Calculate the number of elements accumulated in the container of 'tree'
   */
-  ulonglong get_value()
-  {
-    ulonglong count;
-    if (tree->elements == 0)
-      return (ulonglong) tree->elements_in_tree();
-    count= 0;
-    tree->walk(table_field->table, count_distinct_walk, (void*) &count);
-    return count;
+  void walk_tree()
+  {
+    ulonglong counts[2] = {0, 0};
+    tree->walk(table_field->table,
+               count_distinct_single_occurence_walk, counts);
+    distincts= counts[0];
+    distincts_single_occurence= counts[1];
   }
 
   /*
     @brief
-    Build the histogram for the elements accumulated in the container of 'tree'
+    Calculate a histogram of the tree
   */
-  ulonglong get_value_with_histogram(ha_rows rows)
+  void walk_tree_with_histogram(ha_rows rows)
   {
     Histogram_builder hist_builder(table_field, tree_key_length, rows);
     tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
-    return hist_builder.get_count_distinct();
+    distincts= hist_builder.get_count_distinct();
+    distincts_single_occurence= hist_builder.get_count_single_occurence();
+  }
+
+  ulonglong get_count_distinct()
+  {
+    return distincts;
+  }
+
+  ulonglong get_count_distinct_single_occurence()
+  {
+    return distincts_single_occurence;
   }
 
   /*
@@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()
 */
 
 inline
-void Column_statistics_collected::finish(ha_rows rows)
+void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
 {
   double val;
 
@@ -2532,16 +2563,43 @@ void Column_statistics_collected::finish(ha_rows rows)
   }
   if (count_distinct)
   {
-    ulonglong distincts;
     uint hist_size= count_distinct->get_hist_size();
+
+    /* Compute cardinality statistics and optionally histogram. */
     if (hist_size == 0)
-      distincts= count_distinct->get_value();
+      count_distinct->walk_tree();
     else
-      distincts= count_distinct->get_value_with_histogram(rows - nulls);
+      count_distinct->walk_tree_with_histogram(rows - nulls);
+
+    ulonglong distincts= count_distinct->get_count_distinct();
+    ulonglong distincts_single_occurence=
+      count_distinct->get_count_distinct_single_occurence();
+
     if (distincts)
     {
-      val= (double) (rows - nulls) / distincts;
-      set_avg_frequency(val);
+      /*
+        We use the unsmoothed first-order jackknife estimator to estimate
+        the number of distinct values.
+        With a sufficiently large percentage of rows sampled (80%), we revert
+        to computing the avg_frequency directly from the raw sampled data.
+      */
+      if (sample_fraction > 0.8)
+        val= (double) (rows - nulls) / distincts;
+      else
+      {
+        if (nulls == 1)
+          distincts_single_occurence+= 1;
+        if (nulls)
+          distincts+= 1;
+        double fraction_single_occurence= distincts_single_occurence / rows;
+        double total_number_of_rows= rows / sample_fraction;
+        double estimate_total_distincts= total_number_of_rows /
+          (distincts /
+           (1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
+        val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
+      }
+
+      set_avg_frequency(val);
       set_not_null(COLUMN_STAT_AVG_FREQUENCY);
     }
     else
@@ -2813,7 +2871,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
       continue;
     bitmap_set_bit(table->write_set, table_field->field_index);
     if (!rc)
-      table_field->collected_stats->finish(rows);
+      table_field->collected_stats->finish(rows, sample_fraction);
    else
      table_field->collected_stats->cleanup();
   }
-- 
2.20.1
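
Note (not part of the patch): the estimator branch added to
Column_statistics_collected::finish() can be exercised on its own. The
standalone C++ sketch below restates the unsmoothed first-order jackknife
estimator named in the new comment, using invented sample statistics and
floating-point division throughout; variable names loosely follow the patch,
but this is an illustration of the formula, not MariaDB code.

// Standalone sketch: the unsmoothed first-order jackknife estimator applied
// to hypothetical sample statistics. All numbers are invented for
// illustration purposes only.
#include <cmath>
#include <cstdio>

int main()
{
  double sample_fraction= 0.01; // fraction of the table that was sampled (q)
  double rows= 10000;           // rows in the sample (n)
  double nulls= 0;              // NULL values seen in the sample
  double distincts= 9000;       // distinct non-NULL values in the sample (d)
  double singles= 8200;         // distinct values seen exactly once (f1)

  double val;
  if (sample_fraction > 0.8)
  {
    // Large sample: keep the raw ratio, as before the patch.
    val= (rows - nulls) / distincts;
  }
  else
  {
    // Count NULL as one more distinct value (a singleton if seen once),
    // matching the adjustment made in the patch.
    if (nulls == 1)
      singles+= 1;
    if (nulls > 0)
      distincts+= 1;

    // Jackknife estimate of the number of distinct values in the full table:
    //   D_est = d / (1 - (1 - q) * f1 / n)
    double fraction_single_occurence= singles / rows;
    double estimated_distincts=
      distincts / (1.0 - (1.0 - sample_fraction) * fraction_single_occurence);

    // avg_frequency = estimated non-NULL rows in the full table / D_est,
    // clamped to at least 1, as in the patch.
    double total_number_of_rows= rows / sample_fraction;
    val= std::fmax(total_number_of_rows * ((rows - nulls) / rows) /
                   estimated_distincts, 1.0);
  }
  std::printf("estimated avg_frequency: %.4f\n", val);
  return 0;
}

With these made-up inputs (a 1% sample containing 9000 distinct values, 8200
of them seen only once), the estimator predicts roughly 21 rows per distinct
value in the full table, whereas the raw sample ratio alone would suggest
about 1.1.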