[Maria-developers] Review: MDEV-12985: Support percentile and media window functions
Hi Varun! Here is the in-detail review for the percentile functions. Please add a grammar definition for median too and test it. All in all, not many changes, but please address my comments. If you disagree with some of them, let's discuss them.
diff --git a/mysql-test/r/win_percentile_cont.result b/mysql-test/r/win_percentile_cont.result new file mode 100644 index 00000000000..61f70892887 --- /dev/null +++ b/mysql-test/r/win_percentile_cont.result diff --git a/mysql-test/t/win_percentile_cont.test b/mysql-test/t/win_percentile_cont.test new file mode 100644 index 00000000000..75fde963b2a --- /dev/null +++ b/mysql-test/t/win_percentile_cont.test @@ -0,0 +1,55 @@ Please put more tests here, as we discussed. Also, to make test cases easier to follow, please add cume_dist as a function in the select to all these use cases. +CREATE TABLE student (name CHAR(10), test double, score DECIMAL(19,4)); +INSERT INTO student VALUES +('Chun', 0, 3), ('Chun', 0, 7), +('Kaolin', 0.5, 3), ('Kaolin', 0.6, 7), +('Kaolin', 0.5, 4), +('Tatiana', 0.8, 4), ('Tata', 0.8, 4); + Make this --echo # No partition clause +#no partition clause +select name, percentile_disc(0.5) within group(order by score) over () from student; +select name, percentile_cont(0.5) within group(order by score) over () from student; + Same as before, use --echo # so we can see it in the result set too. +# argument set to null +select name, percentile_cont(null) within group(order by score) over (partition by name) from student; +select name, percentile_disc(null) within group(order by score) over (partition by name) from student; + Use --echo # +# complete query with partition column +select name, percentile_cont(0.5) within group(order by score) over (partition by name) as c from student; +select name, percentile_disc(0.5) within group(order by score) over (partition by name) as c from student; + Use --echo # +#subqueries having percentile functions + +select * from ( select name , percentile_cont(0.5) within group ( order by score) over (partition by name ) from student ) as t; +select * from ( select name , percentile_disc(0.5) within group ( order by score) over (partition by name ) from student ) as t; +select name from student a where (select percentile_disc(0.5) within group (order by score) over (partition by name) from student b limit 1) >= 0.5; + USe --echo +# WITH STORED PROCEDURES + + USe --echo +#DISALLOWED FIELDS IN ORDER BY CLAUSE +--error ER_WRONG_TYPE_FOR_PERCENTILE_CONT +select score, percentile_cont(0.5) within group(order by name) over (partition by score) from student; +select score, percentile_disc(0.5) within group(order by name) over (partition by score) from student;
+ +# error with 2 order by elements + +--error ER_NOT_SINGLE_ELEMENT_ORDER_LIST +select percentile_disc(0.5) within group(order by score,test) over (partition by name) from student; +--error ER_NOT_SINGLE_ELEMENT_ORDER_LIST +select percentile_cont(0.5) within group(order by score,test) over (partition by name) from student; + Use --echo and fix that comment. We thought of a way for this right? +#parameter value should be in the range of 0 to 1( NEED TO THINK A WAY FOR THIS) +--error ER_ARGUMENT_OUT_OF_RANGE +select percentile_disc(1.5) within group(order by score) over (partition by name) from student; +--error ER_ARGUMENT_OUT_OF_RANGE +select percentile_cont(1.5) within group(order by score) over (partition by name) from student; + +--error ER_ARGUMENT_NOT_CONSTANT +select name,percentile_cont(test) within group(order by score) over (partition by name) from student; +--error ER_ARGUMENT_NOT_CONSTANT +select name, percentile_disc(test) within group(order by score) over (partition by name) from student; + This is commented out. Not good, fix it please. And use --echo # +#CHECK TYPE OF THE ARGUMENT, SHOULD BE ONLY NUMERICAL +#select name, percentile_cont(name) within group(order by score) over (partition by name) from student; +
+drop table student; diff --git a/sql/item.h b/sql/item.h index 76ce4aa935f..108b0838dfc 100644 --- a/sql/item.h +++ b/sql/item.h @@ -5245,6 +5246,7 @@ class Cached_item_int :public Cached_item_item Cached_item_int(Item *item_par) :Cached_item_item(item_par),value(0) {} bool cmp(void); int cmp_read_only(); Delete this, it's not used. + longlong get_value(){ return value;} };
@@ -5255,6 +5257,7 @@ class Cached_item_decimal :public Cached_item_item Cached_item_decimal(Item *item_par); bool cmp(void); int cmp_read_only(); Delete this, it's not used. + my_decimal *get_value(){ return &value;}; };
class Cached_item_field :public Cached_item diff --git a/sql/item_windowfunc.cc b/sql/item_windowfunc.cc index 6ab903a81bb..916f4113682 100644 --- a/sql/item_windowfunc.cc +++ b/sql/item_windowfunc.cc diff --git a/sql/item_windowfunc.h b/sql/item_windowfunc.h index 77cbd556e60..ca2ed7136ec 100644 --- a/sql/item_windowfunc.h +++ b/sql/item_windowfunc.h @@ -678,6 +687,275 @@ class Item_sum_ntile : public Item_sum_window_with_row_count ulong current_row_count_; };
+class Item_sum_percentile_disc : public Item_sum_cume_dist, + public Type_handler_hybrid_field_type +{ +public: + Item_sum_percentile_disc(THD *thd, Item* arg) : Item_sum_cume_dist(thd, arg), + Type_handler_hybrid_field_type(&type_handler_longlong), + value(NULL), val_calculated(FALSE), first_call(TRUE), + prev_value(0), order_item(NULL){} + + double val_real() + { + if (get_row_count() == 0 || get_arg(0)->is_null()) + { + null_value= true; + return 0; + } + null_value= false; + return value->val_real(); + } + + longlong val_int() + { + if (get_row_count() == 0 || get_arg(0)->is_null()) + { + null_value= true; + return 0; + } + null_value= false; + return value->val_int(); + } + + my_decimal* val_decimal(my_decimal* dec) + { + if (get_row_count() == 0 || get_arg(0)->is_null()) + { + null_value= true; + return 0; + } + null_value= false; + return value->val_decimal(dec); + } + + String* val_str(String *str) + { + if (get_row_count() == 0 || get_arg(0)->is_null()) + { + null_value= true; + return 0; + } + null_value= false; + return value->val_str(str); + } + + bool add() + { An extra space here before = + Item *arg = get_arg(0); + if (arg->is_null()) + return false; + + if (first_call) + { + prev_value= arg->val_real(); We are missing a space after >. + if (prev_value >1 || prev_value < 0) + { + my_error(ER_ARGUMENT_OUT_OF_RANGE, MYF(0)); + return true; + } + first_call= false; + } + + double arg_val= arg->val_real(); + There is one extra space here. + if (prev_value != arg_val) + { + my_error(ER_ARGUMENT_NOT_CONSTANT, MYF(0)); + return true; + } + + if (val_calculated) + return false; + + value->store(order_item); + value->cache_value(); + if (value->null_value) + return false; + + Item_sum_cume_dist::add(); + double val= Item_sum_cume_dist::val_real(); + + if (val >= prev_value && !val_calculated) + val_calculated= true; + return false; + } + + enum Sumfunctype sum_func() const + { + return PERCENTILE_DISC_FUNC; + } + + void clear() + { + val_calculated= false; + first_call= true; + value->clear(); + Item_sum_cume_dist::clear(); + } + + const char*func_name() const + { + return "percentile_disc"; + } + + void update_field() {} + void set_type_handler(Window_spec *window_spec); + const Type_handler *type_handler() const + {return Type_handler_hybrid_field_type::type_handler();} + + void fix_length_and_dec() + { + decimals = 10; // TODO-cvicentiu find out how many decimals the standard + // requires. + } + + Item *get_copy(THD *thd, MEM_ROOT *mem_root) + { return get_item_copy<Item_sum_percentile_disc>(thd, mem_root, this); } + void setup_window_func(THD *thd, Window_spec *window_spec); + void setup_hybrid(THD *thd, Item *item); + +private: + Item_cache *value; + bool val_calculated; + bool first_call; + double prev_value; + Item *order_item; +}; + +class Item_sum_percentile_cont : public Item_sum_cume_dist, + public Type_handler_hybrid_field_type +{ +public: + Item_sum_percentile_cont(THD *thd, Item* arg) : Item_sum_cume_dist(thd, arg), + Type_handler_hybrid_field_type(&type_handler_double), + floor_value(NULL), ceil_value(NULL), first_call(TRUE),prev_value(0), + ceil_val_calculated(FALSE), floor_val_calculated(FALSE), order_item(NULL){} + + double val_real() + { + if (get_row_count() == 0 || get_arg(0)->is_null()) + { + null_value= true; + return 0; + } + null_value= false; + double val= 1 + prev_value * (get_row_count()-1); + + /* + Applying the formula to get the value + If (CRN = FRN = RN) then the result is (value of expression from row at RN) + Otherwise the result is + (CRN - RN) * (value of expression for row at FRN) + + (RN - FRN) * (value of expression for row at CRN) + */ + + if(ceil(val) == floor(val)) + return floor_value->val_real(); + + double ret_val= ((val - floor(val)) * ceil_value->val_real()) + + ((ceil(val) - val) * floor_value->val_real()); + + return ret_val; + } + + bool add() + { + Item *arg = get_arg(0); + if (arg->is_null()) + return false; + + if (first_call) + { + first_call= false; + prev_value= arg->val_real(); + if (prev_value >1 || prev_value < 0) + { + my_error(ER_ARGUMENT_OUT_OF_RANGE, MYF(0)); + return true; + } + } + + double arg_val= arg->val_real(); + if (prev_value != arg_val) + { + my_error(ER_ARGUMENT_NOT_CONSTANT, MYF(0)); + return true; + } + + if (!floor_val_calculated) + { + floor_value->store(order_item); + floor_value->cache_value(); + if (floor_value->null_value) + return false; + } + if (floor_val_calculated && !ceil_val_calculated) + { + ceil_value->store(order_item); + ceil_value->cache_value(); + if (ceil_value->null_value) + return false; + } + + Item_sum_cume_dist::add(); + double val= 1 + prev_value * (get_row_count()-1); + + if (!floor_val_calculated && get_row_number() == floor(val)) + floor_val_calculated= true; + + if (!ceil_val_calculated && get_row_number() == ceil(val)) + ceil_val_calculated= true; + return false; + } + + enum Sumfunctype sum_func() const + { + return PERCENTILE_CONT_FUNC; + } + + void clear() + { + first_call= true; + floor_value->clear(); + ceil_value->clear(); + floor_val_calculated= false; + ceil_val_calculated= false; + Item_sum_cume_dist::clear(); + } + + const char*func_name() const + { + return "percentile_cont"; + } + void update_field() {} + void set_type_handler(Window_spec *window_spec); + const Type_handler *type_handler() const + {return Type_handler_hybrid_field_type::type_handler();} + + void fix_length_and_dec() + { + decimals = 10; // TODO-cvicentiu find out how many decimals the standard + // requires. + } + + Item *get_copy(THD *thd, MEM_ROOT *mem_root) + { return get_item_copy<Item_sum_percentile_cont>(thd, mem_root, this); } + void setup_window_func(THD *thd, Window_spec *window_spec); + void setup_hybrid(THD *thd, Item *item); + +private: + Item_cache *floor_value; + Item_cache *ceil_value; + bool first_call; + double prev_value; + bool ceil_val_calculated; + bool floor_val_calculated; + Item *order_item; +}; + + +
class Item_window_func : public Item_func_or_sum { @@ -781,12 +1063,40 @@ class Item_window_func : public Item_func_or_sum case Item_sum::DENSE_RANK_FUNC: case Item_sum::PERCENT_RANK_FUNC: case Item_sum::CUME_DIST_FUNC: + case Item_sum::PERCENTILE_CONT_FUNC: + case Item_sum::PERCENTILE_DISC_FUNC: return true; default: return false; } } Let's make the grammar enforce this. This is silly. Is there a big
USe --echo limitation why we can't define a special rule for these functions? There already is order_by_single_element_list in sql_yacc.yy, it just isn't defined properly. We can get rid of these checks altogether then.
+ bool only_single_element_order_list() const + { + switch (window_func()->sum_func()){ + case Item_sum::PERCENTILE_CONT_FUNC: + case Item_sum::PERCENTILE_DISC_FUNC: + return true; + default: + return false; + } + } + This is not needed. Put this in Item_sum_percentile_disc's fix_fields. + void setting_handler_for_percentile_functions(Item_result rtype) const + { + switch (window_func()->sum_func()){ + case Item_sum::PERCENTILE_DISC_FUNC: + ((Item_sum_percentile_disc* ) window_func())->set_handler_by_cmp_type(rtype); + break; + default: + return; + } + } + Get rid of this function here. Put the logic of checking this result type in fix_fields specific to Item_sum_percentile_cont / disc. + bool check_result_type_of_order_item(); + + + /* Computation functions. TODO: consoder merging these with class Group_bound_tracker. diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt index 1f282e6aee5..d8d6975d92c 100644 --- a/sql/share/errmsg-utf8.txt +++ b/sql/share/errmsg-utf8.txt @@ -7490,3 +7490,11 @@ ER_WRONG_INSERT_INTO_SEQUENCE eng "Wrong INSERT into a SEQUENCE. One can only do single table INSERT into a squence object (like with mysqldump). If you want to change the SEQUENCE, use ALTER SEQUENCE instead." ER_SP_STACK_TRACE eng "At line %u in %s" This one will go away when we enforce the limitation in the grammar, make sure to remove it. +ER_NOT_SINGLE_ELEMENT_ORDER_LIST + eng "Incorrect number of elements in the order list for '%s'" +ER_WRONG_TYPE_FOR_PERCENTILE_CONT + eng "Numeric datatype is required for Percentile_CONT function" As discussed have these two errors print the function name to which the error is for, just like ER_NOT_SINGLE_ELEMENT_ORDER_LIST uses now. +ER_ARGUMENT_NOT_CONSTANT + eng "Argument to the percentile functions is not a constant" +ER_ARGUMENT_OUT_OF_RANGE + eng "Argument to the percentile functions does not belong to the range [0,1]" diff --git a/sql/sql_window.cc b/sql/sql_window.cc index 74e1ad25183..127d6d73ee6 100644 --- a/sql/sql_window.cc +++ b/sql/sql_window.cc @@ -304,6 +304,12 @@ setup_windows(THD *thd, Ref_ptr_array ref_pointer_array, TABLE_LIST *tables, win_func_item->update_used_tables(); }
2 problems with this code. 1. We can probably move this logic in Item_sum_percentile_[cont|disc]::fix_fields. See if that works. 2. There already is an identical while loop before this. USe that one, don't have an extra loop here for no reason.
+ li.rewind(); + while ((win_func_item= li++)) + { + if (win_func_item->check_result_type_of_order_item()) + DBUG_RETURN(1); + } DBUG_RETURN(0); }
@@ -1658,6 +1666,49 @@ class Frame_unbounded_following_set_count : public Frame_unbounded_following } };
This cursor counts all rows in the partition that don't have the order_item set to null. Let's rewrite this like so: Extract this logic: """ List_iterator_fast<Item_sum> it(sum_functions); Item_sum* item; while ((item= it++)) { Item_sum_window_with_row_count* item_with_row_count = static_cast<Item_sum_window_with_row_count *>(item); item_with_row_count->set_row_count(num_rows_in_partition); } """ 1. Nake it a protected method for Frame_unbounded_following_set_count. Lets call it set_win_funcs_row_count. 2. Rewrite Frame_unbounded_following_set_count to use that function. 3. Make a subclass Frame_unbounded_following_set_count_no_nulls 4. Make use of the protected method in the Frame_unbounded_following_set_count_no_nulls
+class Frame_unbounded_following_set_count_special: public
Frame_unbounded_following_set_count
+{ + +public: + Frame_unbounded_following_set_count_special(THD *thd, + SQL_I_List<ORDER> *partition_list, + SQL_I_List<ORDER> *order_list, Item* arg) : + Frame_unbounded_following_set_count(thd,partition_list, order_list) + { + order_item= order_list->first->item[0]; + } + void next_partition(ha_rows rownum) + { + ha_rows num_rows_in_partition= 0; + if (cursor.fetch()) + return; + + /* Walk to the end of the partition, find how many rows there are. */ + do + { + if (!order_item->is_null()) + num_rows_in_partition++; You are missing a space after } + }while (!cursor.next()); + + List_iterator_fast<Item_sum> it(sum_functions); + Item_sum* item; + while ((item= it++)) + { + Item_sum_window_with_row_count* item_with_row_count = + static_cast<Item_sum_window_with_row_count *>(item); + item_with_row_count->set_row_count(num_rows_in_partition); + } + } + + ha_rows get_curr_rownum() const + { + return cursor.get_rownum(); + } + +private: + Item* order_item; +}; +
/////////////////////////////////////////////////////////////////////////////
// ROWS-type frame bounds
/////////////////////////////////////////////////////////////////////////////
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index a0bbf39b138..b3b34fce3a4 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -10210,6 +10215,11 @@ geometry_function: Geometry::wkb_polygon, Geometry::wkb_linestring)); } I think this is something that should not be here right? What is this? + | WITHIN '(' expr ',' expr ')' + { + $$= GEOM_NEW(thd, Item_func_spatial_precise_rel(thd, $3, $5, + Item_func::SP_WITHIN_FUNC)); + } ;
/* @@ -10671,6 +10681,47 @@ simple_window_func: } ;
+inverse_distribution_function: + inverse_distribution_function_def WITHIN GROUP_SYM + '(' + { Select->prepare_add_window_spec(thd); } + order_by_single_element_list ')' OVER_SYM + '(' opt_window_ref opt_window_partition_clause ')' + { + LEX *lex= Lex; + if (Select->add_window_spec(thd, lex->win_ref, + Select->group_list, + Select->order_list, + NULL)) + MYSQL_YYABORT; + $$= new (thd->mem_root) Item_window_func(thd, (Item_sum *) $1, + thd->lex->win_spec); + if ($$ == NULL) + MYSQL_YYABORT; + if (Select->add_window_func((Item_window_func *) $$)) + MYSQL_YYABORT; + } + ; + +inverse_distribution_function_def: + PERCENTILE_CONT_SYM '(' expr ')' + { + $$= new (thd->mem_root) Item_sum_percentile_cont(thd, $3); + if ($$ == NULL) + MYSQL_YYABORT; + } + | PERCENTILE_DISC_SYM '(' expr ')' + { + $$= new (thd->mem_root) Item_sum_percentile_disc(thd, $3); + if ($$ == NULL) + MYSQL_YYABORT; + } + ; +
See if you can make this accept only one order element.
+order_by_single_element_list: + ORDER_SYM BY order_list + ; + window_name: ident {
participants (1)
-
Vicențiu Ciorbaru