#At file:///home/tsk/mprog/src/5.3-mwl89/ based on revid:sanja@askmonty.org-20100710103730-ayy6a61pdibspf4o 2801 timour@askmonty.org 2010-07-16 MWL#89: Cost-based choice between Materialization and IN->EXISTS transformation 1. Changed the lazy optimization for subqueries that can be materialized into bottom-up optimization during the optimization of the main query. The main change is implemented by the method Item_in_subselect::setup_engine. All other changes were required to correct problems resulting from changing the order of optimization. Most of these problems followed the same pattern - there are some shared structures between a subquery and its parent query. Depending on which one is optimized first (parent or child query), these shared structures may get different values, thus resulting in an inconsistent query plan. 2. Changed the code-generation for subquery materialization to be performed in runtime memory for each (re)execution, instead of in statement memory (once per prepared statement). - Item_in_subselect::setup_engine() no longer creates materialization related objects in statement memory. - Merged subselect_hash_sj_engine::init_permanent and subselect_hash_sj_engine::init_runtime into subselect_hash_sj_engine::init, which is called for each (re)execution. - Fixed deletion of the temp table accordingly. @ mysql-test/r/subselect_mat.result Adjusted changed EXPLAIN because of earlier optimization of subqueries.
modified: mysql-test/r/subselect_mat.result sql/item_subselect.cc sql/item_subselect.h sql/sql_class.cc sql/sql_class.h sql/sql_select.cc === modified file 'mysql-test/r/subselect_mat.result' --- a/mysql-test/r/subselect_mat.result 2010-06-26 10:05:41 +0000 +++ b/mysql-test/r/subselect_mat.result 2010-07-16 10:52:02 +0000 @@ -1139,7 +1139,7 @@ insert into t1 values (5); explain select min(a1) from t1 where 7 in (select b1 from t2 group by b1); id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY NULL NULL NULL NULL NULL NULL NULL Select tables optimized away -2 SUBQUERY t2 system NULL NULL NULL NULL 0 const row not found +2 SUBQUERY NULL NULL NULL NULL NULL NULL NULL no matching row in const table select min(a1) from t1 where 7 in (select b1 from t2 group by b1); min(a1) set @@optimizer_switch='default,materialization=off'; @@ -1153,7 +1153,7 @@ set @@optimizer_switch='default,semijoin explain select min(a1) from t1 where 7 in (select b1 from t2); id select_type table type possible_keys key key_len ref rows Extra 1 PRIMARY NULL NULL NULL NULL NULL NULL NULL Select tables optimized away -2 SUBQUERY t2 system NULL NULL NULL NULL 0 const row not found +2 SUBQUERY NULL NULL NULL NULL NULL NULL NULL no matching row in const table select min(a1) from t1 where 7 in (select b1 from t2); min(a1) set @@optimizer_switch='default,materialization=off'; === modified file 'sql/item_subselect.cc' --- a/sql/item_subselect.cc 2010-07-10 10:37:30 +0000 +++ b/sql/item_subselect.cc 2010-07-16 10:52:02 +0000 @@ -166,6 +166,7 @@ void Item_in_subselect::cleanup() Item_subselect::~Item_subselect() { delete engine; + engine= NULL; } Item_subselect::trans_res @@ -2220,73 +2221,73 @@ void Item_in_subselect::update_used_tabl bool Item_in_subselect::setup_engine() { - subselect_hash_sj_engine *new_engine= NULL; - bool res= FALSE; + subselect_hash_sj_engine *mat_engine= NULL; + subselect_single_select_engine *select_engine; DBUG_ENTER("Item_in_subselect::setup_engine"); - 
if (engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE) - { - /* Create/initialize objects in permanent memory. */ - subselect_single_select_engine *old_engine; - Query_arena *arena= thd->stmt_arena, backup; - old_engine= (subselect_single_select_engine*) engine; + SELECT_LEX *save_select= thd->lex->current_select; + thd->lex->current_select= get_select_lex(); + int res= thd->lex->current_select->join->optimize(); + thd->lex->current_select= save_select; + if (res) + DBUG_RETURN(TRUE); - if (arena->is_conventional()) - arena= 0; - else - thd->set_n_backup_active_arena(arena, &backup); + /* + The select_engine (that executes transformed IN=>EXISTS subselects) is + pre-created at parse time, and is stored in statment memory (preserved + across PS executions). + */ + DBUG_ASSERT(engine->engine_type() == subselect_engine::SINGLE_SELECT_ENGINE); + select_engine= (subselect_single_select_engine*) engine; - if (!(new_engine= new subselect_hash_sj_engine(thd, this, - old_engine)) || - new_engine->init_permanent(unit->get_unit_column_types())) - { - Item_subselect::trans_res trans_res; - /* - If for some reason we cannot use materialization for this IN predicate, - delete all materialization-related objects, and apply the IN=>EXISTS - transformation. - */ - delete new_engine; - new_engine= NULL; - exec_method= NOT_TRANSFORMED; - if (left_expr->cols() == 1) - trans_res= single_value_in_to_exists_transformer(old_engine->join, - &eq_creator); - else - trans_res= row_value_in_to_exists_transformer(old_engine->join); - res= (trans_res != Item_subselect::RES_OK); - } - if (new_engine) - engine= new_engine; + /* Create/initialize execution objects. 
*/ + if (!(mat_engine= new subselect_hash_sj_engine(thd, this, select_engine))) + DBUG_RETURN(TRUE); - if (arena) - thd->restore_active_arena(arena, &backup); - } - else + if (mat_engine->init(&select_engine->join->fields_list)) { - DBUG_ASSERT(engine->engine_type() == subselect_engine::HASH_SJ_ENGINE); - new_engine= (subselect_hash_sj_engine*) engine; - } + Item_subselect::trans_res trans_res; + /* + If for some reason we cannot use materialization for this IN predicate, + delete all materialization-related objects, and apply the IN=>EXISTS + transformation. + */ + delete mat_engine; + mat_engine= NULL; + exec_method= NOT_TRANSFORMED; + + if (left_expr->cols() == 1) + trans_res= single_value_in_to_exists_transformer(select_engine->join, + &eq_creator); + else + trans_res= row_value_in_to_exists_transformer(select_engine->join); - /* Initilizations done in runtime memory, repeated for each execution. */ - if (new_engine) - { /* - Reset the LIMIT 1 set in Item_exists_subselect::fix_length_and_dec. - TODO: - Currently we set the subquery LIMIT to infinity, and this is correct - because we forbid at parse time LIMIT inside IN subqueries (see - Item_in_subselect::test_limit). However, once we allow this, here - we should set the correct limit if given in the query. + The IN=>EXISTS transformation above injects new predicates into the + WHERE and HAVING clauses. Since the subquery was already optimized, + below we force its reoptimization with the new injected conditions + by the first call to subselect_single_select_engine::exec(). + This is the only case of lazy subquery optimization in the server. */ - unit->global_parameters->select_limit= NULL; - if ((res= new_engine->init_runtime())) - DBUG_RETURN(res); + DBUG_ASSERT(select_engine->join->optimized); + select_engine->join->optimized= false; + DBUG_RETURN(trans_res != Item_subselect::RES_OK); } - DBUG_RETURN(res); + /* + Reset the "LIMIT 1" set in Item_exists_subselect::fix_length_and_dec. 
+ TODO: + Currently we set the subquery LIMIT to infinity, and this is correct + because we forbid at parse time LIMIT inside IN subqueries (see + Item_in_subselect::test_limit). However, once we allow this, here + we should set the correct limit if given in the query. + */ + unit->global_parameters->select_limit= NULL; + + engine= mat_engine; + DBUG_RETURN(FALSE); } @@ -3787,13 +3788,14 @@ bitmap_init_memroot(MY_BITMAP *map, uint @retval FALSE otherwise */ -bool subselect_hash_sj_engine::init_permanent(List<Item> *tmp_columns) +bool subselect_hash_sj_engine::init(List<Item> *tmp_columns) { + select_union *result_sink; /* Options to create_tmp_table. */ ulonglong tmp_create_options= thd->options | TMP_TABLE_ALL_COLUMNS; /* | TMP_TABLE_FORCE_MYISAM; TIMOUR: force MYISAM */ - DBUG_ENTER("subselect_hash_sj_engine::init_permanent"); + DBUG_ENTER("subselect_hash_sj_engine::init"); if (bitmap_init_memroot(&non_null_key_parts, tmp_columns->elements, thd->mem_root) || @@ -3822,15 +3824,16 @@ bool subselect_hash_sj_engine::init_perm DBUG_RETURN(TRUE); } */ - if (!(result= new select_materialize_with_stats)) + if (!(result_sink= new select_materialize_with_stats)) DBUG_RETURN(TRUE); - - if (((select_union*) result)->create_result_table( - thd, tmp_columns, TRUE, tmp_create_options, - "materialized subselect", TRUE)) + result_sink->get_tmp_table_param()->materialized_subquery= true; + if (result_sink->create_result_table(thd, tmp_columns, TRUE, + tmp_create_options, + "materialized subselect", TRUE)) DBUG_RETURN(TRUE); - tmp_table= ((select_union*) result)->table; + tmp_table= result_sink->table; + result= result_sink; /* If the subquery has blobs, or the total key lenght is bigger than @@ -3867,6 +3870,17 @@ bool subselect_hash_sj_engine::init_perm !(lookup_engine= make_unique_engine())) DBUG_RETURN(TRUE); + /* + Repeat name resolution for 'cond' since cond is not part of any + clause of the query, and it is not 'fixed' during JOIN::prepare. 
+ */ + if (semi_join_conds && !semi_join_conds->fixed && + semi_join_conds->fix_fields(thd, (Item**)&semi_join_conds)) + DBUG_RETURN(TRUE); + /* Let our engine reuse this query plan for materialization. */ + materialize_join= materialize_engine->join; + materialize_join->change_result(result); + DBUG_RETURN(FALSE); } @@ -3957,8 +3971,6 @@ subselect_hash_sj_engine::make_unique_en Item_iterator_row it(item_in->left_expr); /* The only index on the temporary table. */ KEY *tmp_key= tmp_table->key_info; - /* Number of keyparts in tmp_key. */ - uint tmp_key_parts= tmp_key->key_parts; JOIN_TAB *tab; DBUG_ENTER("subselect_hash_sj_engine::make_unique_engine"); @@ -3981,41 +3993,22 @@ subselect_hash_sj_engine::make_unique_en } -/** - Initialize members of the engine that need to be re-initilized at each - execution. +subselect_hash_sj_engine::~subselect_hash_sj_engine() +{ + delete lookup_engine; + delete result; + if (tmp_table) + free_tmp_table(thd, tmp_table); +} - @retval TRUE if a memory allocation error occurred - @retval FALSE if success -*/ -bool subselect_hash_sj_engine::init_runtime() +int subselect_hash_sj_engine::prepare() { /* Create and optimize the JOIN that will be used to materialize the subquery if not yet created. */ - materialize_engine->prepare(); - /* - Repeat name resolution for 'cond' since cond is not part of any - clause of the query, and it is not 'fixed' during JOIN::prepare. - */ - if (semi_join_conds && !semi_join_conds->fixed && - semi_join_conds->fix_fields(thd, (Item**)&semi_join_conds)) - return TRUE; - /* Let our engine reuse this query plan for materialization. 
*/ - materialize_join= materialize_engine->join; - materialize_join->change_result(result); - return FALSE; -} - - -subselect_hash_sj_engine::~subselect_hash_sj_engine() -{ - delete lookup_engine; - delete result; - if (tmp_table) - free_tmp_table(thd, tmp_table); + return materialize_engine->prepare(); } @@ -4036,6 +4029,12 @@ void subselect_hash_sj_engine::cleanup() count_null_only_columns= 0; strategy= UNDEFINED; materialize_engine->cleanup(); + /* + Restore the original Item_in_subselect engine. This engine is created once + at parse time and stored across executions, while all other materialization + related engines are created and chosen for each execution. + */ + ((Item_in_subselect *) item)->engine= materialize_engine; if (lookup_engine_type == TABLE_SCAN_ENGINE || lookup_engine_type == ROWID_MERGE_ENGINE) { @@ -4052,6 +4051,9 @@ void subselect_hash_sj_engine::cleanup() DBUG_ASSERT(lookup_engine->engine_type() == UNIQUESUBQUERY_ENGINE); lookup_engine->cleanup(); result->cleanup(); /* Resets the temp table as well. */ + DBUG_ASSERT(tmp_table); + free_tmp_table(thd, tmp_table); + tmp_table= NULL; } @@ -4080,9 +4082,8 @@ int subselect_hash_sj_engine::exec() the subquery predicate. */ thd->lex->current_select= materialize_engine->select_lex; - if ((res= materialize_join->optimize())) - goto err; /* purecov: inspected */ - DBUG_ASSERT(!is_materialized); /* We should materialize only once. */ + /* The subquery should be optimized, and materialized only once. 
*/ + DBUG_ASSERT(materialize_join->optimized && !is_materialized); materialize_join->exec(); if ((res= test(materialize_join->error || thd->is_fatal_error))) goto err; === modified file 'sql/item_subselect.h' --- a/sql/item_subselect.h 2010-07-10 10:37:30 +0000 +++ b/sql/item_subselect.h 2010-07-16 10:52:02 +0000 @@ -817,10 +817,9 @@ public: } ~subselect_hash_sj_engine(); - bool init_permanent(List<Item> *tmp_columns); - bool init_runtime(); + bool init(List<Item> *tmp_columns); void cleanup(); - int prepare() { return 0; } /* Override virtual function in base class. */ + int prepare(); int exec(); virtual void print(String *str, enum_query_type query_type); uint cols() === modified file 'sql/sql_class.cc' --- a/sql/sql_class.cc 2010-07-10 10:37:30 +0000 +++ b/sql/sql_class.cc 2010-07-16 10:52:02 +0000 @@ -3052,6 +3052,7 @@ void TMP_TABLE_PARAM::init() table_charset= 0; precomputed_group_by= 0; bit_fields_as_long= 0; + materialized_subquery= 0; skip_create_table= 0; DBUG_VOID_RETURN; } === modified file 'sql/sql_class.h' --- a/sql/sql_class.h 2010-07-10 10:37:30 +0000 +++ b/sql/sql_class.h 2010-07-16 10:52:02 +0000 @@ -2852,6 +2852,8 @@ public: uint convert_blob_length; CHARSET_INFO *table_charset; bool schema_table; + /* TRUE if the temp table is created for subquery materialization. */ + bool materialized_subquery; /* True if GROUP BY and its aggregate functions are already computed by a table access method (e.g. by loose index scan). 
In this case @@ -2875,8 +2877,8 @@ public: TMP_TABLE_PARAM() :copy_field(0), group_parts(0), group_length(0), group_null_parts(0), convert_blob_length(0), - schema_table(0), precomputed_group_by(0), force_copy_fields(0), - bit_fields_as_long(0), skip_create_table(0) + schema_table(0), materialized_subquery(0), precomputed_group_by(0), + force_copy_fields(0), bit_fields_as_long(0), skip_create_table(0) {} ~TMP_TABLE_PARAM() { @@ -2905,6 +2907,7 @@ public: bool send_data(List<Item> &items); bool send_eof(); bool flush(); + TMP_TABLE_PARAM *get_tmp_table_param() { return &tmp_table_param; } virtual bool create_result_table(THD *thd, List<Item> *column_types, bool is_distinct, ulonglong options, @@ -2969,7 +2972,7 @@ protected: ha_rows count_rows; public: - select_materialize_with_stats() {} + select_materialize_with_stats() { tmp_table_param.init(); } virtual bool create_result_table(THD *thd, List<Item> *column_types, bool is_distinct, ulonglong options, const char *alias, bool bit_fields_as_long); === modified file 'sql/sql_select.cc' --- a/sql/sql_select.cc 2010-07-10 10:37:30 +0000 +++ b/sql/sql_select.cc 2010-07-16 10:52:02 +0000 @@ -2586,14 +2586,13 @@ err: Setup for execution all subqueries of a query, for which the optimizer chose hash semi-join. - @details Iterate over all subqueries of the query, and if they are under an - IN predicate, and the optimizer chose to compute it via hash semi-join: - - try to initialize all data structures needed for the materialized execution - of the IN predicate, - - if this fails, then perform the IN=>EXISTS transformation which was - previously blocked during JOIN::prepare. - - This method is part of the "code generation" query processing phase. 
+ @details Iterate over all immediate child subqueries of the query, and if + they are under an IN predicate, and the optimizer chose to compute it via + materialization: + - optimize each subquery, + - choose an optimial execution strategy for the IN predicate - either + materialization, or an IN=>EXISTS transformation with an approriate + engine. This phase must be called after substitute_for_best_equal_field() because that function may replace items with other items from a multiple equality, @@ -7925,7 +7924,7 @@ bool TABLE_REF::tmp_table_index_lookup_i use that information instead. */ cur_ref_buff + null_count, - null_count ? key_buff : 0, + null_count ? cur_ref_buff : 0, cur_key_part->length, items[i], value); cur_ref_buff+= cur_key_part->store_length; } @@ -11408,10 +11407,30 @@ create_tmp_table(THD *thd,TMP_TABLE_PARA { if (thd->is_fatal_error) goto err; // Got OOM - continue; // Some kindf of const item + continue; // Some kind of const item } if (type == Item::SUM_FUNC_ITEM) - ((Item_sum *) item)->result_field= new_field; + { + Item_sum *agg_item= (Item_sum *) item; + /* + Update the result field only if it has never been set, or if the + created temporary table is not to be used for subquery + materialization. + + The reason is that for subqueries that require materialization as part + of their plan, we create the 'external' temporary table needed for IN + execution, after the 'internal' temporary table needed for grouping. + Since both the external and the internal temporary tables are created + for the same list of SELECT fields of the subquery, setting + 'result_field' for each invocation of create_tmp_table overrides the + previous value of 'result_field'. + + The condition below prevents the creation of the external temp table + to override the 'result_field' that was set for the internal temp table. 
+ */ + if (!agg_item->result_field || !param->materialized_subquery) + agg_item->result_field= new_field; + } tmp_from_field++; reclength+=new_field->pack_length(); if (!(new_field->flags & NOT_NULL_FLAG)) @@ -19240,6 +19259,8 @@ bool JOIN::change_result(select_result * { DBUG_ENTER("JOIN::change_result"); result= res; + if (tmp_join) + tmp_join->result= res; if (!procedure && (result->prepare(fields_list, select_lex->master_unit()) || result->prepare2())) {