Hi Alexey,
commit caa416f8538055885eef510f8ef190e141e3f3c5 Author: Alexey Botchkov <holyfoot@askmonty.org> Date: Fri Apr 10 16:54:11 2020 +0400
MDEV-17399 Add support for JSON_TABLE.
Syntax for JSON_TABLE added. The ha_json_table handler added. Executing the JSON_TABLE we create the temporary table of the ha_json_table, add dependencies of other tables and sometimes conditions to WHERE.
I think the patch is moving in the right direction. The new way of interfacing with the optimizer is much nicer. There's plenty of input about the other parts of the patch, though. Please find it below. === New reserved keywords? == This patch makes PATH, NESTED, ORDINALITY, etc to be reserved keywords. Is this intentional? The effect of this is that using these words as indentifiers now causes parse errors: create table t10(path int); create table t10(nested int); create table t10(ordinality int); In MySQL 8, they are keywords but are not reserved: mysql> select * from INFORMATION_SCHEMA.KEYWORDS where word in ('path','nested','ordinality'); +------------+----------+ | WORD | RESERVED | +------------+----------+ | NESTED | 0 | | ORDINALITY | 0 | | PATH | 0 | +------------+----------+ which means one can still use them without quoting. == Invsible field for ref access is still there == Create_json_table::add_json_table_fields still creates an invisible json_string field. Is it necessary anymore? == An example that produces garbage data == select * from json_table( '[ {"color":"red", "price":"100"}, {"color":"green","price":"200"}, {"color":"bleu", "price":"300"} ]', '$[*]' columns (col1 varchar(200) PATH "$")) as TBL; +----------------------------------------------------+ | col1 | +----------------------------------------------------+ | {"color":"red", "price":"100"}, {"color":"green | | {"color":"green","price":"200"}, {"color":"bleu" | | {"color":"bleu", "price":"300"} ] ??`?XWUU G ' | +----------------------------------------------------+ Note the garbage on the last line. I am not sure if this a JSON_TABLE issue or an issue in JSON[path] engine. == Please create a BB tree == Please push the code into a bb-* tree so that the tests are ran. The rest of input is below.
diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 09a15aaf7df..a86b89efd52 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -12569,6 +12569,9 @@ uint check_join_cache_usage(JOIN_TAB *tab, !join->allowed_outer_join_with_cache) goto no_join_cache;
+ if (tab->table->pos_in_table_list->table_function && + !tab->table->pos_in_table_list->table_function->join_cache_allowed()) + goto no_join_cache; /* Non-linked join buffers can't guarantee one match */
Ok.
@@ -16312,6 +16315,10 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top, table->embedding->nested_join->not_null_tables|= not_null_tables; }
+ if (table->table_function && + table->table_function->setup(join->thd, table, &conds)) + DBUG_RETURN(0); +
I see that this function also calls fix_fields() for the first argument. This is wrong. All name resolution should be done in the "fix_fields phase" in JOIN::prepare. This is a prerequisite to correct computation of item/select attributes. Without this, we get issues like this: explain select item_name, (select color from json_table(t1.item_props,'$' columns( color varchar(100) path '$.color')) as T limit 1 ) from t1; +------+-------------+-------+------+---------------+------+---------+------+------+-------+ | id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra | +------+-------------+-------+------+---------------+------+---------+------+------+-------+ | 1 | PRIMARY | t1 | ALL | NULL | NULL | NULL | NULL | 2 | | | 2 | SUBQUERY | T | ALL | NULL | NULL | NULL | NULL | 40 | | +------+-------------+-------+------+---------------+------+---------+------+------+-------+ Note the 'SUBQUERY'. The subquery is assumed to be uncorrelated while it is not.
if (!(table->outer_join & (JOIN_TYPE_LEFT | JOIN_TYPE_RIGHT)) || (used_tables & not_null_tables)) { @@ -16324,7 +16331,6 @@ simplify_joins(JOIN *join, List<TABLE_LIST> *join_list, COND *conds, bool top, table->outer_join= 0; if (!(straight_join || table->straight)) { - table->dep_tables= 0; TABLE_LIST *embedding= table->embedding; while (embedding) {
This doesn't seem to be correct, and it causes join_outer test to fail. What is the reason for this change?
diff --git a/sql/table_function.cc b/sql/table_function.cc new file mode 100644 index 00000000000..2786f894d80 --- /dev/null ... +class ha_json_table: public handler +{ +protected: + Table_function_json_table *m_jt; + String m_tmps; + String *m_js; + longlong m_order_counter;
* The name is counterintutive. What "order" does this count? * Please add a comment, something like "Count the rows produced. Needed for FOR ORDINALITY" columns.
+ +public: + ha_json_table(TABLE_SHARE *share_arg, Table_function_json_table *jt): + handler(&table_function_hton.m_hton, share_arg), m_jt(jt) + { + mark_trx_read_write_done= 1; + } + ~ha_json_table() {} + handler *clone(const char *name, MEM_ROOT *mem_root) { return NULL; } + const char *index_type(uint inx) { return "HASH"; } + /* Rows also use a fixed-size format */ + enum row_type get_row_type() const { return ROW_TYPE_FIXED; } + ulonglong table_flags() const + { + return (HA_FAST_KEY_READ | HA_NO_BLOBS | HA_NULL_IN_KEY | + HA_CAN_SQL_HANDLER | + HA_REC_NOT_IN_SEQ | HA_NO_TRANSACTIONS | + HA_HAS_RECORDS | HA_CAN_HASH_KEYS); + } + ulong index_flags(uint inx, uint part, bool all_parts) const + { + return HA_ONLY_WHOLE_INDEX | HA_KEY_SCAN_NOT_ROR; + } + uint max_supported_keys() const { return 1; } + uint max_supported_key_part_length() const { return MAX_KEY_LENGTH; } + double scan_time() { return 1000000.0; } + double read_time(uint index, uint ranges, ha_rows rows) { return 0.0; } + + int open(const char *name, int mode, uint test_if_locked); + int close(void) { return 0; } + int rnd_init(bool scan); + int rnd_next(uchar *buf); + int rnd_pos(uchar * buf, uchar *pos) { return 1; } + void position(const uchar *record) {} + int can_continue_handler_scan() { return 1; } + int info(uint); + int extra(enum ha_extra_function operation); + THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, + enum thr_lock_type lock_type) + { return NULL; } + int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info) + { return 1; } +private: + void update_key_stats(); +}; + + This needs a comment, something like
A helper class that collects the field properties and creates a temporary table that represents JSON_TABLE output. Modeled after sql_select.cc, class Create_tmp_table.
+class Create_json_table: public Data_type_statistics +{ + // The following members are initialized only in start() + Field **m_default_field; + uchar *m_bitmaps; + // The following members are initialized in ctor + uint m_temp_pool_slot; + uint m_null_count; +public: + Create_json_table(const TMP_TABLE_PARAM *param, + bool save_sum_fields) + :m_temp_pool_slot(MY_BIT_NONE), + m_null_count(0) + { } + + void add_field(TABLE *table, Field *field, uint fieldnr, bool force_not_null_cols); + + TABLE *start(THD *thd, + TMP_TABLE_PARAM *param, + Table_function_json_table *jt, + const LEX_CSTRING *table_alias); + + bool add_json_table_fields(THD *thd, TABLE *table, + Table_function_json_table *jt); + bool finalize(THD *thd, TABLE *table, TMP_TABLE_PARAM *param, + Table_function_json_table *jt); +}; + + ... + +int ha_json_table::rnd_init(bool scan) +{ + Json_table_nested_path &p= m_jt->m_nested_path; + DBUG_ENTER("ha_json_table::index_read_map");
Please fix the name to match the function name.
+ + if ((m_js= m_jt->m_json->val_str(&m_tmps))) + { + p.scan_start(m_js->charset(), + (const uchar *) m_js->ptr(), (const uchar *) m_js->end()); + m_order_counter= 0; + } + + DBUG_RETURN(0); +} + +int ha_json_table::rnd_next(uchar *buf) +{ + Field **f= table->field; + Json_table_column *jc; + + if (!m_js) + return HA_ERR_END_OF_FILE; + + if (m_jt->m_nested_path.scan_next()) + return HA_ERR_END_OF_FILE; + + /// + (*f)->set_null(); + ++f; + /// + + List_iterator_fast<Json_table_column> jc_i(m_jt->m_columns); + my_ptrdiff_t ptrdiff= buf - table->record[0]; + while ((jc= jc_i++)) + { + if (ptrdiff) + (*f)->move_field_offset(ptrdiff); + switch (jc->m_column_type) + { + case Json_table_column::FOR_ORDINALITY: + (*f)->set_notnull(); + (*f)->store(m_order_counter++, TRUE); + break; This will not work if there are two FOR ORDINALITY columns:
select * from json_table( '[ {"color":"red", "price":"100"}, {"color":"green","price":"200"}, {"color":"bleu", "price":"300"} ]', '$[*]' columns ( id1 for ordinality, id2 for ordinality)) as TBL; produces: +------+------+ | id1 | id2 | +------+------+ | 0 | 1 | | 2 | 3 | | 4 | 5 | +------+------+ while both columns should have 1,2,3.
+ case Json_table_column::PATH: + case Json_table_column::EXISTS_PATH: + { + json_engine_t je; + json_engine_t &nest_je= jc->m_nest->m_engine; + json_path_step_t *cur_step; + uint array_counters[JSON_DEPTH_LIMIT]; + + if (jc->m_nest->m_null) + { + (*f)->set_null(); + break; + } + json_scan_start(&je, nest_je.s.cs, + nest_je.value_begin, nest_je.s.str_end); + + cur_step= jc->m_path.steps; + if (json_find_path(&je, &jc->m_path, &cur_step, array_counters) || + json_read_value(&je)) + { + if (jc->m_column_type == Json_table_column::PATH) + (*f)->set_null(); + else + { + /* EXISTS_PATH */ + (*f)->set_notnull(); + (*f)->store(0); + } + } + else + { + (*f)->set_notnull(); + if (jc->m_column_type == Json_table_column::PATH) + (*f)->store((const char *) je.value, (uint32) je.value_len,je.s.cs); + else + (*f)->store(1); /* EXISTS_PATH */ + } + break; + } + }; + if (ptrdiff) + (*f)->move_field_offset(-ptrdiff); + f++; + } + return 0; +} + + +int ha_json_table::info(uint) +{ + errkey= 1; + stats.records= 40; + stats.deleted= 0; + stats.mean_rec_length= 140; + stats.data_file_length= 127008; + stats.index_file_length= 126984; + stats.max_data_file_length= 14679980; + stats.delete_length= 0; + stats.create_time= 1584951202; + stats.auto_increment_value= 0; Are these just random constants or they have some meaning? Please add a comment.
+ return 0; +} + + + + +/** + Create a json table according to a field list. + + @param thd thread handle + @param param a description used as input to create the table + @param jt json_table specificaion + @param table_alias alias +*/ + +TABLE *Create_json_table::start(THD *thd, + TMP_TABLE_PARAM *param, + Table_function_json_table *jt, + const LEX_CSTRING *table_alias) +{ + MEM_ROOT *mem_root_save, own_root; + TABLE *table; + TABLE_SHARE *share; + uint copy_func_count= param->func_count; + char *tmpname,path[FN_REFLEN]; + Field **reg_field; + uint *blob_field; + DBUG_ENTER("Create_json_table::start"); + DBUG_PRINT("enter", + ("table_alias: '%s' ", table_alias->str)); + + if (use_temp_pool && !(test_flags & TEST_KEEP_TMP_TABLES)) + m_temp_pool_slot = bitmap_lock_set_next(&temp_pool); + + if (m_temp_pool_slot != MY_BIT_NONE) // we got a slot + sprintf(path, "%s-%lx-%i", tmp_file_prefix, + current_pid, m_temp_pool_slot); + else + { + /* if we run out of slots or we are not using tempool */ + sprintf(path, "%s-%lx-%lx-%x", tmp_file_prefix,current_pid, + (ulong) thd->thread_id, thd->tmp_table++); + }
I see that this this piece of code (thhe coice of temporary table) is now present in three copies. Before this patch, it was present in two, in sql_select.cc and in opt_subselect.cc. It's time to factor it out into a function.
+ + /* + No need to change table name to lower case as we are only creating + MyISAM, Aria or HEAP tables here + */
Are we? I assumed this would be ha_json_table table?
+ fn_format(path, path, mysql_tmpdir, "", + MY_REPLACE_EXT|MY_UNPACK_FILENAME); + + const uint field_count= param->field_count; + DBUG_ASSERT(field_count); + + init_sql_alloc(&own_root, "tmp_table", TABLE_ALLOC_BLOCK_SIZE, 0, + MYF(MY_THREAD_SPECIFIC)); + + if (!multi_alloc_root(&own_root, + &table, sizeof(*table), + &share, sizeof(*share), + ®_field, sizeof(Field*) * (field_count+1), + &m_default_field, sizeof(Field*) * (field_count), + &blob_field, sizeof(uint)*(field_count+1), + ¶m->items_to_copy, + sizeof(param->items_to_copy[0])*(copy_func_count+1), + ¶m->keyinfo, sizeof(*param->keyinfo), + ¶m->start_recinfo, + sizeof(*param->recinfo)*(field_count*2+4), + &tmpname, (uint) strlen(path)+1, + &m_bitmaps, bitmap_buffer_size(field_count)*6, + NullS)) + { + DBUG_RETURN(NULL); /* purecov: inspected */ + } + strmov(tmpname, path); + /* make table according to fields */ + + bzero((char*) table,sizeof(*table)); + bzero((char*) reg_field, sizeof(Field*) * (field_count+1)); + bzero((char*) m_default_field, sizeof(Field*) * (field_count)); + + table->mem_root= own_root; + mem_root_save= thd->mem_root; + thd->mem_root= &table->mem_root; + + table->field=reg_field; + table->alias.set(table_alias->str, table_alias->length, table_alias_charset); + + table->reginfo.lock_type=TL_WRITE; /* Will be updated */ + table->map=1; + table->temp_pool_slot= m_temp_pool_slot; + table->copy_blobs= 1; + table->in_use= thd; + table->no_rows_with_nulls= param->force_not_null_cols; + table->update_handler= NULL; + table->check_unique_buf= NULL; + + table->s= share; + init_tmp_table_share(thd, share, "", 0, "(temporary)", tmpname); + share->blob_field= blob_field; + share->table_charset= param->table_charset; + share->primary_key= MAX_KEY; // Indicate no primary key + if (param->schema_table) + share->db= INFORMATION_SCHEMA_NAME; + + param->using_outer_summary_function= 0; + thd->mem_root= mem_root_save; + DBUG_RETURN(table); +} + + +bool Create_json_table::finalize(THD *thd, TABLE *table, + TMP_TABLE_PARAM *param, + Table_function_json_table *jt) +{ + DBUG_ENTER("Create_json_table::finalize");
Nearly all the code in this function seems to be generic "setup the temporary table code". The only line that is specific to JSON table is this one: if (!(table->file= new (&table->mem_root) ha_json_table(share, jt))) goto err; Is it possible to somehow reuse the code from the generic temporary table creation code path?
+ DBUG_ASSERT(table); + ...
+ +bool Create_json_table::add_json_table_fields(THD *thd, TABLE *table, + Table_function_json_table *jt) +{ + TABLE_SHARE *share= table->s; + Json_table_column *jc; + uint fieldnr= 0; + MEM_ROOT *mem_root_save= thd->mem_root; + + DBUG_ENTER("add_json_table_fields"); + + { + /* Add the invisible json_string field. */ + Field *f= new (thd->mem_root) Field_string(0, + my_charset_utf8mb4_bin.mbmaxlen,(uchar *) "", 0, + Field::NONE, &jfield_name, &my_charset_utf8mb4_bin); + f->invisible= INVISIBLE_USER; + f->init(table); + add_field(table, f, fieldnr++, FALSE); + }
As noted earlier - is the above needed anymore?
+ + thd->mem_root= &table->mem_root; + List_iterator_fast<Json_table_column> jc_i(jt->m_columns); + while ((jc= jc_i++)) + { + Create_field *sql_f= jc->m_field; + Record_addr addr(!(sql_f->flags && NOT_NULL_FLAG)); + Bit_addr bit(addr.null()); + + if (!sql_f->charset) + sql_f->charset= &my_charset_utf8mb4_bin; + + Field *f= sql_f->type_handler()->make_table_field_from_def(share, + thd->mem_root, &sql_f->field_name, addr, bit, sql_f, sql_f->flags); + if (!f) + DBUG_RETURN(TRUE);
Please restore thd->mem_root before returning.
+ f->init(table); + add_field(table, f, fieldnr++, FALSE); + } + + share->fields= fieldnr; + share->blob_fields= m_blob_count; + table->field[fieldnr]= 0; // End marker + share->blob_field[m_blob_count]= 0; // End marker + share->column_bitmap_size= bitmap_buffer_size(share->fields); + + thd->mem_root= mem_root_save; + + DBUG_RETURN(0); +} + ... + +int Table_function_json_table::setup(THD *thd, TABLE_LIST *sql_table, + COND **cond) +{ + if (m_json->fix_fields(thd, &m_json)) + return TRUE; + + m_dep_tables= m_json->used_tables(); + + if (m_dep_tables) + { + /* + Since the json value depends on other tables, we add the + AND 'JSON_SOURCE=json_expression' condition to the WHERE + to make the optimizer using the JSON_SOURCE_index key + and call ::index_read / ::index_next methods. + */ + //Item_func_eq *eq; ^^ Please remove out-of-date comment: ^^
+ sql_table->dep_tables|= m_dep_tables; + sql_table->table->no_cache= TRUE; + if (unlikely(sql_table->dep_tables & sql_table->get_map())) + { + /* Table itself is used in the argument. */ + my_error(ER_WRONG_USAGE, MYF(0), "JSON_TABLE", "argument"); + return TRUE; + } + } + else + { + /* + Nothing to do. + Let the optimizer call ::rnd_init / ::rnd_next + */ + } + + return FALSE; +} + +void Table_function_json_table::get_estimates(ha_rows *out_rows, + double *scan_time, double *startup_cost) +{ + *out_rows= 40; Assumption that a JSON document will produce 40 rows looks reasonable.
+ *scan_time= 1000000.0;
The read cost of one million doesn't. Can we get a more realistic number? It should be much lower (can check by creating a table with 40 rows and noting the read cost).
+ *startup_cost= 0.0; +} + diff --git a/sql/table_function.h b/sql/table_function.h new file mode 100644 index 00000000000..4c11bb3ad94 --- /dev/null +++ b/sql/table_function.h @@ -0,0 +1,115 @@ +#ifndef TABLE_FUNCTION_INCLUDED +#define TABLE_FUNCTION_INCLUDED + +/* Copyright (c) 2020, MariaDB Corporation. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + + +#include <json_lib.h> + +class Json_table_nested_path : public Sql_alloc Please add comments about the class and its member variables.
+{ +public: + bool m_null; + json_path_t m_path; + json_engine_t m_engine; + json_path_t m_cur_path; + + Json_table_nested_path *m_parent; + Json_table_nested_path *m_nested, *m_next_nested; + Json_table_nested_path **m_nested_hook; + Json_table_nested_path *m_cur_nested; + Json_table_nested_path(Json_table_nested_path *parent_nest): + m_parent(parent_nest), m_nested(0), m_next_nested(0), + m_nested_hook(&m_nested) {} + int set_path(THD *thd, const LEX_CSTRING &path); + void scan_start(CHARSET_INFO *i_cs, const uchar *str, const uchar *end); + int scan_next(); +}; + + +class Json_table_column : public Sql_alloc +{ +public: + enum enum_type + { + FOR_ORDINALITY, + PATH, + EXISTS_PATH + }; + + enum enum_on_type + { + ON_EMPTY, + ON_ERROR + }; + + enum enum_on_response + { + RESPONSE_NOT_SPECIFIED, + RESPONSE_ERROR, + RESPONSE_NULL, + RESPONSE_DEFAULT + }; + + struct On_response + { + public: + Json_table_column::enum_on_response m_response; + LEX_CSTRING m_default; + }; + + enum_type m_column_type; + json_path_t m_path; + On_response m_on_error; + On_response m_on_empty; + Create_field *m_field; + Json_table_nested_path *m_nest; + + void set(enum_type ctype) + { + m_column_type= ctype; + } + int set(THD *thd, enum_type ctype, const LEX_CSTRING &path); + Json_table_column(Create_field *f, Json_table_nested_path *nest) : + m_field(f), m_nest(nest) + { + m_on_error.m_response= RESPONSE_NOT_SPECIFIED; + m_on_empty.m_response= RESPONSE_NOT_SPECIFIED; + } +}; + + +class Table_function_json_table : public Sql_alloc +{ +public: + Item *m_json; + Json_table_nested_path m_nested_path; + List<Json_table_column> m_columns; + table_map m_dep_tables; + + Table_function_json_table(Item *json): m_json(json), m_nested_path(0) {} + + int setup(THD *thd, TABLE_LIST *sql_table, COND **cond); + bool join_cache_allowed() const { return !m_dep_tables; }
I think we might want to allow dependencies on constant tables (and perhaps outside references). It's fine not to have this in the first version.
+ void get_estimates(ha_rows *out_rows, + double *scan_time, double *startup_cost); +}; + + +TABLE *create_table_for_function(THD *thd, TABLE_LIST *sql_table); + +#endif /* TABLE_FUNCTION_INCLUDED */ +
BR Sergei -- Sergei Petrunia, Software Developer MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog