Re: [Maria-developers] 60d3489: MDEV-9143 JSON_xxx functions.
Hi, Alexey! Summary: no big problems found, no conceptual changes needed :) Still - I feel there was too much code copied around in item_jsonfunc.cc, better to reduce the code duplication. And in sql_json.cc - lots of complains about naming and absense of comments. I was mostly able to figure out what it does regardless, but one should not need to spend a better half of the day just to understand one small function. Please add comments and rename functions/methods/variables, as I've noted in the review. On Jul 25, Alexey Botchkov wrote:
revision-id: 60d3489c96a4a95bd249b11972a132e321d15f29 (mariadb-10.2.1-4-g60d3489) parent(s): 848d211c5c4df00b819cd84d7530cf7d29bb0524 committer: Alexey Botchkov timestamp: 2016-07-25 13:37:01 +0400 message:
MDEV-9143 JSON_xxx functions.
an empty line after the commit subject, please
A number of JSON functions implemented.
be a bit more verbose here. list all functions here. specify which are SQL standard, which are compatible with MySQL, etc and please explain (not in the commit comment, here, in the email) why did you decide to implement exactly these functions. SQL Standard - that's clear, but MySQL has many more, why did you choose only these few?
--- mysql-test/r/func_json.result | 66 +++ mysql-test/t/func_json.test | 26 + sql/CMakeLists.txt | 4 +- sql/item.h | 15 + sql/item_create.cc | 163 +++++++ sql/item_jsonfunc.cc | 654 +++++++++++++++++++++++++ sql/item_jsonfunc.h | 132 ++++++ sql/item_xmlfunc.cc | 17 +- sql/sql_json.cc | 1055 +++++++++++++++++++++++++++++++++++++++++ sql/sql_json.h | 120 +++++ sql/sql_yacc.yy | 4 +- strings/ctype-ucs2.c | 12 +- 12 files changed, 2246 insertions(+), 22 deletions(-)
diff --git a/mysql-test/r/func_json.result b/mysql-test/r/func_json.result new file mode 100644 index 0000000..f22b935 --- /dev/null +++ b/mysql-test/r/func_json.result @@ -0,0 +1,66 @@ +select json_valid('[1, 2]'); +json_valid('[1, 2]') +1 +select json_valid('"mama"}'); +json_valid('"mama"}') +0 +select json_valid('{"mama":1, "myla":[2,3]}');
heh I'm surprised, you haven't added the third key :)
+json_valid('{"mama":1, "myla":[2,3]}') +1 +select json_valid('[false, true, null]'); +json_valid('[false, true, null]') +1 +select json_value('{"key1":123}', '$.key2'); +json_value('{"key1":123}', '$.key2') +NULL +select json_value('{"key1":123}', '$.key1'); +json_value('{"key1":123}', '$.key1') +123 +select json_value('{"key1":[1,2,3]}', '$.key1'); +json_value('{"key1":[1,2,3]}', '$.key1') +NULL +select json_value('{"key1": [1,2,3], "key1":123}', '$.key1'); +json_value('{"key1": [1,2,3], "key1":123}', '$.key1')
please, also implement json_query(). it's trivial to do and it's part of SQL standard. there's no reason not to support it :)
+123 +select json_array_append('["a", "b"]', '$', FALSE); +json_array_append('["a", "b"]', '$', FALSE) +["a", "b", false] +select json_array_append('{"k1":1, "k2":["a", "b"]}', '$.k2', 2); +json_array_append('{"k1":1, "k2":["a", "b"]}', '$.k2', 2)
test with multiple values to append?
+{"k1":1, "k2":["a", "b", 2]} +select json_contains_path('{"key1":1, "key2":[2,3]}', "oNE", "$.key2[1]");
what's sql standard way of doing it?
+json_contains_path('{"key1":1, "key2":[2,3]}', "oNE", "$.key2[1]") +1 +select json_contains_path('{"key1":1, "key2":[2,3]}', "oNE", "$.key2[10]"); +json_contains_path('{"key1":1, "key2":[2,3]}', "oNE", "$.key2[10]") +0 +select json_contains_path('{"key1":1, "key2":[2,3]}', "oNE", "$.ma"); +json_contains_path('{"key1":1, "key2":[2,3]}', "oNE", "$.ma") +0 +select json_contains_path('{"key1":1, "key2":[2,3]}', "one", "$.key1"); +json_contains_path('{"key1":1, "key2":[2,3]}', "one", "$.key1") +1 +select json_contains_path('{"key1":1, "key2":[2,3]}', "one", "$.key1", "$.ma"); +json_contains_path('{"key1":1, "key2":[2,3]}', "one", "$.key1", "$.ma") +1 +select json_contains_path('{"key1":1, "key2":[2,3]}', "aLl", "$.key1", "$.ma"); +json_contains_path('{"key1":1, "key2":[2,3]}', "aLl", "$.key1", "$.ma") +0 +select json_contains_path('{"key1":1, "key2":[2,3]}', "aLl", "$.key1", "$.key2"); +json_contains_path('{"key1":1, "key2":[2,3]}', "aLl", "$.key1", "$.key2") +1 +select json_extract('{"key1":"asd", "key2":[2,3]}', "$.key1"); +json_extract('{"key1":"asd", "key2":[2,3]}', "$.key1") +asd +select json_extract('{"key1":"asd", "key2":[2,3]}', "$.keyX", "$.keyY"); +json_extract('{"key1":"asd", "key2":[2,3]}', "$.keyX", "$.keyY") +NULL +select json_extract('{"key1":"asd", "key2":[2,3]}', "$.key1", "$.key2"); +json_extract('{"key1":"asd", "key2":[2,3]}', "$.key1", "$.key2") +["asd", [2,3]] +select json_extract('{"key1":5, "key2":[2,3]}', "$.key1", "$.key2"); +json_extract('{"key1":5, "key2":[2,3]}', "$.key1", "$.key2") +[5, [2,3]] +select json_extract('{"key0":true, "key1":"qwe"}', "$.key1"); +json_extract('{"key0":true, "key1":"qwe"}', "$.key1") +qwe diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 089d793..db11391 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -134,12 +134,12 @@ SET (SQL_SOURCE opt_table_elimination.cc sql_expression_cache.cc gcalc_slicescan.cc gcalc_tools.cc threadpool_common.cc ../sql-common/mysql_async.c - my_apc.cc my_apc.h mf_iocache_encr.cc + my_apc.cc my_apc.h mf_iocache_encr.cc item_jsonfunc.cc my_json_writer.cc my_json_writer.h rpl_gtid.cc rpl_parallel.cc sql_type.cc sql_type.h item_windowfunc.cc sql_window.cc - sql_cte.cc sql_cte.h + sql_cte.cc sql_cte.h sql_json.cc sql_json.h
do I get correctly that item_jsonfunc.cc is json functions and sql_json.cc is json parsing library? in that case, please, rename the file simply to json.cc, there's not much "sql" in it. And, btw, there's no need to add *.h files to the SQL_SOURCE list.
${WSREP_SOURCES} table_cache.cc encryption.cc temporary_tables.cc ${CMAKE_CURRENT_BINARY_DIR}/sql_builtin.cc diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index 9d7e735..26786ba 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -13767,13 +13767,13 @@ literal: } | FALSE_SYM { - $$= new (thd->mem_root) Item_int(thd, (char*) "FALSE",0,1); + $$= new (thd->mem_root) Item_bool(thd, (char*) "FALSE",0); if ($$ == NULL) MYSQL_YYABORT; } | TRUE_SYM { - $$= new (thd->mem_root) Item_int(thd, (char*) "TRUE",1,1); + $$= new (thd->mem_root) Item_bool(thd, (char*) "TRUE",1);
This commit of yours does not change any result files. Does it mean that you have run the complete test suite and did not find any differences in result files caused by this change?
if ($$ == NULL) MYSQL_YYABORT; } diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 06dab08..c52177a 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1190,10 +1190,13 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)), #endif /* HAVE_CHARSET_mb2*/
+/* + Next part is actually HAVE_CHARSET_utf16-specific, + but it the JSON functions needed my_utf16_uni()
typo: "it the"
+ so the #ifdef was moved lower. +*/
-#ifdef HAVE_CHARSET_utf16 - /* D800..DB7F - Non-provate surrogate high (896 pages) DB80..DBFF - Private surrogate high (128 pages) diff --git a/sql/item_create.cc b/sql/item_create.cc index 82f6bbd..585b706 100644 --- a/sql/item_create.cc +++ b/sql/item_create.cc @@ -1708,6 +1708,71 @@ class Create_func_issimple : public Create_func_arg1 #endif
+class Create_func_json_valid : public Create_func_arg1 +{ +public: + virtual Item *create_1_arg(THD *thd, Item *arg1); + + static Create_func_json_valid s_singleton; + +protected: + Create_func_json_valid() {} + virtual ~Create_func_json_valid() {} +}; + + +class Create_func_json_value : public Create_func_arg2 +{ +public: + virtual Item *create_2_arg(THD *thd, Item *arg1, Item *arg2); + + static Create_func_json_value s_singleton; + +protected: + Create_func_json_value() {} + virtual ~Create_func_json_value() {} +};
okay, although I suspect that we (you?) will need to remove that later and create Item_func_json_value directly in sql_yacc.yy - if we ever want to do the complete SQL standard syntax for it.
+ + +class Create_func_json_contains_path : public Create_native_func +{ +public: + virtual Item *create_native(THD *thd, LEX_STRING name, List<Item> *item_list); + + static Create_func_json_contains_path s_singleton; + +protected: + Create_func_json_contains_path() {} + virtual ~Create_func_json_contains_path() {} +}; + + +class Create_func_json_extract : public Create_native_func +{ +public: + virtual Item *create_native(THD *thd, LEX_STRING name, List<Item> *item_list); + + static Create_func_json_extract s_singleton; + +protected: + Create_func_json_extract() {} + virtual ~Create_func_json_extract() {} +}; + + +class Create_func_json_array_append : public Create_native_func +{ +public: + virtual Item *create_native(THD *thd, LEX_STRING name, List<Item> *item_list); + + static Create_func_json_array_append s_singleton; + +protected: + Create_func_json_array_append() {} + virtual ~Create_func_json_array_append() {} +}; + + class Create_func_last_day : public Create_func_arg1 { public: diff --git a/sql/sql_json.h b/sql/sql_json.h new file mode 100644 index 0000000..e937d3d --- /dev/null +++ b/sql/sql_json.h @@ -0,0 +1,120 @@ +#ifndef SQL_JSON_INCLUDED +#define SQL_JSON_INCLUDED + +#include <m_ctype.h> + +static const int json_depth_limit= 32;
uppercase, please
+ +enum json_errors { + JE_BAD= -1, /* Invalid character - cannot read the string. */ + JE_IMP= -2, /* This character disallowed in JSON. */
like what? and why "IMP"?
+ JE_EOS= -3, /* Unexpected end of string. */ + JE_UEX= -4, /* Syntax error - unexpected character. */
"UEX" isn't clear at all. JE_UNEXPECTED, please
+ JE_IMS= -5, /* This character disallowed in string constant. */
like what? and why "IMS"?
+ JE_ESCAPING= -6, /* Wrong escaping. */ + JE_TOODEEP= -7, /* The limit on the depth was overrun. */ +}; + +
This is a json library. it needs *lots* of comments. I'll mark below what's not obvious
+class json_string +{ +public: + const uchar *c_str; + const uchar *str_end; + my_wc_t c_next; + int error; + + CHARSET_INFO *cs; + my_charset_conv_mb_wc wc; + int get_next_char() + { + return wc(cs, &c_next, c_str, str_end); + } + my_wc_t next_chr() const { return c_next; }
comment, explaining the difference between get_next_char and next_chr
+ bool eos() const { return c_str >= str_end; } + int handle_esc();
comment
+ void skip_s_getchar(int &t_next, int &c_len);
comment
+ void set_cs(CHARSET_INFO *i_cs); + void set_str(const uchar *str, const uchar *end) + { + c_str= str; + str_end= end; + } + void setup(CHARSET_INFO *i_cs, const uchar *str, const uchar *end) + { + set_cs(i_cs); + set_str(str, end); + } + int read_str_chr();
comment
+}; + + +struct json_path_step
comment
+{ + enum types { KEY=0, ARRAY=1 }; + + types type; + int wild; + const uchar *key; + const uchar *key_end; + uint n_item;
comment
+}; + + +class json_path : public json_string +{ +public: + json_path_step steps[json_depth_limit]; + json_path_step *last_step; + + bool mode_strict; + int setup(CHARSET_INFO *i_cs, const uchar *str, const uchar *end);
Hm, so the actual search is not a method of json_path and not in sql_json.cc at all. Why is that?
+}; + + +class json_engine : public json_string +{ +public: + enum states { + VALUE, /* value met */ + DONE, /* ok to finish */ + KEY, /* key met */
s/met/found/
+ OB_B, /* object */
s/OB_B/OBJ_START/
+ OB_E, /* object ended */
OBJ_END
+ OB_C, /* object continues */
OBJ_CONT
+ AR_B, /* array */
ARRAY_START
+ AR_E, /* array ended */
ARRAY_END
+ AR_C, /* array continues */
you guessed it, ARRAY_CONT
+ READ_VALUE, /* value is beeing read */
s/beeing/being/
+ NR_STATES + }; + + enum value_types + { OBJECT=0, ARRAY=1, STRING, NUMBER, V_TRUE, V_FALSE, V_NULL }; + + int sav_c_len;
comment
+ value_types value_type; + const uchar *value; + const uchar *value_begin; + const uchar *value_end; + int value_len;
comment. what value? this is json_engine object, not json_value
+ + int stack[json_depth_limit]; + int *stack_p; + + int state;
comment. current state?
+ + int scan_start(CHARSET_INFO *i_cs, const uchar *str, const uchar *end); + int scan_next(); + + int read_keyname_chr();
comment
+ + int read_value();
comment
+ int skip_key();
comment
+ int skip_level();
comment
+ int skip_array_item() { return skip_key(); }
comment
+ bool value_scalar() const { return value_type > ARRAY; } +}; + +#endif /* SQL_JSON_INCLUDED */ + diff --git a/sql/sql_json.cc b/sql/sql_json.cc new file mode 100644 index 0000000..3880a04 --- /dev/null +++ b/sql/sql_json.cc @@ -0,0 +1,1055 @@ +#include <my_global.h> + +#include "sql_json.h" + + +/* Copied from ctype-ucs2.c. */
no, don't do that, please. move defines to some header file included into both. and you've change my_utf16_uni() to be no longer static, you don't seem to need to copy it here. Same should be done for arrays that you need. Another question. Why do you treat json as utf16, instead of being it just a regular string in whatever character set it might happen to be?
+/* + D800..DB7F - Non-provate surrogate high (896 pages) + DB80..DBFF - Private surrogate high (128 pages) + DC00..DFFF - Surrogate low (1024 codes in a page) +*/ +#define MY_UTF16_SURROGATE_HIGH_FIRST 0xD800 +#define MY_UTF16_SURROGATE_HIGH_LAST 0xDBFF +#define MY_UTF16_SURROGATE_LOW_FIRST 0xDC00 +#define MY_UTF16_SURROGATE_LOW_LAST 0xDFFF + +#define MY_UTF16_HIGH_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xD8) +#define MY_UTF16_LOW_HEAD(x) ((((uchar) (x)) & 0xFC) == 0xDC) +/* Test if a byte is a leading byte of a high or low surrogate head: */ +#define MY_UTF16_SURROGATE_HEAD(x) ((((uchar) (x)) & 0xF8) == 0xD8) +/* Test if a Unicode code point is a high or low surrogate head */ +#define MY_UTF16_SURROGATE(x) (((x) & 0xF800) == 0xD800) + +#define MY_UTF16_WC2(a, b) ((a << 8) + b) + +/* + a= 110110?? (<< 18) + b= ???????? (<< 10) + c= 110111?? (<< 8) + d= ???????? (<< 0) +*/ +#define MY_UTF16_WC4(a, b, c, d) (((a & 3) << 18) + (b << 10) + \ + ((c & 3) << 8) + d + 0x10000) + +int +my_utf16_uni(CHARSET_INFO *cs __attribute__((unused)), + my_wc_t *pwc, const uchar *s, const uchar *e) +{ + if (s + 2 > e) + return MY_CS_TOOSMALL2; + + /* + High bytes: 0xD[89AB] = B'110110??' + Low bytes: 0xD[CDEF] = B'110111??' + Surrogate mask: 0xFC = B'11111100' + */ + + if (MY_UTF16_HIGH_HEAD(*s)) /* Surrogate head */ + { + if (s + 4 > e) + return MY_CS_TOOSMALL4; + + if (!MY_UTF16_LOW_HEAD(s[2])) /* Broken surrigate pair */ + return MY_CS_ILSEQ; + + *pwc= MY_UTF16_WC4(s[0], s[1], s[2], s[3]); + return 4; + } + + if (MY_UTF16_LOW_HEAD(*s)) /* Low surrogate part without high part */ + return MY_CS_ILSEQ; + + *pwc= MY_UTF16_WC2(s[0], s[1]); + return 2; +} + +/* End of ctype-ucs2.c code. */ + +enum json_c_classes {
comment. or rename to json_char_classes
+ C_EOS, /* end of string */ + C_LCURB, /* { */ + C_RCURB, /* } */ + C_LSQRB, /* [ */ + C_RSQRB, /* ] */ + C_COLON, /* : */ + C_COMMA, /* , */ + C_QUOTE, /* " */ + C_DIGIT, /* -0123456789 */ + C_LOW_F, /* f */ + C_LOW_N, /* n */ + C_LOW_T, /* t */
clarify these comment, like: + C_LOW_T, /* t (for "true") */ otherwise it's not at all clear why f/n/t are special here.
+ C_ETC, /* everything else */ + C_ERR, /* character disallowed in JSON*/ + C_BAD, /* invalid character */ + NR_C_CLASSES, + C_SPACE /* space */
why space is after NR_C_CLASSES?
+}; + + +/* + This array maps first 128 Unicode Code Points into classes. + The remaining Unicode characters should be mapped to C_ETC. +*/ + +static int json_chr_map[128] = { + C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, + C_ERR, C_SPACE, C_SPACE, C_ERR, C_ERR, C_SPACE, C_ERR, C_ERR, + C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, + C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, C_ERR, + + C_SPACE, C_ETC, C_QUOTE, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_COMMA, C_DIGIT, C_ETC, C_ETC, + C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, C_DIGIT, + C_DIGIT, C_DIGIT, C_COLON, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LSQRB, C_ETC, C_RSQRB, C_ETC, C_ETC, + + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_F, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_N, C_ETC, + C_ETC, C_ETC, C_ETC, C_ETC, C_LOW_T, C_ETC, C_ETC, C_ETC, + C_ETC, C_ETC, C_ETC, C_LCURB, C_ETC, C_RCURB, C_ETC, C_ETC +}; + + +typedef int (*json_state_handler)(json_engine *); + + +/* The string is broken. */ +static int er_en(json_engine *j) +{ + j->error= JE_EOS; + return 1; +} + + +/* This symbol here breaks the JSON syntax. */ +static int __(json_engine *j) +{ + j->error= JE_UEX; + return 1; +} + + +/* Value of object. */ +static int ky_ob(json_engine *j)
dn' abbr lk tht use longer readable names, please
+{ + j->state= json_engine::OB_B; + *(++j->stack_p)= json_engine::OB_C; + return 0; +} + +/* Read value of object. */ +static int rd_ob(json_engine *j)
ok, in the context of the array below, rd_ob is kind of readable. but I have no idea what "ky" means. If it's "key" then I'd suggest to rename "ky" -> "key" and "rd" -> "val". and "obj", "arr" too.
+{ + j->state= json_engine::OB_B; + j->value_type= json_engine::OBJECT; + j->value= j->value_begin; + *(++j->stack_p)= json_engine::OB_C; + return 0; +} + + +/* Value of array. */ +static int ky_ar(json_engine *j) +{ + j->state= json_engine::AR_B; + *(++j->stack_p)= json_engine::AR_C; + j->value= j->value_begin; + return 0; +} + +/* Read value of object. */ +static int rd_ar(json_engine *j) +{ + j->state= json_engine::AR_B; + j->value_type= json_engine::ARRAY; + j->value= j->value_begin; + *(++j->stack_p)= json_engine::AR_C; + return 0; +} + + + +/* \b 8 \f 12 \n 10 \r 13 \t 9 */ +enum json_s_classes {
what does that mean? this needs a comment for the enum and even a comment for a comment :) a I don't see how this your comment applies to this your enum.
+ S_0= 0, + S_1= 1, + S_2= 2, + S_3= 3, + S_4= 4, + S_5= 5, + S_6= 6, + S_7= 7, + S_8= 8, + S_9= 9, + S_A= 10, + S_B= 11, + S_C= 12, + S_D= 13, + S_E= 14, + S_F= 15, + S_ETC= 36, /* rest of characters. */ + S_QUOTE= 37, + S_SOLID= 38, /* \ */
why "solid" ? because it's "reverse solidus"? better call it "backslash", that's far less confusing.
+ S_ERR= 100, /* disallowed */ +}; + + +/* This maps characters to their types inside a string constant. */ +static const int json_instr_chr_map[128] = { + S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, + S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, + S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, + S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, S_ERR, + + S_ETC, S_ETC, S_QUOTE, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, + S_8, S_9, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + + S_ETC, S_A, S_B, S_C, S_D, S_E, S_F, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_SOLID, S_ETC, S_ETC, S_ETC, + + S_ETC, S_A, S_B, S_C, S_D, S_E, S_F, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, + S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC, S_ETC +}; + + +static int read_4digit(json_string *j, uchar *s)
s/read_4digit/read_4_hexdigits/
+{ + int i, t, c_len; + for (i=0; i<4; i++) + { + if ((c_len= j->get_next_char()) <= 0) + return j->error= j->eos() ? JE_EOS : JE_BAD; + + if (j->c_next >= 128 || (t= json_instr_chr_map[j->c_next]) >= S_F) + return j->error= JE_UEX; + + j->c_str+= c_len; + s[i/2]+= (i % 2) ? t : t*16; + } + return 0; +} + + +/* \b 8 \f 12 \n 10 \r 13 \t 9 */ +int json_string::handle_esc() +{ + int t, c_len; + + if ((c_len= get_next_char()) <= 0) + return error= eos() ? JE_EOS : JE_BAD; + + c_str+= c_len; + switch (c_next) + { + case 'b': + c_next= 8; + return 0; + case 'f': + c_next= 12; + return 0; + case 'n': + c_next= 10; + return 0; + case 'r': + c_next= 13; + return 0; + case 't': + c_next= 9; + return 0; + } + + if (c_next < 128 && (t= json_instr_chr_map[c_next]) == S_ERR) + { + c_str-= c_len; + return error= JE_ESCAPING; + }
where did you read that the behavior should be like that?
+ + + if (c_next != 'u') + return 0;
where did you read that the behavior should be like that?
+ + /* + Read the four-hex-digits code. + If symbol is not in the Basic Multilingual Plane, we're reading + the string for the next four digits to compose the UTF-16 surrogate pair. + */ + uchar s[4]= {0,0,0,0}; + + if (read_4digit(this, s)) + return 1; + + if ((c_len= my_utf16_uni(0, &c_next, s, s+2)) == 2) + return 0; + + if (c_len != MY_CS_TOOSMALL4) + return error= JE_BAD; + + if ((c_len= get_next_char()) <= 0) + return error= eos() ? JE_EOS : JE_BAD; + if (c_next != '\\') + return error= JE_UEX; + + if ((c_len= get_next_char()) <= 0) + return error= eos() ? JE_EOS : JE_BAD; + if (c_next != 'u') + return error= JE_UEX; + + if (read_4digit(this, s+2)) + return 1; + + if ((c_len= my_utf16_uni(0, &c_next, s, s+4)) == 2) + return 0; + + return error= JE_BAD; +} + + +int json_string::read_str_chr()
at least, it's not "rd_str_chr", but I still don't understand what "read_str_chr" does. please, rename
+{ + int c_len; + + if ((c_len= get_next_char()) > 0) + { + c_str+= c_len; + return (c_next == '\\') ? handle_esc() : 0; + } + error= eos() ? JE_EOS : JE_BAD; + return 1; +} + + +inline int pass_str(json_engine *j)
what is "pass_str"? please, rename
+{ + int t, c_len; + for (;;) + { + if ((c_len= j->get_next_char()) > 0) + { + j->c_str+= c_len; + if (j->c_next >= 128 || ((t=json_instr_chr_map[j->c_next]) <= S_ETC)) + continue; + + if (j->c_next == '"') + break;
ok, so it apparently skips over a string, moving the cursor from the opening to the closing double quote
+ if (j->c_next == '\\') + { + if (j->handle_esc()) + return 1; + continue; + } + /* Symbol not allowed in JSON. */ + return j->error= JE_IMP; + } + else + return j->error= j->eos() ? JE_EOS : JE_BAD; + } + + j->state= *j->stack_p; + return 0; +} + + +/* Scalar string. */ +static int scr_s(json_engine *j)
five letters per name is difficult, beter make the array wider but more readable.
+{ + return pass_str(j) || j->scan_next(); +} + + +/* Read scalar string. */ +static int rd_s(json_engine *j) +{ + j->value= j->c_str; + + if (pass_str(j)) + return 1; + + j->state= *j->stack_p; + j->value_type= json_engine::STRING; + j->value_len= (j->c_str - j->value) - 1; + return 0; +} + + +enum json_num_classes { + N_MINUS, + N_PLUS, + N_ZERO, + N_DIGIT, + N_POINT, + N_E, + N_END, + N_EEND, + N_ERR, + N_NUM_CLASSES +};
comments!
+ + +static int json_num_chr_map[128] = { + N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, + N_ERR, N_END, N_END, N_ERR, N_ERR, N_END, N_ERR, N_ERR, + N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, + N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, N_ERR, + + N_END, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_PLUS, N_END, N_MINUS, N_POINT, N_EEND, + N_ZERO, N_DIGIT, N_DIGIT, N_DIGIT, N_DIGIT, N_DIGIT, N_DIGIT, N_DIGIT, + N_DIGIT, N_DIGIT, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, + + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_E, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_END, N_EEND, N_EEND, + + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_E, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, N_EEND, + N_EEND, N_EEND, N_EEND, N_EEND, C_ETC, N_END, N_EEND, N_EEND,
what is C_ETC doing here? if you'd declared your array of json_num_classes type, you wouldn't be able to put value from another enum there. this applies everywhere. many your arrays are int. the stack is int. they all need to be of their corresponding enum types.
+}; + +enum json_num_states { + NS_OK, + NS_GO, + NS_GO1, + NS_Z, + NS_Z1, + NS_INT, + NS_FRAC, + NS_EX, + NS_EX1, + NS_NUM_STATES +}; + + +static int json_num_states[NS_NUM_STATES][N_NUM_CLASSES]= +{ +/* - + 0 1..9 POINT E END_OK ERROR */ +/* OK */ { JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_BAD }, +/* GO */ { NS_GO1, JE_UEX, NS_Z, NS_INT, JE_UEX, JE_UEX, JE_UEX, JE_BAD }, +/* GO1 */ { JE_UEX, JE_UEX, NS_Z1, NS_INT, JE_UEX, JE_UEX, JE_UEX, JE_BAD }, +/* ZERO */ { JE_UEX, JE_UEX, JE_UEX, JE_UEX, NS_FRAC, JE_UEX, NS_OK, JE_BAD }, +/* ZE1 */ { JE_UEX, JE_UEX, JE_UEX, JE_UEX, NS_FRAC, JE_UEX, JE_UEX, JE_BAD }, +/* INT */ { JE_UEX, JE_UEX, NS_INT, NS_INT, NS_FRAC, NS_EX, NS_OK, JE_BAD }, +/* FRAC */ { JE_UEX, JE_UEX, NS_FRAC, NS_FRAC,JE_UEX, NS_EX, NS_OK, JE_BAD }, +/* EX */ { NS_EX1, NS_EX1, NS_EX1, NS_EX1, JE_UEX, JE_UEX, JE_UEX, JE_BAD }, +/* EX1 */ { JE_UEX, JE_UEX, NS_EX1, NS_EX1, JE_UEX, JE_UEX, JE_UEX, JE_BAD } +}; + + +inline int pass_n(json_engine *j) +{ + int state= json_num_states[NS_GO][json_num_chr_map[j->c_next]]; + int c_len; + + for (;;) + { + if ((c_len= j->get_next_char()) > 0) + { + if ((state= json_num_states[state][json_num_chr_map[j->c_next]]) > 0) + { + j->c_str+= c_len; + continue; + } + break; + } + + if ((j->error= j->eos() ? json_num_states[state][N_END] : JE_BAD) < 0) + return 1; + else + break; + } + + j->state= *j->stack_p; + return 0; +} + + +/* Scalar numeric. */ +static int scr_n(json_engine *j) +{ + return pass_n(j) || j->scan_next(); +} + + +/* Read number. */ +static int rd_n(json_engine *j) +{ + j->value= j->value_begin; + if (pass_n(j) == 0) + { + j->value_type= json_engine::NUMBER; + j->value_len= j->c_str - j->value_begin; + return 0; + } + return 1; +} + + +static int assure_string(json_string *j, const char *str)
better "match_string", or "match_verbatim" or "skip_verbatim", something like that
+{ + int c_len; + while (*str) + { + if ((c_len= j->get_next_char()) > 0) + { + if (j->c_next == (my_wc_t) *(str++)) + { + j->c_str+= c_len; + continue; + } + return JE_UEX; + } + return j->eos() ? JE_EOS : JE_BAD; + } + + return 0; +} + + +/* Scalar false. */ +static int scr_f(json_engine *j) +{ + if (assure_string(j, "alse")) + return 1; + j->state= *j->stack_p; + return j->scan_next(); +} + + +/* Scalar null. */ +static int scr_l(json_engine *j) +{ + if (assure_string(j, "ull")) + return 1; + j->state= *j->stack_p; + return j->scan_next(); +} + + +/* Scalar true. */ +static int scr_t(json_engine *j) +{ + if (assure_string(j, "rue")) + return 1; + j->state= *j->stack_p; + return j->scan_next(); +} + + +/* Read false. */ +static int rd_f(json_engine *j) +{ + j->value_type= json_engine::V_FALSE; + j->value= j->value_begin; + j->state= *j->stack_p; + j->value_len= 5; + return assure_string(j, "alse"); +} + + +/* Read null. */ +static int rd_l(json_engine *j) +{ + j->value_type= json_engine::V_NULL; + j->value= j->value_begin; + j->state= *j->stack_p; + j->value_len= 4; + return assure_string(j, "ull"); +} + + +/* Read true. */ +static int rd_t(json_engine *j) +{ + j->value_type= json_engine::V_TRUE; + j->value= j->value_begin; + j->state= *j->stack_p; + j->value_len= 4; + return assure_string(j, "rue"); +} + + +/* Disallowed character. */ +static int e_chr(json_engine *j) +{ + j->error= JE_IMP; + return 1; +} + + +/* Bad character. */ +static int b_chr(json_engine *j) +{ + j->error= JE_BAD; + return 1; +} + + +/* Correct finish. */ +static int done(json_engine *j) +{ + return 1; +} + + +/* End of the object. */ +static int en_ob(json_engine *j) +{ + j->stack_p--; + j->state= json_engine::OB_E; + return 0; +} + + +/* End of the array. */ +static int en_ar(json_engine *j) +{ + j->stack_p--; + j->state= json_engine::AR_E; + return 0; +} + + +/* Start reading key name. */ +static int get_k(json_engine *j) +{ + j->state= json_engine::KEY; + return 0; +} + + +inline void json_string::skip_s_getchar(int &t_next, int &c_len)
comment. the name is not clear at all, please rename
+{ + do + { + if ((c_len= get_next_char()) <= 0) + t_next= eos() ? C_EOS : C_BAD; + else + { + t_next= (c_next < 128) ? json_chr_map[c_next] : C_ETC; + c_str+= c_len; + } + } while (t_next == C_SPACE);
ok, so it's "get first non-space character"?
+} + + +/* Next key name. */ +static int nex_k(json_engine *j) +{ + int t_next, c_len; + j->skip_s_getchar(t_next, c_len); + + if (t_next == C_QUOTE) + { + j->state= json_engine::KEY; + return 0; + } + + j->error= (t_next == C_EOS) ? JE_EOS : + ((t_next == C_BAD) ? JE_BAD : + JE_UEX); + return 1; +} + + +/* Forward declarations. */ +static int skp_c(json_engine *j); +static int skp_k(json_engine *j); +static int ee_cb(json_engine *j); +static int ee_qb(json_engine *j); +static int ee_cm(json_engine *j); +static int ee_dn(json_engine *j); + + +static int ar_c(json_engine *j)
comment
+{ + j->state= json_engine::VALUE; + return 0; +}
comment
+ + +static int arb(json_engine *j)
comment
+{ + j->state= json_engine::VALUE; + j->c_str-= j->sav_c_len; + return 0; +} + + +static json_state_handler json_actions[json_engine::NR_STATES][NR_C_CLASSES]= +/* EOS { } [ ] : , " -0..9 f n t ETC ERR BAD*/ +{ +/*VALUE*/{er_en, ky_ob, __, ky_ar, __, __, __, scr_s, scr_n, scr_f, scr_l, scr_t, __, e_chr, b_chr}, +/*DONE*/ {done, __, __, __, __, __, __, __, __, __, __, __, __, e_chr, b_chr}, +/*KEY*/ {er_en, skp_k, skp_k, skp_k, skp_k, skp_k, skp_k, skp_c, skp_k, skp_k, skp_k, skp_k, skp_k, e_chr, b_chr},
} ] and some other characters - that's surely an error, why don't you use an error action for them? and why one skp_c ?
+/*OB_B*/ {er_en, __, en_ob, __, __, __, __, get_k, __, __, __, __, __, e_chr, b_chr}, +/*OB_E*/ {ee_dn, __, ee_cb, __, ee_qb, __, ee_cm, __, __, __, __, __, __, e_chr, b_chr}, +/*OB_C*/ {er_en, __, en_ob, __, en_ar, __, nex_k, __, __, __, __, __, __, e_chr, b_chr}, +/*AR_B*/ {er_en, arb, __, arb, en_ar, __, __, arb, arb, arb, arb, arb, __, e_chr, b_chr}, +/*AR_E*/ {ee_dn, __, ee_cb, __, ee_qb, __, ee_cm, __, __, __, __, __, __, e_chr, b_chr}, +/*AR_C*/ {er_en, __, __, __, en_ar, __, ar_c, __, __, __, __, __, __, e_chr, b_chr}, +/*RD_V*/ {er_en, rd_ob, __, rd_ar, __, __, __, rd_s, rd_n, rd_f, rd_l, rd_t, __, e_chr, b_chr}, +}; + + +void json_string::set_cs(CHARSET_INFO *i_cs) +{ + cs= i_cs; + error= 0; + wc= i_cs->cset->mb_wc; +} + + +int json_engine::scan_start(CHARSET_INFO *i_cs, const uchar *str, const uchar *end) +{ + json_string::setup(i_cs, str, end); + stack[0]= json_engine::DONE; + stack_p= stack; + state= json_engine::VALUE; + return 0; +} + + +/* Skip colon and the value. */ +static int skp_c(json_engine *j) +{ + int t_next, c_len; + + j->skip_s_getchar(t_next, c_len); + + if (t_next == C_COLON) + { + j->skip_s_getchar(t_next, c_len); + return json_actions[json_engine::VALUE][t_next](j);
please check how compiler optimizes that. it is a call/ret or a jump?
+ } + + j->error= (t_next == C_EOS) ? JE_EOS : + ((t_next == C_BAD) ? JE_BAD : + JE_UEX); + + return 1; +} + + +/* Skip colon and the value. */
same comment as for the previous function?
+static int skp_k(json_engine *j) +{ + int t_next, c_len; + while (j->read_keyname_chr() == 0) {} + + if (j->error) + return 1; + + j->skip_s_getchar(t_next, c_len); + return json_actions[json_engine::VALUE][t_next](j); +} + + +/* Continue after the end of the structure. */ +static int ee_dn(json_engine *j) +{ return json_actions[*j->stack_p][C_EOS](j); }
You really need a comment explaining how this state machine works.
+ + +/* Continue after the end of the structure. */ +static int ee_cb(json_engine *j) +{ return json_actions[*j->stack_p][C_RCURB](j); } + + +/* Continue after the end of the structure. */ +static int ee_qb(json_engine *j) +{ return json_actions[*j->stack_p][C_RSQRB](j); } + + +/* Continue after the end of the structure. */ +static int ee_cm(json_engine *j) +{ return json_actions[*j->stack_p][C_COMMA](j); } + + +int json_engine::read_keyname_chr()
comment. if the first quote already read by now?
+{ + int c_len, t; + + if ((c_len= get_next_char()) > 0) + { + c_str+= c_len; + if (c_next>= 128 || (t= json_instr_chr_map[c_next]) <= S_ETC) + return 0; + + switch (t) + { + case S_QUOTE: + for (;;) /* Skip spaces until ':'. */
why? you can use skip_s_getchar() to read whatever comes after the key.
+ { + if ((c_len= get_next_char() > 0)) + { + if (c_next == ':') + { + c_str+= c_len; + state= VALUE; + return 1; + } + + if (c_next < 128 && json_chr_map[c_next] == C_SPACE) + { + c_str+= c_len; + continue; + } + error= JE_UEX; + break; + } + error= eos() ? JE_EOS : JE_BAD; + break; + } + return 1; + case S_SOLID: + return handle_esc(); + case S_ERR: + c_str-= c_len; + error= JE_IMS; + return 1; + } + } + error= eos() ? JE_EOS : JE_BAD; + return 1; +} + + +int json_engine::read_value() +{ + int t_next, c_len, res; + + if (state == KEY) + { + while (read_keyname_chr() == 0) {} + + if (error) + return 1; + } + + skip_s_getchar(t_next, c_len);
here, in fact you *do* use skip_s_getchar() after a key.
+ + value_begin= c_str-c_len; + res= json_actions[READ_VALUE][t_next](this); + value_end= c_str; + return res; +} + + +int json_engine::scan_next() +{ + int t_next; + + skip_s_getchar(t_next, sav_c_len); + return json_actions[state][t_next](this); +} + + +enum json_path_c_classes { + P_EOS, /* end of string */ + P_USD, /* $ */ + P_ASTER, /* * */ + P_LSQRB, /* [ */ + P_RSQRB, /* ] */ + P_POINT, /* . */ + P_ZERO, /* 0 */ + P_DIGIT, /* 123456789 */ + P_L, /* l */ + P_S, /* s */
better comments for P_L and P_S (for "lax" / for "strict")
+ P_SPACE, /* space */ + P_SOLID, /* \ */ + P_ETC, /* everything else */ + P_ERR, /* character disallowed in JSON*/ + P_BAD, /* invalid character */ + N_PATH_CLASSES, +}; + + +static int json_path_chr_map[128] = { + P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, + P_ERR, P_SPACE, P_SPACE, P_ERR, P_ERR, P_SPACE, P_ERR, P_ERR, + P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, + P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, P_ERR, + + P_SPACE, P_ETC, P_ETC, P_ETC, P_USD, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_ASTER, P_ETC, P_ETC, P_ETC, P_POINT, P_ETC, + P_ZERO, P_DIGIT, P_DIGIT, P_DIGIT, P_DIGIT, P_DIGIT, P_DIGIT, P_DIGIT, + P_DIGIT, P_DIGIT, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, + + P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_ETC, P_ETC, P_L, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_S, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_ETC, P_LSQRB, P_SOLID, P_RSQRB, P_ETC, P_ETC, + + P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_ETC, P_ETC, P_L, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_S, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, + P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC, P_ETC +}; + + +enum json_path_states { + PS_GO, + PS_LAX, + PS_PT, + PS_AR, + PS_AWD, + PS_Z, + PS_INT, + PS_AS, + PS_KEY, + PS_KNM, + PS_KWD, + N_PATH_STATES, + PS_SCT, + PS_EKY, + PS_EAR, + PS_ESC, + PS_OK, + PS_KOK +};
comment! explain every state, please!
+ + +static int json_path_transitions[N_PATH_STATES][N_PATH_CLASSES]= +{ +/* EOS $, * [ ] . 0 1..9 L S SPACE SOLID ETC ERR BAD */ +/* GO */ { JE_EOS, PS_PT, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, PS_LAX, PS_SCT, PS_GO, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* LAX */ { JE_EOS, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, PS_LAX, JE_UEX, PS_GO, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* PT */ { PS_OK, JE_UEX, JE_UEX, PS_AR, JE_UEX, PS_KEY, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* AR */ { JE_EOS, JE_UEX, PS_AWD, JE_UEX, PS_PT, JE_UEX, PS_Z, PS_INT, JE_UEX, JE_UEX, PS_AR, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* AWD */ { JE_EOS, JE_UEX, JE_UEX, JE_UEX, PS_PT, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, PS_AS, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* Z */ { JE_EOS, JE_UEX, JE_UEX, JE_UEX, PS_PT, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, PS_AS, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* INT */ { JE_EOS, JE_UEX, JE_UEX, JE_UEX, PS_PT, JE_UEX, PS_INT, PS_INT, JE_UEX, JE_UEX, PS_AS, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* AS */ { JE_EOS, JE_UEX, JE_UEX, JE_UEX, PS_PT, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, PS_AS, JE_UEX, JE_UEX, JE_IMP, JE_BAD}, +/* KEY */ { JE_EOS, PS_KNM, PS_KWD, JE_UEX, PS_KNM, JE_UEX, PS_KNM, PS_KNM, PS_KNM, PS_KNM, PS_KNM, JE_UEX, PS_KNM, JE_IMP, JE_BAD}, +/* KNM */ { PS_KOK, PS_KNM, PS_KNM, PS_EAR, PS_KNM, PS_EKY, PS_KNM, PS_KNM, PS_KNM, PS_KNM, PS_KNM, PS_ESC, PS_KNM, JE_IMP, JE_BAD}, +/* KWD */ { PS_OK, JE_UEX, JE_UEX, PS_AR, JE_UEX, PS_EKY, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_UEX, JE_IMP, JE_BAD} +}; + + +int json_path::setup(CHARSET_INFO *i_cs, const uchar *str, const uchar *end) +{ + int c_len, t_next, state= PS_GO; + + json_string::setup(i_cs, str, end); + + steps[0].type= json_path_step::ARRAY; + steps[0].wild= 1; + last_step= steps; + mode_strict= false; + + do + { + if ((c_len= get_next_char()) <= 0) + t_next= eos() ? P_EOS : P_BAD; + else + t_next= (c_next >= 128) ? P_ETC : json_path_chr_map[c_next]; + + if ((state= json_path_transitions[state][t_next]) < 0) + return error= state; + + c_str+= c_len; + + switch (state) + { + case PS_LAX: + if ((error= assure_string(this, "ax"))) + return 1; + mode_strict= false; + continue; + case PS_SCT: + if ((error= assure_string(this, "rict"))) + return 1; + mode_strict= true; + state= PS_LAX; + continue;
I'd personally done lax/strict without a state machine, they can only be in the beginning of the string right? Two states and two path character classes less.
+ case PS_AWD: + last_step->wild= 1; + continue; + case PS_INT: + last_step->n_item*= 10; + last_step->n_item+= c_next - '0'; + continue; + case PS_EKY: + last_step->key_end= c_str - c_len; + state= PS_KEY; + /* Note no 'continue' here. */ + case PS_KEY: + last_step++; + last_step->type= json_path_step::KEY; + last_step->wild= 0; + last_step->key= c_str; + continue; + case PS_EAR: + last_step->key_end= c_str - c_len; + state= PS_AR; + /* Note no 'continue' here. */ + case PS_AR: + last_step++; + last_step->type= json_path_step::ARRAY; + last_step->wild= 0; + last_step->n_item= 0; + continue; + case PS_KWD: + last_step->wild= 1; + continue; + case PS_ESC: + if (handle_esc()) + return 1; + continue; + case PS_KOK: + last_step->key_end= c_str - c_len; + state= PS_OK; + break; + }; + } while (state != PS_OK); + + return 0; +} + + +int json_engine::skip_level() +{ + int ct= 0; + + while (scan_next() == 0) + { + switch (state) { + case OB_B: + case AR_B: + ct++; + break; + case OB_E: + case AR_E: + if (ct == 0) + return 0; + ct--; + break; + } + } + + return 1; +} + + +int json_engine::skip_key()
comment! is it what standard says? that the search should continue over non-scalars instead of aborting with an error?
+{ + if (read_value()) + return 1; + + if (value_scalar()) + return 0; + + return skip_level(); +} diff --git a/sql/item_jsonfunc.h b/sql/item_jsonfunc.h new file mode 100644 index 0000000..7aea3f5 --- /dev/null +++ b/sql/item_jsonfunc.h @@ -0,0 +1,132 @@ +#ifndef ITEM_JSONFUNC_INCLUDED +#define ITEM_JSONFUNC_INCLUDED + +/* Copyright (c) 2016, MariaDB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + + +/* This file defines all XML functions */
JSON
+ + +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif
don't bother (and feel free to remove it from item_xmlfunc.h too :)
+ +#include "item_cmpfunc.h" // Item_bool_func +#include "item_strfunc.h" // Item_str_func +#include "sql_json.h" + + +class json_path_with_flags : public json_path +{ +public: + bool constant; + bool parsed; + json_path_step *cur_step; + void set_constant_flag(bool s_constant) + { + constant= s_constant; + parsed= FALSE; + } +}; + + +class Item_func_json_valid: public Item_int_func +{ +protected: + String tmp_value; + +public: + Item_func_json_valid(THD *thd, Item *json) : Item_int_func(thd, json) {} + longlong val_int(); + const char *func_name() const { return "json_valid"; } + void fix_length_and_dec() + { + Item_int_func::fix_length_and_dec(); + maybe_null= 1; + } +}; + + +class Item_func_json_value: public Item_str_func +{ +protected: + json_path_with_flags path; + String tmp_js, tmp_path; + +public: + Item_func_json_value(THD *thd, Item *js, Item *path): + Item_str_func(thd, js, path) {} + const char *func_name() const { return "json_value"; } + void fix_length_and_dec(); + String *val_str(String *); +}; + + +class Item_func_json_extract: public Item_str_func +{ +protected: + String tmp_js; + json_path_with_flags *paths; + String **tmp_paths; + +public: + Item_func_json_extract(THD *thd, List<Item> &list): + Item_str_func(thd, list) {} + const char *func_name() const { return "json_extract"; } + bool fix_fields(THD *thd, Item **ref); + void fix_length_and_dec(); + String *val_str(String *); + longlong val_int(); +}; + + +class Item_func_json_contains_path: public Item_int_func +{ +protected: + String tmp_js; + json_path_with_flags *paths; + String **tmp_paths; + bool mode_one; + bool ooa_constant, ooa_parsed; + +public: + Item_func_json_contains_path(THD *thd, List<Item> &list): + Item_int_func(thd, list) {} + const char *func_name() const { return "json_contains_path"; } + bool fix_fields(THD *thd, Item **ref); + void fix_length_and_dec(); + longlong val_int(); +}; + + +class Item_func_json_array_append: public Item_str_func +{ +protected: + String tmp_js; + String tmp_val; + json_path_with_flags *paths; + String **tmp_paths; +public: + Item_func_json_array_append(THD *thd, List<Item> &list): + Item_str_func(thd, list) {} + bool fix_fields(THD *thd, Item **ref); + void fix_length_and_dec(); + String *val_str(String *); + const char *func_name() const { return "json_array_append"; } +}; + + +#endif /* ITEM_JSONFUNC_INCLUDED */ diff --git a/sql/item_jsonfunc.cc b/sql/item_jsonfunc.cc new file mode 100644 index 0000000..989fc6e --- /dev/null +++ b/sql/item_jsonfunc.cc @@ -0,0 +1,654 @@ +/* Copyright (c) 2016, Monty Program Ab. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#ifdef __GNUC__ +#pragma implementation +#endif
don't bother (and feel free to remove it from item_xmlfunc.cc too :)
+ +#include <my_global.h> +#include "sql_priv.h" +#include "sql_class.h" +#include "item.h" + +
function comment?
+static bool eq_ascii_string(const CHARSET_INFO *cs, + const char *ascii, + const char *s, uint32 s_len) +{ + const char *s_end= s + s_len; + + while (*ascii && s < s_end) + { + my_wc_t wc; + int wc_len; + + wc_len= cs->cset->mb_wc(cs, &wc, (uchar *) s, (uchar *) s_end); + if (wc_len <= 0 || (wc | 0x20) != (my_wc_t) *ascii) + return 0; + + ascii++; + s+= wc_len; + } + + return *ascii == 0 && s >= s_end; +}
Why did you need a special function for that? And you're only using it for "one" and "all". It's totally not worth optimizing for, just use strnncoll. You'll most likely be doing it only once anyway.
+ + +longlong Item_func_json_valid::val_int() +{ + String *js= args[0]->val_str(&tmp_value); + json_engine je; + + if ((null_value= args[0]->null_value) || js == NULL) + return 0; + + je.scan_start(js->charset(), (const uchar *) js->ptr(), + (const uchar *) js->ptr()+js->length()); + + while (je.scan_next() == 0) {} + + return je.error == 0; +} + + +void Item_func_json_value::fix_length_and_dec() +{ + collation.set(args[0]->collation); + max_length= args[0]->max_length; + path.constant= args[1]->const_item(); + path.parsed= FALSE;
path.set_constant_flag()
+} + + +static int handle_match(json_engine *je, json_path_with_flags *p, uint *n_arrays)
comment! I don't understand what this function does
+{ + DBUG_ASSERT(p->cur_step < p->last_step); + + if (je->read_value()) + return 1; + + if (je->value_scalar()) + return 0; + + p->cur_step++; + n_arrays[p->cur_step - p->steps]= 0; + + if ((int) je->value_type != (int) p->cur_step->type) + { + p->cur_step--; + return je->skip_level(); + } + + return 0; +} + + +static int json_key_matches(json_engine *je, json_string *k) +{ + while (je->read_keyname_chr() == 0) + { + if (k->read_str_chr() || + je->next_chr() != k->next_chr()) + return 0; + } + + if (k->read_str_chr()) + return 1; + + return 0; +} + + +static int find_value(json_engine *je, String *js, json_path_with_flags *p) +{ + json_string kn; + uint n_arrays[json_depth_limit]; + + kn.set_cs(p->cs);
what does 'kn' mean? key name? then call it 'key_name' or 'key' or at least add a comment
+ + do + { + json_path_step *cur_step= p->cur_step; + switch (je->state) + { + case json_engine::KEY: + DBUG_ASSERT(cur_step->type == json_path_step::KEY); + if (!cur_step->wild) + { + kn.set_str(cur_step->key, cur_step->key_end); + if (!json_key_matches(je, &kn)) + { + if (je->skip_key()) + goto exit; + continue; + } + } + if (cur_step == p->last_step || + handle_match(je, p, n_arrays)) + goto exit; + break; + case json_engine::VALUE: + DBUG_ASSERT(cur_step->type == json_path_step::ARRAY); + if (cur_step->wild || + cur_step->n_item == n_arrays[cur_step - p->steps]) + { + /* Array item matches. */ + if (cur_step == p->last_step || + handle_match(je, p, n_arrays)) + goto exit; + } + else + { + je->skip_array_item(); + n_arrays[cur_step - p->steps]++; + } + break; + case json_engine::OB_E: + case json_engine::AR_E: + p->cur_step--; + break; + default: + DBUG_ASSERT(0); + break; + }; + } while (je->scan_next() == 0); + + /* No luck. */ + return 1; + +exit: + return je->error; +} + + +String *Item_func_json_value::val_str(String *str) +{ + json_engine je; + String *js= args[0]->val_str(&tmp_js); + + if (!path.constant || !path.parsed) + { + String *s_p= args[1]->val_str(&tmp_path); + if (s_p && + path.setup(s_p->charset(), (const uchar *) s_p->ptr(), + (const uchar *) s_p->ptr() + s_p->length())) + goto error; + path.parsed= TRUE;
better: path.parsed= path.constant; and only if(path.parsed) in the if() above.
+ } + + if ((null_value= args[0]->null_value || args[1]->null_value)) + return NULL; + + je.scan_start(js->charset(),(const uchar *) js->ptr(), + (const uchar *) js->ptr() + js->length()); + + path.cur_step= path.steps; +continue_search: + if (find_value(&je, js, &path)) + { + if (je.error) + goto error; + + null_value= 1; + return 0; + }
^^^ this { .... } can be replaced with "goto error"
+ + if (je.read_value()) + goto error; + + if (!je.value_scalar()) + { + /* We only look for scalar values! */ + if (je.skip_level() || je.scan_next()) + goto error; + goto continue_search; + }
^^^ I don't understand that part. 1. what is je.skip_level() || je.scan_next() ? 2. is it what standard says? that the search should continue over non-scalars instead of aborting with an error?
+ + str->set((const char *) je.value, je.value_len, je.cs);
is that right? json_value is supposed to the return a value, not json. doesn't it mean that you should convert this json string to its actual string value? remove all escapes, that is?
+ return str; + +error: + null_value= 1; + return 0; +} + + +static int alloc_tmp_paths(THD *thd, uint n_paths, + json_path_with_flags **paths,String ***tmp_paths) +{ + uint n; + *paths= (json_path_with_flags *) alloc_root(thd->mem_root, + sizeof(json_path_with_flags) * n_paths); + *tmp_paths= (String **) alloc_root(thd->mem_root, sizeof(String *) * n_paths); + if (*paths == 0 || *tmp_paths == 0) + return 1; + + for (n=0; n<n_paths; n++) + { + if (((*tmp_paths)[n]= new(thd->mem_root) String) == 0) + return 1; + } + return 0; +}
Uh. String allocates the memory on the heap. Now, you allocate the String itself on a memroot. That's fine, but I don't see where you destroy these Strings, they must be explicitly destroyed, even if they're in a memroot.
+ + +static void mark_constant_paths(json_path_with_flags *p, + Item** args, uint n_args) +{ + uint n; + for (n= 0; n < n_args; n++) + p[n].set_constant_flag(args[n]->const_item()); +}
perhaps a cleaner option would be to create, say, class Item_func_json_many_paths : public Item_str_func { String tmp_js; json_path_with_flags *paths; String **tmp_paths; void mark_constant_paths(); int alloc_tmp_paths(); and inherit Item_func_json_extract and Item_func_json_contains_path from it.
+ + +bool Item_func_json_extract::fix_fields(THD *thd, Item **ref) +{ + return alloc_tmp_paths(thd, arg_count-1, &paths, &tmp_paths) || + Item_str_func::fix_fields(thd, ref); +} + + +void Item_func_json_extract::fix_length_and_dec() +{ + collation.set(args[0]->collation); + max_length= args[0]->max_length * (arg_count - 1); + + mark_constant_paths(paths, args+1, arg_count-1); +} + + +String *Item_func_json_extract::val_str(String *str) +{ + String *js= args[0]->val_str(&tmp_js); + json_engine je; + bool multiple_values_found= FALSE; + const uchar *value; + const char *first_value= NULL, *first_p_value; + uint n_arg, v_len, first_len, first_p_len; + + if ((null_value= args[0]->null_value)) + return 0; + + str->set_charset(js->charset()); + str->length(0); + + for (n_arg=1; n_arg < arg_count; n_arg++) + { + json_path_with_flags *c_path= paths + n_arg - 1; + if (!c_path->constant || !c_path->parsed) + { + String *s_p= args[n_arg]->val_str(tmp_paths[n_arg-1]); + if (s_p && + c_path->setup(s_p->charset(), (const uchar *) s_p->ptr(), + (const uchar *) s_p->ptr() + s_p->length())) + goto error; + c_path->parsed= TRUE; + }
same as in Item_func_json_value, could be moved to a common parent method, Item_func_json::setup_path(); it doesn't need to be virtual, so compiler will inline it and you'll have the same end result with less copy-pasted code
+ + je.scan_start(js->charset(),(const uchar *) js->ptr(), + (const uchar *) js->ptr() + js->length()); + + c_path->cur_step= c_path->steps; + + if (find_value(&je, js, c_path)) + { + /* Path wasn't found. */ + if (je.error) + goto error; + + continue; + } + + if (je.read_value()) + goto error; + + value= je.value_begin; + if (je.value_scalar()) + v_len= je.value_end - value; + else + { + if (je.skip_level()) + goto error; + v_len= je.c_str - value; + } + + if (!multiple_values_found) + { + if (first_value == NULL) + { + /* + Just remember the first value as we don't know yet + if we need to create an array out of it or not. + */ + first_value= (const char *) value; + first_len= v_len; + /* + We need this as we have to preserve quotes around string + constants if we use the value to create an array. Otherwise + we get the value without the quotes. + */ + first_p_value= (const char *) je.value; + first_p_len= je.value_len; + continue; + } + else + { + multiple_values_found= TRUE; /* We have to make an JSON array. */ + if ( str->append("[") || + str->append(first_value, first_len)) + goto error; /* Out of memory. */ + } + + } + if (str->append(", ", 2) || + str->append((const char *) value, v_len)) + goto error; /* Out of memory. */ + } + + if (first_value == NULL) + { + /* Nothing was found. */ + null_value= 1; + return 0; + } + + if (multiple_values_found ? + str->append("]") : + str->append(first_p_value, first_p_len)) + goto error; /* Out of memory. */ + + return str; + +error: + /* TODO: launch error messages. */ + null_value= 1; + return 0; +} + + +longlong Item_func_json_extract::val_int() +{ + String *js= args[0]->val_str(&tmp_js); + json_engine je; + uint n_arg; + + if ((null_value= args[0]->null_value)) + return 0; + + for (n_arg=1; n_arg < arg_count; n_arg++) + { + json_path_with_flags *c_path= paths + n_arg - 1; + if (!c_path->constant || !c_path->parsed) + { + String *s_p= args[n_arg]->val_str(tmp_paths[n_arg-1]); + if (s_p && + c_path->setup(s_p->charset(), (const uchar *) s_p->ptr(), + (const uchar *) s_p->ptr() + s_p->length())) + goto error; + c_path->parsed= TRUE; + } + + je.scan_start(js->charset(),(const uchar *) js->ptr(), + (const uchar *) js->ptr() + js->length()); + + c_path->cur_step= c_path->steps; + + if (find_value(&je, js, c_path)) + { + /* Path wasn't found. */ + if (je.error) + goto error; + + continue; + } + + if (je.read_value()) + goto error; + + if (je.value_scalar()) + { + int err; + char *v_end= (char *) je.value_end; + return (je.cs->cset->strtoll10)(je.cs, (const char *) je.value_begin, + &v_end, &err); + } + else + break; + } + + /* Nothing was found. */ + null_value= 1; + return 0; + +error: + /* TODO: launch error messages. */ + null_value= 1; + return 0; +} + + +bool Item_func_json_contains_path::fix_fields(THD *thd, Item **ref) +{ + return alloc_tmp_paths(thd, arg_count-2, &paths, &tmp_paths) || + Item_int_func::fix_fields(thd, ref); +} + + +void Item_func_json_contains_path::fix_length_and_dec() +{ + ooa_constant= args[1]->const_item(); + ooa_parsed= FALSE; + mark_constant_paths(paths, args+2, arg_count-2); + Item_int_func::fix_length_and_dec(); +} + + +longlong Item_func_json_contains_path::val_int() +{ + String *js= args[0]->val_str(&tmp_js); + json_engine je; + uint n_arg; + longlong result; + + if ((null_value= args[0]->null_value)) + return 0; + + if (!ooa_constant || !ooa_parsed) + { + char buff[20]; + String *res, tmp(buff, sizeof(buff), &my_charset_bin); + res= args[1]->val_str(&tmp); + mode_one=eq_ascii_string(res->charset(), "one", + res->ptr(), res->length()); + if (!mode_one) + { + if (!eq_ascii_string(res->charset(), "all", res->ptr(), res->length())) + goto error; + } + ooa_parsed= TRUE; + } + + result= !mode_one; + for (n_arg=2; n_arg < arg_count; n_arg++) + { + json_path_with_flags *c_path= paths + n_arg - 2; + if (!c_path->constant || !c_path->parsed) + { + String *s_p= args[n_arg]->val_str(tmp_paths[n_arg-2]); + if (s_p && + c_path->setup(s_p->charset(), (const uchar *) s_p->ptr(), + (const uchar *) s_p->ptr() + s_p->length())) + goto error; + c_path->parsed= TRUE; + } + + je.scan_start(js->charset(),(const uchar *) js->ptr(), + (const uchar *) js->ptr() + js->length()); + + c_path->cur_step= c_path->steps; + if (find_value(&je, js, c_path)) + { + /* Path wasn't found. */ + if (je.error) + goto error; + + if (!mode_one) + { + result= 0; + break; + } + } + else if (mode_one) + { + result= 1; + break; + } + } + + + return result; + +error: + null_value= 1; + return 0; +} + + +bool Item_func_json_array_append::fix_fields(THD *thd, Item **ref) +{ + return alloc_tmp_paths(thd, arg_count/2, &paths, &tmp_paths) || + Item_str_func::fix_fields(thd, ref); +} + + +void Item_func_json_array_append::fix_length_and_dec() +{ + uint n_arg; + ulonglong char_length; + + collation.set(args[0]->collation); + char_length= args[0]->max_char_length(); + + for (n_arg= 1; n_arg < arg_count; n_arg+= 2) + { + paths[n_arg-1].set_constant_flag(args[n_arg]->const_item()); + char_length+= args[n_arg+1]->max_char_length() + 4; + } + + fix_char_length_ulonglong(char_length); +} + + +static int append_json(String *str, Item *item, String *tmp_val) +{ + if (item->is_bool_type()) + { + longlong v_int= item->val_int(); + if (item->null_value) + goto append_null; + + const char *t_f; + t_f= v_int ? "true" : "false"; + return str->append(t_f, strlen(t_f)); + } + { + String *sv= item->val_str(tmp_val); + if (item->null_value) + goto append_null; + return str->append(*sv);
It seems that you believe the item to be a number. Because I'd expect you to quote and properly escape strings here.
+ } + +append_null: + return str->append("null", 4); +} + + +String *Item_func_json_array_append::val_str(String *str) +{ + json_engine je; + String *js= args[0]->val_str(&tmp_js); + uint n_arg, n_path, str_rest_len; + const uchar *ar_end; + + DBUG_ASSERT(fixed == 1); + + if ((null_value= args[0]->null_value)) + return 0; + + for (n_arg=1, n_path=0; n_arg < arg_count; n_arg+=2, n_path++) + { + json_path_with_flags *c_path= paths + n_path; + if (!c_path->constant || !c_path->parsed) + { + String *s_p= args[n_arg]->val_str(tmp_paths[n_path]); + if (s_p && + c_path->setup(s_p->charset(), (const uchar *) s_p->ptr(), + (const uchar *) s_p->ptr() + s_p->length())) + goto error; + c_path->parsed= TRUE; + } + if (args[n_arg]->null_value) + { + null_value= 1; + return 0; + } + + je.scan_start(js->charset(),(const uchar *) js->ptr(), + (const uchar *) js->ptr() + js->length()); + c_path->cur_step= c_path->steps; + + if (find_value(&je, js, c_path)) + { + if (je.error) + goto error; + null_value= 1; + return 0; + } + + if (je.read_value()) + goto error; + + if (je.value_type != json_engine::ARRAY) + { + /* Must be an array. */ + goto error; + } + + if (je.skip_level()) + goto error; + + str->length(0); + str->set_charset(js->charset()); + if (str->reserve(js->length() + 8, 512)) + goto error; /* Out of memory. */ + ar_end= je.c_str - je.sav_c_len; + str_rest_len= js->length() - (ar_end - (const uchar *) js->ptr()); + str->q_append(js->ptr(), ar_end-(const uchar *) js->ptr()); + str->append(", ", 2); + if (append_json(str, args[n_arg+1], &tmp_val)) + goto error; /* Out of memory. */ + + if (str->reserve(str->length() + str_rest_len, 512)) + goto error; /* Out of memory. */ + str->q_append((const char *) ar_end, str_rest_len); + } + + return str; + +error: + null_value= 1; + return 0; +} +
Regards, Sergei Chief Architect MariaDB and security@mariadb.org
participants (1)
-
Sergei Golubchik