[Commits] 103949d566d: MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters
revision-id: 103949d566d0652b781a17ce414ed07e7210ef07 (mariadb-10.6.1-326-g103949d566d) parent(s): 5fb922bea0c7f70b6c912946464c43b8c6a007d5 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2021-12-03 20:13:43 +0300 message: MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters Encode such characters in hex. --- mysql-test/main/statistics_json.result | 32 ++++++++++- mysql-test/main/statistics_json.test | 7 ++- sql/opt_histogram_json.cc | 100 +++++++++++++++++++++++++++------ sql/opt_histogram_json.h | 13 ++++- 4 files changed, 130 insertions(+), 22 deletions(-) diff --git a/mysql-test/main/statistics_json.result b/mysql-test/main/statistics_json.result index 1948d5acb4e..50a79d8f834 100644 --- a/mysql-test/main/statistics_json.result +++ b/mysql-test/main/statistics_json.result @@ -7896,16 +7896,41 @@ a drop table t1; # # Another testcase: use a character that cannot be represented in utf8: +# Also, now it's testcase for: +# MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters # create table t1 ( a varchar(100) character set cp1251); -insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98'); +insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88'); +insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98'); analyze table t1 persistent for all; Table Op Msg_type Msg_text -test.t1 analyze status Operation failed +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK select hist_type, histogram from mysql.column_stats where db_name=database() and table_name='t1'; hist_type histogram +JSON_HB { + "target_histogram_size": 10, + "collected_at": "REPLACED", + "collected_by": "REPLACED", + "histogram_hb": [ + { + "start": "€", + "size": 0.6, + "ndv": 1 + }, + { + "start_hex": "98", + "end_hex": "98", + "size": 0.4, + "ndv": 1 + } + ] +} +analyze select * from t1 where a=_cp1251 x'88'; +id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered 
Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 5 5.00 60.00 60.00 Using where drop table t1; # # ASAN use-after-poison my_strnxfrm_simple_internal / Histogram_json_hb::range_selectivity ... @@ -8102,7 +8127,8 @@ set histogram_type= JSON_HB, histogram_size= 1; insert into t1 values ('foo'),(unhex('9C')); analyze table t1 persistent for all; Table Op Msg_type Msg_text -test.t1 analyze status Operation failed +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze status OK select * from t1; a foo diff --git a/mysql-test/main/statistics_json.test b/mysql-test/main/statistics_json.test index bcc80093891..b67df41d9ba 100644 --- a/mysql-test/main/statistics_json.test +++ b/mysql-test/main/statistics_json.test @@ -227,9 +227,12 @@ drop table t1; --echo # --echo # Another testcase: use a character that cannot be represented in utf8: +--echo # Also, now it's testcase for: +--echo # MDEV-26764: JSON_HB Histograms: handle BINARY and unassigned characters --echo # create table t1 ( a varchar(100) character set cp1251); -insert into t1 values ( _cp1251 x'88'),( _cp1251 x'98'); +insert into t1 values ( _cp1251 x'88'),( _cp1251 x'88'), ( _cp1251 x'88'); +insert into t1 values ( _cp1251 x'98'),( _cp1251 x'98'); analyze table t1 persistent for all; --source include/histogram_replaces.inc @@ -237,6 +240,8 @@ select hist_type, histogram from mysql.column_stats where db_name=database() and table_name='t1'; +analyze select * from t1 where a=_cp1251 x'88'; + drop table t1; --echo # diff --git a/sql/opt_histogram_json.cc b/sql/opt_histogram_json.cc index 023e69b83aa..dfac0dd3f2f 100644 --- a/sql/opt_histogram_json.cc +++ b/sql/opt_histogram_json.cc @@ -70,11 +70,11 @@ static bool json_unescape_to_string(const char *val, int val_len, String* out) succeeds. */ -static bool json_escape_to_string(const String *str, String* out) +static int json_escape_to_string(const String *str, String* out) { // Make sure 'out' has some memory allocated. 
if (!out->alloced_length() && out->alloc(128)) - return true; + return JSON_ERROR_OUT_OF_SPACE; while (1) { @@ -90,15 +90,15 @@ static bool json_escape_to_string(const String *str, String* out) if (res >= 0) { out->length(res); - return false; // Ok + return 0; // Ok } if (res != JSON_ERROR_OUT_OF_SPACE) - return true; // Some conversion error + return res; // Some conversion error // Out of space error. Try with a bigger buffer if (out->alloc(out->alloced_length()*2)) - return true; + return JSON_ERROR_OUT_OF_SPACE; } } @@ -208,8 +208,7 @@ class Histogram_json_builder : public Histogram_builder */ bool finalize_bucket_with_end_value(void *elem) { - writer.add_member("end"); - if (append_column_value(elem)) + if (append_column_value(elem, false)) return true; finalize_bucket(); return false; @@ -224,19 +223,18 @@ class Histogram_json_builder : public Histogram_builder { DBUG_ASSERT(bucket.size == 0); writer.start_object(); - writer.add_member("start"); - if (append_column_value(elem)) + if (append_column_value(elem, true)) return true; bucket.ndv= 1; bucket.size= cnt; return false; } - + /* Append the passed value into the JSON writer as string value */ - bool append_column_value(void *elem) + bool append_column_value(void *elem, bool is_start) { StringBuffer<MAX_FIELD_WIDTH> val; @@ -246,12 +244,21 @@ class Histogram_json_builder : public Histogram_builder // Escape the value for JSON StringBuffer<MAX_FIELD_WIDTH> escaped_val; - if (json_escape_to_string(str, &escaped_val)) - return true; - - // Note: The Json_writer does NOT do escapes (perhaps this should change?) - writer.add_str(escaped_val.c_ptr_safe()); - return false; + int rc= json_escape_to_string(str, &escaped_val); + if (!rc) + { + writer.add_member(is_start? "start": "end"); + writer.add_str(escaped_val.c_ptr_safe()); + return false; + } + if (rc == JSON_ERROR_ILLEGAL_SYMBOL) + { + escaped_val.set_hex(val.ptr(), val.length()); + writer.add_member(is_start? 
"start_hex": "end_hex"); + writer.add_str(escaped_val.c_ptr_safe()); + return false; + } + return true; } /* @@ -496,6 +503,41 @@ bool read_bucket_endpoint(json_engine_t *je, Field *field, String *out, } +bool read_hex_bucket_endpoint(json_engine_t *je, Field *field, String *out, + const char **err) +{ + if (json_read_value(je)) + return true; + + if (je->value_type != JSON_VALUE_STRING || je->value_escaped || + (je->value_len & 1)) + { + *err= "Expected a hex string"; + return true; + } + StringBuffer<128> buf; + + for (auto pc= je->value; pc < je->value + je->value_len; pc+=2) + { + int hex_char1= hexchar_to_int(pc[0]); + int hex_char2= hexchar_to_int(pc[1]); + if (hex_char1 == -1 || hex_char2 == -1) + { + *err= "Expected a hex string"; + return true; + } + buf.append((hex_char1 << 4) | hex_char2); + } + + field->store_text(buf.ptr(), buf.length(), field->charset()); + out->alloc(field->pack_length()); + uint bytes= field->get_key_image((uchar*)out->ptr(), + field->key_length(), Field::itRAW); + out->length(bytes); + return false; +} + + /* @brief Parse a JSON reprsentation for one histogram bucket @@ -619,6 +661,30 @@ int Histogram_json_hb::parse_bucket(json_engine_t *je, Field *field, } save1.restore_to(je); + // Less common endoints: + Json_string start_hex_str("start_hex"); + if (json_key_matches(je, start_hex_str.get())) + { + if (read_hex_bucket_endpoint(je, field, &value_buf, err)) + return 1; + + have_start= true; + continue; + } + save1.restore_to(je); + + Json_string end_hex_str("end_hex"); + if (json_key_matches(je, end_hex_str.get())) + { + if (read_hex_bucket_endpoint(je, field, &value_buf, err)) + return 1; + last_bucket_end_endp.assign(value_buf.ptr(), value_buf.length()); + *assigned_last_end= true; + continue; + } + save1.restore_to(je); + + // Some unknown member. Skip it. 
if (json_skip_key(je)) return 1; diff --git a/sql/opt_histogram_json.h b/sql/opt_histogram_json.h index a2f8bdd37a5..327c852db98 100644 --- a/sql/opt_histogram_json.h +++ b/sql/opt_histogram_json.h @@ -32,12 +32,18 @@ "histogram_hb": [ { "start": "value", "size":nnn.nn, "ndv": nnn }, ... - { "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"} + + // Optionally, start and/or end can be replaced with _hex variant + { "start_hex": "value", "size":nnn.nn, "ndv":nnn}, + + ... + { "start": "value", "size":nnn.nn, "ndv": nnn, "end": "value"}, ] } The histogram is an object with single member named Histogram_json_hb:: JSON_NAME. The value of that member is an array of buckets. + Each bucket is an object with these members: "start" - the first value in the bucket. "size" - fraction of table rows that is contained in the bucket. @@ -51,6 +57,11 @@ The exception is single-point buckets where last value is the same as the first value. + + start/end can be replaced with start_hex/end_hex. In _hex variant, the + constant is encoded in hex. This encoding is used to handle so called + "unassigned characters": some non-UTF8 charsets have byte combinations that + are not mapped to any UTF8 character. */ class Histogram_json_hb : public Histogram_base
participants (1)
-
psergey