[Maria-developers] Please review MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring
Hi Sergei, Please review a partial patch for MDEV-6353. It removes my_mbcharlen() in all remaining pieces of the code, except LOAD DATA and SELECT INTO OUTFILE. I'll do LOAD DATA and SELECT INTO OUTFILE in a separate patch. Thanks.
Hi, Alexander! On Mar 30, Alexander Barkov wrote:
commit 4ab28aca964fa646aa55676db813dbed66b83093 Author: Alexander Barkov <bar@mariadb.org> Date: Mon Mar 28 11:05:51 2016 +0400
MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring, part 1. Fixing the debug_sync and create_options related code not to use my_mbcharlen(): - debug_sync_token() now uses cs->cset->scan(). Passing the end of the string pointer to debug_sync_update() in order to be able to use scan(). Adding support for a new pattern scan(MY_SEQ_NONSPACES). It does scans everything that scan(MY_SEQ_SPACES) does not. - Fixing set_one_value() to iterate bytes one by one. This is safe, because ',' cannot be a part of a multi-byte character in UTF8.
Looks ok, thanks! But don't push this commit alone, please. Wait until all parts of MDEV-6353 are ready and push them together. Regards, Sergei Chief Architect MariaDB and security@mariadb.org
Hello Sergei, On 03/30/2016 08:41 PM, Sergei Golubchik wrote:
Hi, Alexander!
On Mar 30, Alexander Barkov wrote:
commit 4ab28aca964fa646aa55676db813dbed66b83093 Author: Alexander Barkov <bar@mariadb.org> Date: Mon Mar 28 11:05:51 2016 +0400
MDEV-6353 my_ismbchar() and my_mbcharlen() refactoring, part 1. Fixing the debug_sync and create_options related code not to use my_mbcharlen(): - debug_sync_token() now uses cs->cset->scan(). Passing the end of the string pointer to debug_sync_update() in order to be able to use scan(). Adding support for a new pattern scan(MY_SEQ_NONSPACES). It does scans everything that scan(MY_SEQ_SPACES) does not. - Fixing set_one_value() to iterate bytes one by one. This is safe, because ',' cannot be a part of a multi-byte character in UTF8.
Looks ok, thanks! But don't push this commit alone, please. Wait until all parts of MDEV-6353 are ready and push them together.
Please review the final version, now removing all my_mbcharlen(). Thanks.
Regards, Sergei Chief Architect MariaDB and security@mariadb.org
Hi, Alexander! TL;DR: looks fine. Few minor comments, see below. On Apr 08, Alexander Barkov wrote:
diff --git a/include/m_ctype.h b/include/m_ctype.h index c892d576..bb633f8 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -1010,11 +1009,19 @@ int my_charlen(CHARSET_INFO *cs, const char *str, const char *end) return (cs->cset->charlen)(cs, (const uchar *) str, (const uchar *) end); } -#ifdef USE_MB -#define my_mbcharlen(s, a) ((s)->cset->mbcharlen((s),(a))) -#else -#define my_mbcharlen(s, a) 1 -#endif + + +/** + Convert broken and incomplete byte sequences to 1 byte. +*/ +static inline +int my_charlen_fix(CHARSET_INFO *cs, const char *str, const char *end) +{ + int char_length= my_charlen(cs, str, end); + DBUG_ASSERT(str < end); + return char_length > 0 ? (uint) char_length : 0U;
0U? or 1U? you convert to 1 byte, and below you change like - if (!(clen= my_mbcharlen(system_charset_info, c))) - clen= 1; + clen= my_charlen_fix(system_charset_info, name, name_end);
+} +
#define my_caseup_str(s, a) ((s)->cset->caseup_str((s), (a))) #define my_casedn_str(s, a) ((s)->cset->casedn_str((s), (a))) diff --git a/mysys/charset.c b/mysys/charset.c index 3c134dc..253dc72 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -909,8 +913,8 @@ size_t escape_string_for_mysql(CHARSET_INFO *charset_info, { char escape= 0; #ifdef USE_MB - int tmp_length; - if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end))) + int tmp_length= my_charlen(charset_info, from, end); + if (use_mb_flag && tmp_length > 1)
Old code didn't call my_ismbchar if use_mb_flag is false. Now you do. Was it intentional? Is it an issue?
{ if (to + tmp_length > to_end) { diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc index 8b3412e..e84f1e8 100644 --- a/sql/debug_sync.cc +++ b/sql/debug_sync.cc @@ -876,18 +876,19 @@ static char *debug_sync_token(char **token_p, uint *token_length_p, char *ptr) /* If necessary, terminate token. */ if (*ptr) { + DBUG_ASSERT(ptr < ptrend); /* Get terminator character length. */ - uint mbspacelen= my_mbcharlen(system_charset_info, (uchar) *ptr); + int mbspacelen= my_charlen(system_charset_info, ptr, ptrend);
why not my_charlen_fix() ?
/* Terminate token. */ *ptr= '\0';
/* Skip the terminator. */ - ptr+= mbspacelen; + ptr+= mbspacelen < 1 ? 1 : mbspacelen;
/* Skip trailing space */ - while (my_isspace(system_charset_info, *ptr)) - ptr+= my_mbcharlen(system_charset_info, (uchar) *ptr); + ptr+= system_charset_info->cset->scan(system_charset_info, + ptr, ptrend, MY_SEQ_SPACES); }
end: diff --git a/sql/debug_sync.h b/sql/debug_sync.h index bf1b316..339a211 100644 --- a/sql/debug_sync.h +++ b/sql/debug_sync.h @@ -45,6 +45,9 @@ extern void debug_sync_init_thread(THD *thd); extern void debug_sync_end_thread(THD *thd); extern bool debug_sync_set_action(THD *thd, const char *action_str, size_t len);
+extern bool debug_sync_update(THD *thd, char *val_str, size_t len); +extern uchar *debug_sync_value_ptr(THD *thd);
I clearly remember that monty has done it recently somewhere (I mean, this and including it into sys_vars.h or ic or cc)
+ #endif /* defined(ENABLED_DEBUG_SYNC) */
#endif /* DEBUG_SYNC_INCLUDED */ diff --git a/sql/sql_class.cc b/sql/sql_class.cc index e3b7056..e29608b 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -3226,7 +3226,7 @@ int select_export::send_data(List<Item> &items)
if ((NEED_ESCAPING(*pos) || (check_second_byte && - my_mbcharlen(character_set_client, (uchar) *pos) == 2 && + ((uchar) *pos) > 0x7F /* a potential MB2HEAD */ &&
Hmm, why do you do this very low-level charset stuff in sql_class.cc? one is supposed to use charset methods instead.
pos + 1 < end && NEED_ESCAPING(pos[1]))) && /* diff --git a/sql/sys_vars.ic b/sql/sys_vars.ic index 373f583..bb290b6 100644 --- a/sql/sys_vars.ic +++ b/sql/sys_vars.ic @@ -1462,15 +1465,21 @@ class Sys_var_debug_sync :public sys_var String str(buff, sizeof(buff), system_charset_info), *res;
if (!(res=var->value->val_str(&str))) + { var->save_result.string_value.str= const_cast<char*>(""); + var->save_result.string_value.length= 0;
use var->save_result.string_value= empty_lex_string;
+ } else + { var->save_result.string_value.str= thd->strmake(res->ptr(), res->length()); + var->save_result.string_value.length= res->length();
use thd->make_lex_string(&var->save_result.string_value, ...);
+ } return false; } bool session_update(THD *thd, set_var *var) { - extern bool debug_sync_update(THD *thd, char *val_str); - return debug_sync_update(thd, var->save_result.string_value.str); + return debug_sync_update(thd, var->save_result.string_value.str, + var->save_result.string_value.length); } bool global_update(THD *thd, set_var *var) { diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 74e474c..b5fab16 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1049,6 +1049,9 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)), { } return (size_t) (str - str0); + case MY_SEQ_NONSPACES: + DBUG_ASSERT(0);
why? impossible? or not implemented? please add a comment /* not implemented */ in that's the case.
+ /* pass through */ default: return 0; } @@ -2484,6 +2468,9 @@ my_scan_utf32(CHARSET_INFO *cs, str+= res; } return (size_t) (str - str0); + case MY_SEQ_NONSPACES: + DBUG_ASSERT(0);
same as above
+ /* pass through */ default: return 0; }
Regards, Sergei Chief Architect MariaDB and security@mariadb.org
Hi Sergei, Thanks for review. On 05/09/2016 10:30 PM, Sergei Golubchik wrote:
Hi, Alexander!
TL;DR: looks fine. Few minor comments, see below.
On Apr 08, Alexander Barkov wrote:
diff --git a/include/m_ctype.h b/include/m_ctype.h index c892d576..bb633f8 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -1010,11 +1009,19 @@ int my_charlen(CHARSET_INFO *cs, const char *str, const char *end) return (cs->cset->charlen)(cs, (const uchar *) str, (const uchar *) end); } -#ifdef USE_MB -#define my_mbcharlen(s, a) ((s)->cset->mbcharlen((s),(a))) -#else -#define my_mbcharlen(s, a) 1 -#endif + + +/** + Convert broken and incomplete byte sequences to 1 byte. +*/ +static inline +int my_charlen_fix(CHARSET_INFO *cs, const char *str, const char *end) +{ + int char_length= my_charlen(cs, str, end); + DBUG_ASSERT(str < end); + return char_length > 0 ? (uint) char_length : 0U;
0U? or 1U? you convert to 1 byte, and below you change like
- if (!(clen= my_mbcharlen(system_charset_info, c))) - clen= 1; + clen= my_charlen_fix(system_charset_info, name, name_end);
Good catch. Fixed.
+} +
#define my_caseup_str(s, a) ((s)->cset->caseup_str((s), (a))) #define my_casedn_str(s, a) ((s)->cset->casedn_str((s), (a))) diff --git a/mysys/charset.c b/mysys/charset.c index 3c134dc..253dc72 100644 --- a/mysys/charset.c +++ b/mysys/charset.c @@ -909,8 +913,8 @@ size_t escape_string_for_mysql(CHARSET_INFO *charset_info, { char escape= 0; #ifdef USE_MB - int tmp_length; - if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end))) + int tmp_length= my_charlen(charset_info, from, end); + if (use_mb_flag && tmp_length > 1)
Old code didn't call my_ismbchar if use_mb_flag is false. Now you do. Was it intentional? Is it an issue?
That was not a big issue. Anyway, I now removed "use_mb_flag" and changed the code like this: - int tmp_length; - if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end))) + int tmp_length= use_mb(charset_info) ? my_charlen(charset_info, from, end) : + 1; + if (tmp_length > 1) ... + if (tmp_length < 1) /* Bad byte sequence */
{ if (to + tmp_length > to_end) { diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc index 8b3412e..e84f1e8 100644 --- a/sql/debug_sync.cc +++ b/sql/debug_sync.cc @@ -876,18 +876,19 @@ static char *debug_sync_token(char **token_p, uint *token_length_p, char *ptr) /* If necessary, terminate token. */ if (*ptr) { + DBUG_ASSERT(ptr < ptrend); /* Get terminator character length. */ - uint mbspacelen= my_mbcharlen(system_charset_info, (uchar) *ptr); + int mbspacelen= my_charlen(system_charset_info, ptr, ptrend);
why not my_charlen_fix() ?
Thanks. It's perfectly suites here. Fixed.
/* Terminate token. */ *ptr= '\0';
/* Skip the terminator. */ - ptr+= mbspacelen; + ptr+= mbspacelen < 1 ? 1 : mbspacelen;
/* Skip trailing space */ - while (my_isspace(system_charset_info, *ptr)) - ptr+= my_mbcharlen(system_charset_info, (uchar) *ptr); + ptr+= system_charset_info->cset->scan(system_charset_info, + ptr, ptrend, MY_SEQ_SPACES); }
end: diff --git a/sql/debug_sync.h b/sql/debug_sync.h index bf1b316..339a211 100644 --- a/sql/debug_sync.h +++ b/sql/debug_sync.h @@ -45,6 +45,9 @@ extern void debug_sync_init_thread(THD *thd); extern void debug_sync_end_thread(THD *thd); extern bool debug_sync_set_action(THD *thd, const char *action_str, size_t len);
+extern bool debug_sync_update(THD *thd, char *val_str, size_t len); +extern uchar *debug_sync_value_ptr(THD *thd);
I clearly remember that monty has done it recently somewhere (I mean, this and including it into sys_vars.h or ic or cc)
I can see a prototype for debug_sunc_update() in 10.1, in the same place in debug_sync.h. It does not have a "len" parameter. So it will be a conflict during a 10.1 -> 10.2 merge. Please use the 10.2 version when merging.
+ #endif /* defined(ENABLED_DEBUG_SYNC) */
#endif /* DEBUG_SYNC_INCLUDED */ diff --git a/sql/sql_class.cc b/sql/sql_class.cc index e3b7056..e29608b 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -3226,7 +3226,7 @@ int select_export::send_data(List<Item> &items)
if ((NEED_ESCAPING(*pos) || (check_second_byte && - my_mbcharlen(character_set_client, (uchar) *pos) == 2 && + ((uchar) *pos) > 0x7F /* a potential MB2HEAD */ &&
Hmm, why do you do this very low-level charset stuff in sql_class.cc? one is supposed to use charset methods instead.
I'd been thinking how to fix this place for some time, then I decided to go simply with this test for >0x7F and wait for gb18030. When I add gb18030, It will be more clear what kind of virtual charset function is needed. Currently it works fine for all multi-byte charsets supported by OUTFILE.
pos + 1 < end && NEED_ESCAPING(pos[1]))) && /* diff --git a/sql/sys_vars.ic b/sql/sys_vars.ic index 373f583..bb290b6 100644 --- a/sql/sys_vars.ic +++ b/sql/sys_vars.ic @@ -1462,15 +1465,21 @@ class Sys_var_debug_sync :public sys_var String str(buff, sizeof(buff), system_charset_info), *res;
if (!(res=var->value->val_str(&str))) + { var->save_result.string_value.str= const_cast<char*>(""); + var->save_result.string_value.length= 0;
use var->save_result.string_value= empty_lex_string;
Fixed.
+ } else + { var->save_result.string_value.str= thd->strmake(res->ptr(), res->length()); + var->save_result.string_value.length= res->length();
use thd->make_lex_string(&var->save_result.string_value, ...);
Fixed.
+ } return false; } bool session_update(THD *thd, set_var *var) { - extern bool debug_sync_update(THD *thd, char *val_str); - return debug_sync_update(thd, var->save_result.string_value.str); + return debug_sync_update(thd, var->save_result.string_value.str, + var->save_result.string_value.length); } bool global_update(THD *thd, set_var *var) { diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index 74e474c..b5fab16 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1049,6 +1049,9 @@ my_scan_mb2(CHARSET_INFO *cs __attribute__((unused)), { } return (size_t) (str - str0); + case MY_SEQ_NONSPACES: + DBUG_ASSERT(0);
why? impossible? or not implemented? please add a comment /* not implemented */ in that's the case.
Added a comment.
+ /* pass through */ default: return 0; } @@ -2484,6 +2468,9 @@ my_scan_utf32(CHARSET_INFO *cs, str+= res; } return (size_t) (str - str0); + case MY_SEQ_NONSPACES: + DBUG_ASSERT(0);
same as above
Added a comment
+ /* pass through */ default: return 0; }
Regards, Sergei Chief Architect MariaDB and security@mariadb.org
participants (2)
-
Alexander Barkov
-
Sergei Golubchik