Hi, Alexander! On Feb 19, Alexander Barkov wrote:
Here's the first part, fixing only the problem reported in MDEV-7231.
Thanks! The fix is ok to push. Good solution with passing escape functions down. I only had few remarks about naming and comments, please see below.
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index 898e3ae..231f67e 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -434,6 +448,196 @@ void Lex_input_stream::body_utf8_append_literal(THD *thd, m_cpp_utf8_processed_ptr= end_ptr; }
+ + + +extern "C" { + +/** + Escape a character. Consequently puts "escape" and "wc" characters into + the destination utf8 string. + @param cs - the character set (utf8) + @param escape - the escape character (backslash, single quote, double quote) + @param wc - the character to be escaped + @param str - the destination string + @param end - the end of the destination string + @returns - a code according to the wc_mb() convension. +*/ +int my_wc_mb_utf8_with_escape(CHARSET_INFO *cs, my_wc_t escape, my_wc_t wc, + uchar *str, uchar *end) +{ + DBUG_ASSERT(escape > 0); + if (str + 1 >= end) + return MY_CS_TOOSMALL2; // Not enough space, need at least two bytes. + *str= escape; + int cnvres= my_charset_utf8_handler.wc_mb(cs, wc, str + 1, end); + if (cnvres > 0) + return cnvres + 1; // The character was normally put + if (cnvres == MY_CS_ILUNI) + return MY_CS_ILUNI; // Could not encode "wc" (e.g. non-BMP character) + DBUG_ASSERT(cnvres <= MY_CS_TOOSMALL); + return cnvres - 1; // Not enough space +} + + +/** + Optionally escape a character. + If "escape" is non-zero, then both "escape" and "wc" are put to + the destination string. Otherwise, only "wc" is put. + @param cs - the character set (utf8) + @param wc - the character to be optionally escaped + @param escape - the escape character, or 0 + @param ewc - the escaped replacement of "wc" (e.g. 't' for '\t') + @param str - the destination string + @param end - the end of the destination string + @returns - a code according to the wc_mb() conversion. +*/ +int my_wc_mb_utf8_opt_escape(CHARSET_INFO *cs, + my_wc_t wc, my_wc_t escape, my_wc_t ewc, + uchar *str, uchar *end) +{ + return escape ? my_wc_mb_utf8_with_escape(cs, escape, ewc, str, end) : + my_charset_utf8_handler.wc_mb(cs, wc, str, end); +} + +/** + Encode a character with optional backlash escaping and quote escaping. + Quote marks are escaped using another quote mark. + Additionally, if "escape" is non-zero, then special characters are + also escaped using "escape". + Otherwise (if "escape" is zero, e.g. in case of MODE_NO_BACKSLASH_ESCAPES), + then special characters are not escaped and handled as normal characters. + + @param cs - the character set (utf8) + @param wc - the character to be encoded + @param str - the destination string + @param end - the end of the destination string + @param quotation - the string delimiter (e.g. ' or ") + @param escape - the escape character (backslash, or 0) + @returns - a code according to the wc_mb() convension. +*/ +int my_wc_mb_utf8_escape(CHARSET_INFO *cs, my_wc_t wc, uchar *str, uchar *end, + my_wc_t quotation, my_wc_t escape)
it's not a *quotation*. Either *quotation mark* or *quote* or *quote mark*. A *quotation* is the text between *quotation marks* So, here, please, either "quote" or "sep" as everywhere else. I think, I'd prefer "sep" for consistency.
+{ + DBUG_ASSERT(escape == 0 || escape == '\\');
may be a DBUG_ASSERT(quotation == '"' || quotation == '\''); too?
+ switch (wc) { + case 0: return my_wc_mb_utf8_opt_escape(cs, wc, escape, '0', str, end); + case '\t': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 't', str, end); + case '\r': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'r', str, end); + case '\n': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'n', str, end); + case '\032': return my_wc_mb_utf8_opt_escape(cs, wc, escape, 'Z', str, end); + case '\'': + case '\"': + if (wc == quotation) + return my_wc_mb_utf8_with_escape(cs, wc, wc, str, end); + } + return my_charset_utf8_handler.wc_mb(cs, wc, str, end); // No escaping needed +} + + +/** wc_mb() compatible routines for all sql_mode and delimiter combinations */ +int my_wc_mb_utf8_escape_single_quote_and_backslash(CHARSET_INFO *cs, + my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', '\\'); +} + + +int my_wc_mb_utf8_escape_double_quote_and_backslash(CHARSET_INFO *cs, + my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '"', '\\'); +} + + +int my_wc_mb_utf8_escape_single_quote(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '\'', 0); +} + + +int my_wc_mb_utf8_escape_double_quote(CHARSET_INFO *cs, my_wc_t wc, + uchar *str, uchar *end) +{ + return my_wc_mb_utf8_escape(cs, wc, str, end, '"', 0); +} + +}; // End of extern "C" + + +/** + Get an escaping function, depending on the current sql_mode and the + string separator. +*/ +my_charset_conv_wc_mb +Lex_input_stream::get_escape_func(THD *thd, my_wc_t quotation) const
again, not "quotation"
+{ + return thd->backslash_escapes() ? + (quotation == '"' ? my_wc_mb_utf8_escape_double_quote_and_backslash: + my_wc_mb_utf8_escape_single_quote_and_backslash) : + (quotation == '"' ? my_wc_mb_utf8_escape_double_quote: + my_wc_mb_utf8_escape_single_quote); +} + + +/** + Append a text literal to the end of m_body_utf8. + The string is escaped according to the current sql_mode and the + string delimiter (e.g. ' or "). + + @param thd - current THD + @param txt - the string to be appended to m_body_utf8. + Note, the string must be already unescaped. + @param cs - the character set of the string + @param end_ptr - m_cpp_utf8_processed_ptr will be set to this value + (see body_utf8_append_literal for details) + @param quotation - the string delimiter (single or double quotation)
again, not "quotation"
+*/ +void Lex_input_stream::body_utf8_append_escape(THD *thd, + const LEX_STRING *txt, + CHARSET_INFO *cs, + const char *end_ptr, + my_wc_t quotation) +{ + DBUG_ASSERT(quotation == '\'' || quotation == '"'); + if (!m_cpp_utf8_processed_ptr) + return; + /** + In case of "SET NAMES binary; SELECT 'aaa';" or "SELECT _binary'xxx';", + we reinterpret the string as utf8, as calling wc_mb() for "binary" is + not reliable. + */ + if (cs == &my_charset_bin) + cs= &my_charset_utf8_general_ci;
But body_utf8_append_literal() doesn't do that. Is that a bug?
+ uint errors; + /** + We previously alloced m_body_utf8 to be able to store the query with all + strings properly escaped. See get_body_utf8_maximum_length(). + So here we have guaranteedly enough space to append any string literal + with escaping. Passing txt->length*2 as "available space" should be good + enough.
what do you mean "good enough"? This wording suggests that it's good enough in most cases, but some specially constructed literals might be longer. But it's impossible. Better write, like Passing txt->length*2 as "available space" is always safe here.
+ For better safety purposes we could calculate get_body_utf8_maximum_length() + every time we append a string, but this would affect performance negatively, + so let's check that we don't get beyond the allocated buffer in + debug build only. + */ + DBUG_ASSERT(m_body_utf8 + get_body_utf8_maximum_length(thd) >= + m_body_utf8_ptr + txt->length * 2); + uint32 cnv_length= my_convert_using_func(m_body_utf8_ptr, txt->length * 2, + &my_charset_utf8_general_ci, + get_escape_func(thd, quotation), + txt->str, txt->length, + cs, cs->cset->mb_wc, + &errors); + m_body_utf8_ptr+= cnv_length; + *m_body_utf8_ptr= 0; + m_cpp_utf8_processed_ptr= end_ptr; +} + + void Lex_input_stream::add_digest_token(uint token, LEX_YYSTYPE yylval) { if (m_digest != NULL) @@ -1541,23 +1753,23 @@ static int lex_one_token(YYSTYPE *yylval, THD *thd) } /* " used for strings */ case MY_LEX_STRING: // Incomplete text string + { + uint sep; - if (lip->get_text(&yylval->lex_str, 1, 1)) + if (lip->get_text(&yylval->lex_str, (sep= lip->yyGetLast()), 1, 1)) { state= MY_LEX_CHAR; // Read char by char break; } - + CHARSET_INFO *strcs= lip->m_underscore_cs ? lip->m_underscore_cs : cs; lip->body_utf8_append(lip->m_cpp_text_start);
- lip->body_utf8_append_literal(thd, &yylval->lex_str,
It seems that after your patch body_utf8_append_literal will be used only for identifiers. Perhaps, rename to body_utf8_append_ident ? A specialized version, only for identifiers? And then there is no need to pass charset down, it can always use thd->charset() implicitly, as identifiers are always in thd->charset().
- lip->m_underscore_cs ? lip->m_underscore_cs : cs, - lip->m_cpp_text_end); - + lip->body_utf8_append_escape(thd, &yylval->lex_str, strcs, + lip->m_cpp_text_end, sep); lip->m_underscore_cs= NULL;
lex->text_string_is_7bit= (lip->tok_bitmap & 0x80) ? 0 : 1; return(TEXT_STRING); - + } case MY_LEX_COMMENT: // Comment lex->select_lex.options|= OPTION_FOUND_COMMENT; while ((c = lip->yyGet()) != '\n' && c) ;
Regards, Sergei Chief Architect MariaDB and security@mariadb.org