Hi, Alexander! On Dec 08, Alexander Barkov wrote:
Hi Sergei,
Please review a patch for MDEV-7055.
Thanks.
diff --git a/sql/item_timefunc.cc b/sql/item_timefunc.cc index 522004e..a2a6fff 100644 --- a/sql/item_timefunc.cc +++ b/sql/item_timefunc.cc @@ -447,6 +447,70 @@ static bool extract_date_time(DATE_TIME_FORMAT *format,
/** + A multi-byte safe helper class to read characters from a string. + + QQ: Serg: which file to put this new class in? + It can be helpful for some other purposes + (not only here in item_timefunc.cc) + I remember you don't like such things in sql_string.h :)
Thoughts: 1. I'd call it, like, string (or character) iterator 2. There's something like that in 5.6, you might want to see what API they use 3. Should rather be in C (inline, if possible, with a C++ wrapper, if you'd like) so that we can put it in a service along with CHARSET_INFO 4. Should rather be in CHARSET_INFO so that you could have fast bytewise iterator for simple charsets. And yes, 3 and 4 contradict each other, to a certain extent :) That's because they're thoughts, not a complete how-to plan.
+*/ +class Wchar_reader +{ + CHARSET_INFO *m_cs; + const char *m_ptr; + const char *m_end; +public: + Wchar_reader(CHARSET_INFO *cs, const char *str, size_t length) + :m_cs(cs), m_ptr(str), m_end(str + length) + { } + bool eol() const { return m_ptr >= m_end; } + /** + Read a character, return its Unicode code point and octet length. + + @param [OUT] wc - a pointer to a Unicode code point variable + @param [OUT] chlen - a pointer to a character length variable + @return false - on success + @return true - on error (end of line, or a bad byte sequence) + + Converts negative lengths in the range -6..-1 + (which mb_wc() returns for valid but unassigned characters) + to positive lengths 1..6, so the caller does not have + to care about unassigned characters. The caller will just see + such characters as "U+003F QUESTION MARK", but with length + not necessarily equal to 1. + */ + bool read(my_wc_t *wc, int *chlen) + { + *chlen= m_cs->cset->mb_wc(m_cs, wc, (uchar *) m_ptr, (uchar *) m_end); + if (*chlen <= 0) + { + if (*chlen < -6 || *chlen == 0) + return true; // End of line, or a bad byte sequence + *chlen= -(*chlen); // An unassigned (but a valid) character found + *wc= '?'; // Initialize *wc to QUESTION MARK + } + m_ptr+= *chlen; // Shift the pointer to the next character. + return false; + } + /** + Read a character when its length is not important for the caller + @param [OUT] wc - a pointer to a Unicode code point variable + @return false - on succes + @return true - on error (end of line, or an invalid byte sequence) + */ + bool read(my_wc_t *wc) + { + int chlen; + return read(wc, &chlen); + } + /** + Return a ponter to the next character in the queue. + */ + const char *ptr() const { return m_ptr; } +}; + + +/** Create a formated date/time value in a string. */
@@ -457,21 +521,29 @@ static bool make_date_time(DATE_TIME_FORMAT *format, MYSQL_TIME *l_time, uint hours_i; uint weekday; ulong length; - const char *ptr, *end; + my_wc_t wc; + int chlen; + Wchar_reader reader(str->charset(), format->format.str, + format->format.length);
is str->charset() the character set of the format string?
- end= (ptr= format->format.str) + format->format.length; - for (; ptr != end ; ptr++) + for ( ; !reader.read(&wc, &chlen) ; )
You don't distinguish here between end of string and invalid character. Is that ok?
diff --git a/sql/sql_string.cc b/sql/sql_string.cc index 885f53a..21fba79 100644 --- a/sql/sql_string.cc +++ b/sql/sql_string.cc @@ -547,8 +556,28 @@ bool String::append_with_prefill(const char *s,uint32 arg_length, t_length= full_length - arg_length; if (t_length > 0) { - bfill(Ptr+str_length, t_length, fill_char); - str_length=str_length + t_length; + if (charset()->mbminlen == 1) + { + /* + An ASCII string can be appended directly + to an ASCII-compatible string. This includes + multi-byte character sets, like utf8, sjis, etc. + */
but (mbminlen == 1) doesn't necessarily mean "ASCII-compatible", remember "filename" charset? I thought you've had an "ASCII-compatible" property somewhere in the CHARSET_INFO.
+ bfill(Ptr+str_length, t_length, fill_char); + str_length=str_length + t_length; + } + else + { + /* + Needs conversion to append an ASCII string to ASCII-incompatible + character sets, such as ucs2, utf16, utf16le, utf32. + */ + for (int i= 0; i < t_length; i++) + { + if (append(&fill_char, 1, &my_charset_latin1)) + return true; + } + } } append(s, arg_length); return FALSE;
Regards, Sergei Chief Architect MariaDB and security@mariadb.org -- Vote for my Percona Live 2016 talks: https://www.percona.com/live/data-performance-conference-2016/sessions/maria... https://www.percona.com/live/data-performance-conference-2016/sessions/maria...