New subject: 8d51c6d234b: MDEV-30164 System variable for default collations

23 Jun 2023

Hi, Alexander,

Please, see comments below.
Sorry, it took a while.

On Jun 23, Alexander Barkov wrote:
...
revision-id: 8d51c6d234b (mariadb-10.11.2-13-g8d51c6d234b)
parent(s): ceb0e7f944b
author: Alexander Barkov
committer: Alexander Barkov
timestamp: 2023-03-21 10:07:57 +0400
message:
MDEV-30164 System variable for default collations

diff --git a/mysql-test/main/ctype_collate_implicit.test b/mysql-test/main/ctype_collate_implicit.test
--- /dev/null
+++ b/mysql-test/main/ctype_collate_implicit.test
@@ -0,0 +1,209 @@
+--source include/have_utf8.inc
+--source include/have_utf8mb4.inc
+
+--echo #
+--echo # MDEV-30164 System variable for default collations
+--echo #
+
+SET @@character_set_collations= ' utf8mb3 = utf8mb3_bin ,  LATIN1 = LATIN1_BIN ';
also, please, test that parsing of commas is relaxed too. Like

  set @@character_set_collations= ',,,utf8mb3 = utf8mb3_bin,,latin1 = latin1_bin,,,';

it makes it easier for the user to edit the value.
...
+SELECT @@character_set_collations;
+SET @@character_set_collations='';
+SELECT @@character_set_collations;
+
+SET @@character_set_collations='utf8mb3=utf8mb3_bin';
+SELECT @@character_set_collations;
+SET @@character_set_collations='';
+
+--error ER_COLLATION_CHARSET_MISMATCH
+SET @@character_set_collations='utf8mb3=utf8mb4_general_ci';
+SELECT @@character_set_collations;
+
+--error ER_COLLATION_CHARSET_MISMATCH
+SET @@character_set_collations='utf8mb4=utf8mb3_general_ci';
+SELECT @@character_set_collations;
+
+SET @@character_set_collations='utf8mb3=utf8mb3_general_ci';
+SELECT @@character_set_collations;
+
+SET @@character_set_collations='utf8mb4=utf8mb4_general_ci,latin1=latin1_bin';
+SELECT @@character_set_collations;
+
+--error ER_COLLATION_CHARSET_MISMATCH
+SET @@character_set_collations='utf8mb4=uca1400_ai_ci,latin1=uca1400_ai_ci';
+
+# All or nothing is set. "Nothing" in this case because of the error on latin1.
+# The "uca1400_ai_ci FOR utf8mb4" part was ignored.
+SELECT @@character_set_collations;
+SELECT @@character_set_collations RLIKE 'utf8mb4=utf8mb4_general_ci' AS expect_true;
+
+
+SET @@character_set_collations='utf8mb4=uca1400_ai_ci';
+SELECT @@character_set_collations;
+
+SET NAMES utf8mb4;
+SELECT @@collation_connection;
+
+# We have to disable --view-protocol for the following statement.
+# 'mtr --view-protocol' creates a separate connection for these statements:
+#   CREATE VIEW mysqltest_tmp_sp AS ...;
+#   DROP VIEW mysqltest_tmp_sp;
+# The current @@character_set_collations does not affect this connection.
+# So --view-protocol would return the hard-coded character set collation here,
+# instead of utf8mb4_uca1400_ai_ci
+
+--disable_view_protocol
+SELECT collation('literal');
+--enable_view_protocol
+EXECUTE IMMEDIATE 'SELECT COLLATION(?)' USING 'literal';
+
+CREATE VIEW v1 AS SELECT 'literal', collation('literal') as cl;
+SHOW CREATE VIEW v1;
+SELECT * FROM v1;
+DROP VIEW v1;
+
+
+# Override @@collation_connection to utf8mb4_general_ci.
+# Make sure that CREATE statements does not use @@collation_connection.
+# to detect implicit collations.
+# Implicit collations are detected using @@character_set_collations!
+
+SET NAMES utf8mb4 COLLATE utf8mb4_general_ci;
+
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4 COLLATE DEFAULT);
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (a TEXT COLLATE DEFAULT) CHARACTER SET utf8mb4;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+CREATE TABLE t1 (a TEXT) CHARACTER SET utf8mb4;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+CREATE DATABASE db1 CHARACTER SET utf8mb4;
+SHOW CREATE DATABASE db1;
+DROP DATABASE db1;
+
+
+# Test how @@character_set_collations affects various expressions
+# with implicit collations.
+
+
+let query=SELECT
+  @@collation_connection AS conn,
+  COLLATION('a') AS lit,
+  COLLATION(CONCAT(1)) AS num,
+  COLLATION(CAST(123 AS CHAR)) AS casti,
+  COLLATION(_utf8mb4'a') AS litu,
+  COLLATION(_utf8mb4 0x62) AS lituh,
+  COLLATION(_utf8mb4 X'63') AS lituhs,
+  COLLATION(CAST(123 AS CHAR CHARACTER SET utf8mb4)) AS castic,
+  COLLATION(CHAR(0x61 USING utf8mb4)) AS chr,
+  COLLATION(CONVERT('a' USING utf8mb4)) AS conv;
+
+# The below SET NAMES sets @@collation_connection to utf8mb4_general_ci.
+# But @@character_set_collations still contains utf8mb4=uca1400_ai_ci.
+
+SET NAMES utf8mb4 COLLATE utf8mb4_general_ci;
+
+# Columns expected to print utf8mb4_general_ci
+# because they use @@collation_connection:
+# - String literals without introducers
+# - Automatic number-to-string conversions
+# - CAST(AS CHAR) - without USING
+#
+# Columns expected to print utf8mb4_uca1400_ai_ci
+# because they use the current session default collation
+# for the character set (as specified in @@collation_connection)
+# - String literals with introducers
+# - CAST(AS CHAR USING cs)
+# - CHAR()
+# - CONVERT()
+
+--vertical_results
+--eval $query;
+--horizontal_results
+
+# This sets collation_connection to utf8mb4_uca1400_ai_ci
+# according to @@character_set_collations.
+SET NAMES utf8mb4;
+
+# Now all columns are expected to print utf8mb4_uca1400_ai_ci:
+# - Some columns because @@collation_connection says so
+# - Some columns because @@character_set_collations says so.
+
+--vertical_results
+--eval $query;
+--horizontal_results
+
+
+#
+# INFORMATION_SCHEMA
+#
+
+SET character_set_collations='latin1=latin1_bin,utf8mb4=uca1400_ai_ci';
+SHOW CHARACTER SET LIKE 'latin1';
+SELECT * FROM INFORMATION_SCHEMA.CHARACTER_SETS
+WHERE CHARACTER_SET_NAME='latin1';
+
+SHOW COLLATION LIKE 'latin1%';
+SELECT COLLATION_NAME, IS_DEFAULT
+FROM INFORMATION_SCHEMA.COLLATIONS
+WHERE CHARACTER_SET_NAME LIKE 'latin1%';
+SELECT COLLATION_NAME, FULL_COLLATION_NAME, IS_DEFAULT
+FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
+WHERE COLLATION_NAME LIKE 'latin1%';
+
+SHOW CHARACTER SET LIKE 'utf8mb4';
+SELECT * FROM INFORMATION_SCHEMA.CHARACTER_SETS
+WHERE CHARACTER_SET_NAME='utf8mb4';
+
+SHOW COLLATION LIKE '%uca1400_ai_ci%';
+SELECT COLLATION_NAME, IS_DEFAULT
+FROM INFORMATION_SCHEMA.COLLATIONS
+WHERE COLLATION_NAME LIKE '%uca1400_ai_ci%';
+SELECT COLLATION_NAME, FULL_COLLATION_NAME, IS_DEFAULT
+FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY
+WHERE COLLATION_NAME LIKE '%uca1400_ai_ci%';
+
+#
+# Prepared statements: reprepare on @@character_set_collations change.
+#
+
+SET @@character_set_collations='';
+PREPARE stmt FROM 'SELECT '
+                  'COLLATION(CAST("x" AS CHAR CHARACTER SET utf8mb3)) AS a, '
+                  'COLLATION(_utf8mb3"x") AS b';
+EXECUTE stmt;
+SET @@character_set_collations='utf8mb3=utf8mb3_bin';
+EXECUTE stmt;
+
+SET @@character_set_collations='utf8mb3=utf8mb3_bin';
+PREPARE stmt FROM 'SELECT '
+                  'COLLATION(CAST("x" AS CHAR CHARACTER SET utf8mb3)) AS a, '
+                  'COLLATION(_utf8mb3"x") AS b';
+EXECUTE stmt;
+SET @@character_set_collations=DEFAULT;
+EXECUTE stmt;
+
+SET NAMES utf8mb3;
+SET @@character_set_collations='';
+PREPARE stmt FROM 'CREATE TABLE t1 '
+    '(a TEXT CHARACTER SET utf8mb3 COLLATE DEFAULT COLLATE utf8mb3_general_ci)';
+EXECUTE stmt;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
+
+SET @@character_set_collations='utf8mb3=utf8mb3_bin';
+--error ER_CONFLICTING_DECLARATIONS
+EXECUTE stmt;
+
+SET @@character_set_collations='';
+EXECUTE stmt;
+SHOW CREATE TABLE t1;
+DROP TABLE t1;
diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
--- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
@@ -502,6 +502,16 @@ NUMERIC_BLOCK_SIZE	NULL
 ENUM_VALUE_LIST	NULL
 READ_ONLY	NO
 COMMAND_LINE_ARGUMENT	NULL
+VARIABLE_NAME	CHARACTER_SET_COLLATIONS
+VARIABLE_SCOPE	SESSION
+VARIABLE_TYPE	VARCHAR
+VARIABLE_COMMENT	Default collations for character sets
Better to emphasize somehow that it's not a complete list of
default collations, but adjustments to the compiled-in list
of default collations. If this variable is empty, it doesn't mean
that character sets have no default collations.

I'm not sure how to put this succintly in the help text though :)
...
+NUMERIC_MIN_VALUE	NULL
+NUMERIC_MAX_VALUE	NULL
+NUMERIC_BLOCK_SIZE	NULL
+ENUM_VALUE_LIST	NULL
+READ_ONLY	NO
+COMMAND_LINE_ARGUMENT	NULL
 VARIABLE_NAME	CHARACTER_SET_CONNECTION
 VARIABLE_SCOPE	SESSION
 VARIABLE_TYPE	ENUM
diff --git a/sql/lex_charset.cc b/sql/lex_charset.cc
--- a/sql/lex_charset.cc
+++ b/sql/lex_charset.cc
@@ -447,6 +451,17 @@ Lex_exact_charset_opt_extended_collate::find_default_collation() const
 }
+CHARSET_INFO *
+Lex_exact_charset_opt_extended_collate::
+  find_mapped_default_collation(Charset_collation_map_st::Used *used,
+                                const Charset_collation_map_st &map) const
+{
+  CHARSET_INFO *cs= find_compiled_default_collation();
+  if (!cs)
+    return nullptr;
+  return map.get_collation_for_charset(used, cs);
This seems a bit redundant. find_compiled_default_collation()
will do get_charset_by_csname(m_ci, MY_CS_PRIMARY)
and only then you'll be able to search, because you map
default compiled collation id to use specified collation id.

but conceptually you need to map a charset to a collation,
not a collation to a collation. If you'd map, say,
cs->cs_name.str (just the pointer, not the string value)
or, say, cs->cset, then you'd be mapping a charset to a collation.
and here you'd be able to do just

    return map.get_collation_for_charset(used, m_ci)

with find_compiled_default_collation() being done inside
get_collation_for_charset() and only when Elem_st wasn't found
...
+}
+
 /*
   Resolve an empty or a contextually typed collation according to the
   upper level default character set (and optionally a collation), e.g.:
diff --git a/sql/lex_charset.h b/sql/lex_charset.h
--- a/sql/lex_charset.h
+++ b/sql/lex_charset.h
@@ -544,6 +561,21 @@ struct Lex_exact_charset_extended_collation_attrs_st
   {
     return m_ci;
   }
+  CHARSET_INFO *charset_info(Charset_collation_map_st::Used *used,
+                             const Charset_collation_map_st &map) const
+  {
+    switch (m_type)
+    {
+    case TYPE_CHARACTER_SET:
+      return map.get_collation_for_charset(used, m_ci);
Why do you need this special charset_info(used,map)
that differs from charset_info() when m_type==TYPE_CHARACTER_SET,
and is used in set_charset_collation_attrs() only?
...
+    case TYPE_EMPTY:
+    case TYPE_CHARACTER_SET_COLLATE_EXACT:
+    case TYPE_COLLATE_CONTEXTUALLY_TYPED:
+    case TYPE_COLLATE_EXACT:
+      break;
+    }
+    return m_ci;
+  }
   Type type() const
   {
     return m_type;
diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -1989,6 +1997,16 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
         thd->variables.sql_mode=
           (sql_mode_t) ((thd->variables.sql_mode & MODE_NO_DIR_IN_CREATE) |
                         (sql_mode & ~(sql_mode_t) MODE_NO_DIR_IN_CREATE));
+
+      size_t cslen= thd->variables.character_set_collations.from_binary(
+                                      character_set_collations.str,
+                                      character_set_collations.length);
+      if (cslen != character_set_collations.length)
+      {
+        thd->variables.character_set_collations.init();
+        goto compare_errors; // QQ: report an error here?
is it corrupted event or unknown collation?
...
+      }
+
       if (charset_inited)
       {
         rpl_sql_thread_info *sql_info= thd->system_thread_info.rpl_sql_info;
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -6049,6 +6049,27 @@ mysql_execute_command(THD *thd, bool is_called_from_prepared_stmt)
       }
       thd->reset_kill_query();
     }
+
+    /*
+      If a non-default collation (in @@character_set_collations)
+      was used during the statement, the mysqlbinlog output for
+      the current statement will contain a sequence like this:
+
+        SET character_set_collations='utf8mb3=utf8mb3_bin';
+        INSERT INTO t1 VALUES (_utf8mb3'test');
+        COMMIT;
+
+      The statment (INSERT in this example) is already in binlog at this point, and the
+      and the "SET character_set_collations" is written inside a
+      Q_CHARACTER_SET_COLLATIONS chunk in its log entry header.
+      The flag CHARACTER_SET_COLLATIONS_USED is not needed any more.
+
+      Let's suppress the flag to avoid a Q_CHARACTER_SET_COLLATIONS chunk
+      inside the COMMIT log entry header - it would be useless and would
+      only waste space in the binary log.
+    */
+    thd->used&= ~THD::CHARACTER_SET_COLLATIONS_USED;
Wouldn't that logic apply to other THD::xxx_USED flags?
May be you should reset thd->used=0 here?
...
+
     if (unlikely(thd->is_error()) ||
         (thd->variables.option_bits & OPTION_MASTER_SQL_ERROR))
     {
diff --git a/sql/sql_table.cc b/sql/sql_table.cc
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -12457,8 +12459,10 @@ bool HA_CREATE_INFO::
I'd prefer you didn't put a line break after ::
see how useful it makes the hunk header?
...
{
     // Make sure we don't do double resolution in direct SQL execution
     DBUG_ASSERT(!default_table_charset || thd->stmt_arena->is_stmt_execute());
+    Character_set_collations_used used(thd);
     if (!(default_table_charset=
-            default_cscl.resolved_to_context(ctx)))
+            default_cscl.resolved_to_context(&used,
+                           thd->variables.character_set_collations, ctx)))
       return true;
   }
diff --git a/sql/sql_class.h b/sql/sql_class.h
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -5620,6 +5624,29 @@ class THD: public THD_count, /* this must be first */
 };
+class Character_set_collations_used: public Charset_collation_map_st::Used
+{
+  THD *m_thd;
+public:
+  Character_set_collations_used(THD *thd)
+   :m_thd(thd)
+  { }
+  ~Character_set_collations_used()
+  {
+    /*
+      Mark THD that the collation map was used,
+      no matter if a compiled or a mapped collation was
why no matter? as far as I understand, you didn't plan to change
compiled-in defaults
...
+      found during charset->collation resolution.
+      Even if the map was empty, we still need to print
+        SET @@session.character_set_collations='';
+      in mariadb-binlog output.
+    */
+    if (m_used)
+      m_thd->used|= THD::CHARACTER_SET_COLLATIONS_USED;
+  }
+};
+
+
 /*
   Start a new independent transaction for the THD.
   The old one is stored in this object and restored when calling
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -3538,6 +3544,13 @@ static void mysql_stmt_execute_common(THD *thd,
     DBUG_VOID_RETURN;
   }
+  if (stmt->prepare_time_charset_collation_map_version() !=
+      thd->variables.character_set_collations.version())
+  {
+    if (stmt->reprepare())
+      DBUG_VOID_RETURN;
+  }
why do you reprepare here and now where reprepare() is normally happens,
in the Prepared_statement::execute_loop()?
...
+
   /*
     In case of direct execution application decides how many parameters
     to send.
@@ -5220,6 +5252,13 @@ bool Prepared_statement::execute(String *expanded_query, bool open_cursor)
       MYSQL_QUERY_EXEC_START(thd->query(), thd->thread_id, thd->get_db(),
                              &thd->security_ctx->priv_user[0],
                              (char *) thd->security_ctx->host_or_ip, 1);
+      /*
+        If PREPARE used @@character_set_collations,
+        then we need to make sure binary log writes
+        the map in the event header.
+      */
why is it different from other THD::xx_USED flags?
...
+      thd->used|= m_prepare_time_thd_used_flags &
+                  THD::CHARACTER_SET_COLLATIONS_USED;
       error= mysql_execute_command(thd, true);
       MYSQL_QUERY_EXEC_DONE(error);
       thd->update_server_status();
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -436,6 +436,115 @@ static bool update_auto_increment_increment (sys_var *self, THD *thd, enum_var_t
#endif /* WITH_WSREP */
+
+class Sys_var_charset_collation_map: public sys_var
in sys_var.inl, please
...
+{
+public:
+  Sys_var_charset_collation_map(const char *name_arg, const char *comment,
+                                int flag_args, ptrdiff_t off, size_t size,
+                                CMD_LINE getopt,
+                                enum binlog_status_enum binlog_status_arg)
+   :sys_var(&all_sys_vars, name_arg, comment,
+            flag_args, off, getopt.id, getopt.arg_type,
+            SHOW_CHAR,
+            DEFAULT(0), nullptr, binlog_status_arg,
+            nullptr, nullptr, nullptr)
+  {
+    option.var_type|= GET_STR;
+  }
+
+private:
+
+  static bool charset_collation_map_from_item(Charset_collation_map_st *map,
+                                              Item *item,
+                                              myf utf8_flag)
+  {
+    String *value, buffer;
+    if (!(value= item->val_str_ascii(&buffer)))
+      return true;
+    return map->from_text(value->to_lex_cstring(), utf8_flag);
+  }
+
+  static const uchar *make_value_ptr(THD *thd,
+                                     const Charset_collation_map_st &map)
+  {
+    size_t nbytes= map.text_format_nbytes_needed();
+    char *buf= (char *) thd->alloc(nbytes);
+    size_t length= map.print(buf, nbytes);
+    return (uchar *) thd->strmake(buf, length);
Eh, sorry. I don't understand. You allocate memory twice?
for two copies of the string?
...
+  }
+
+
+private:
+
+  bool do_check(THD *thd, set_var *var) override
+  {
+    Charset_collation_map_st map;
+    return charset_collation_map_from_item(&map, var->value,
+                                           thd->get_utf8_flag());
+  }
+
+  void session_save_default(THD *thd, set_var *var) override
+  {
+    thd->variables.character_set_collations.set(
+      global_system_variables.character_set_collations, 1);
+  }
+
+  void global_save_default(THD *thd, set_var *var) override
+  {
+    global_system_variables.character_set_collations.init();
+  }
+
+  bool session_update(THD *thd, set_var *var) override
+  {
+    Charset_collation_map_st map;
+    if (!var->value)
+    {
+      session_save_default(thd, var);
+      return false;
+    }
+    if (charset_collation_map_from_item(&map, var->value, thd->get_utf8_flag()))
The idea is that var->value is only evaluated in check and is not
reevaluated in update. check is supposed to store the value in set_var *var
for update to use.
...
+      return true;
+    thd->variables.character_set_collations.set(map, 1);
+    return false;
+  }
+
+  bool global_update(THD *thd, set_var *var) override
+  {
+    Charset_collation_map_st map;
+    if (!var->value)
+    {
+      global_save_default(thd, var);
+      return false;
+    }
+    if (charset_collation_map_from_item(&map, var->value, thd->get_utf8_flag()))
+      return true;
+    global_system_variables.character_set_collations= map;
+    return false;
+  }
+
+  const uchar *
+  session_value_ptr(THD *thd, const LEX_CSTRING *base) const override
+  {
+    return make_value_ptr(thd, thd->variables.character_set_collations);
+  }
+
+  const uchar *
+  global_value_ptr(THD *thd, const LEX_CSTRING *base) const override
+  {
+    return make_value_ptr(thd, global_system_variables.
+                                 character_set_collations);
+  }
+};
+
+
+static Sys_var_charset_collation_map Sys_character_set_collations(
+       "character_set_collations",
+       "Default collations for character sets",
+       SESSION_VAR(character_set_collations),
+       NO_CMD_LINE, NOT_IN_BINLOG);
+
+
 static Sys_var_double Sys_analyze_sample_percentage(
        "analyze_sample_percentage",
        "Percentage of rows from the table ANALYZE TABLE will sample "
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -9686,6 +9713,9 @@ column_default_non_parenthesized_expr:
           }
         | CONVERT_SYM '(' expr USING charset_name ')'
           {
+            Character_set_collations_used used(thd);
+            $5= thd->variables.character_set_collations.
+                                 get_collation_for_charset(&used, $5);
this is rather awkward pattern that you use everywhere, with

  Character_set_collations_used used(thd);

why not to pass thd as an argument instead?


-            Character_set_collations_used used(thd);
             $5= thd->variables.character_set_collations.
-                                 get_collation_for_charset(&used, $5);
+                                 get_collation_for_charset(thd, $5);
...
$$= new (thd->mem_root) Item_func_conv_charset(thd, $3, $5);
             if (unlikely($$ == NULL))
               MYSQL_YYABORT;
diff --git a/sql/simple_tokenizer.h b/sql/simple_tokenizer.h
--- /dev/null
+++ b/sql/simple_tokenizer.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2022, MariaDB Corporation.
2023
...
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
+
+#ifndef SIMPLE_TOKENIZER_INCLUDED
+#define SIMPLE_TOKENIZER_INCLUDED
+
+
+class Simple_tokenizer
+{
+  const char *m_ptr;
+  const char *m_end;
+public:
+  Simple_tokenizer(const char *str, size_t length)
+   :m_ptr(str), m_end(str + length)
+  { }
+  const char *ptr() const
+  {
+    return m_ptr;
+  }
+  bool eof() const
+  {
+    return m_ptr >= m_end;
+  }
+  void get_spaces()
+  {
+    for ( ; !eof(); m_ptr++)
+    {
+      if (m_ptr[0] != ' ')
+        break;
+    }
+  }
+  bool is_ident_start(char ch) const
+  {
+    return (ch >= 'a' && ch <= 'z') ||
+           (ch >= 'A' && ch <= 'Z') ||
+           ch == '_';
+  }
+  bool is_ident_body(char ch) const
+  {
+    return is_ident_start(ch) ||
+           (ch >= '0' && ch <= '9');
+  }
+  bool is_ident_start() const
+  {
+    return !eof() && is_ident_start(*m_ptr);
+  }
+  bool is_ident_body() const
+  {
+    return !eof() && is_ident_body(*m_ptr);
+  }
+  LEX_CSTRING get_ident()
+  {
+    if (!is_ident_start())
+      return {m_ptr,0};
+    const char *start= m_ptr++;
+    for ( ; is_ident_body(); m_ptr++)
+    { }
+    LEX_CSTRING res= {start, (size_t) (m_ptr - start)};
+    return res;
+  }
+  bool get_char(char ch)
+  {
+    if (eof() || *m_ptr != ch)
+      return true;
+    m_ptr++;
+    return false;
+  }
+};
that's too big and complex, but also not complex enough.
note that we have quite a few of those already, in patricular in
typelib.c, in strfunc.cc, item_func.cc (find_in_set) etc.

and this one could've been 2-3 times smaller for @@character_set_collations
only, and it still cannot replace any other parsers from above.

I'd keep just two methods: get_token() - which skips whitespaces
and returns the next substring of non-white-space and not ',' or '='
characters. multibyte-safe. And get_char, which returns the next
non-white-space character.

this parser could be used in all use cases from above.
...
+
+
+#endif // SIMPLE_TOKENIZER_INCLUDED
diff --git a/sql/charset_collations.h b/sql/charset_collations.h
--- /dev/null
+++ b/sql/charset_collations.h
@@ -0,0 +1,265 @@
+/* Copyright (c) 2023, MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
+
+#ifndef LEX_CHARSET_COLLATIONS_INCLUDED
+#define LEX_CHARSET_COLLATIONS_INCLUDED
+
+struct Charset_collation_map_st
+{
+public:
+
+  class Used
+  {
+  public:
+    enum map_used_t
+    {
+      USED_NONE= 0,
+      USED_COMPILED_COLLATION= 1 << 0,
+      USED_MAPPED_COLLATION=   1 << 1
+    };
+  protected:
+    map_used_t m_used;
+  public:
+    Used()
+     :m_used(USED_NONE)
+    { }
+    void add(map_used_t flag)
+    {
+      m_used= (map_used_t) ((uint) m_used | (uint) flag);
+    }
+  };
+
+  struct Elem_st
+  {
+  protected:
+    CHARSET_INFO *m_charset;
+    CHARSET_INFO *m_collation;
What is the difference between charset and collation?
I was under impression that any CHARSET_INFO describes a collation,
there is no special structure just for a charset. Am I wrong?
...
+    static size_t print_lex_string(char *dst, const LEX_CSTRING &str)
+    {
+      memcpy(dst, str.str, str.length);
+      return str.length;
+    }
+  public:
+    /*
+      Size in text format: 'utf8mb4=utf8mb4_unicode_ai_ci'
+    */
+    static constexpr size_t text_size_max()
+    {
+       return MY_CS_CHARACTER_SET_NAME_SIZE + 1 +
+              MY_CS_COLLATION_NAME_SIZE;
+    }
+    CHARSET_INFO *charset() const
+    {
+      return m_charset;
+    }
+    CHARSET_INFO *collation() const
+    {
+      return m_collation;
+    }
+    void set_collation(CHARSET_INFO *cl)
+    {
+      m_collation= cl;
+    }
+    size_t print(char *dst) const
+    {
+      const char *dst0= dst;
+      dst+= print_lex_string(dst, m_charset->cs_name);
+      *dst++= '=';
+      dst+= print_lex_string(dst, m_collation->coll_name);
+      return (size_t) (dst - dst0);
+    }
+    int cmp_by_charset_id(const Elem_st &rhs) const
+    {
+      return m_charset->number < rhs.m_charset->number ? -1 :
+             m_charset->number > rhs.m_charset->number ? +1 : 0;
+    }
+  };
+  class Elem: public Elem_st
+  {
+  public:
+    Elem(CHARSET_INFO *charset, CHARSET_INFO *collation)
+    {
+      m_charset= charset;
+      m_collation= collation;
+    }
+  };
+protected:
+  Elem_st m_element[8]; // Should be enough for now
+  uint m_count;
+  uint m_version;
+
+  static int cmp_by_charset_id(const void *a, const void *b)
+  {
+    return static_cast<const Elem_st*>(a)->
+             cmp_by_charset_id(*static_cast<const Elem_st*>(b));
+  }
+
+  void sort()
+  {
+    qsort(m_element, m_count, sizeof(Elem_st), cmp_by_charset_id);
+  }
+
+  const Elem_st *find_elem_by_charset_id(uint id) const
+  {
+    if (!m_count)
+      return NULL;
+    int first= 0, last= ((int) m_count) - 1;
+    for ( ; first <= last; )
+    {
+      const int middle= (first + last) / 2;
+      DBUG_ASSERT(middle >= 0);
+      DBUG_ASSERT(middle < (int) m_count);
+      const uint middle_id= m_element[middle].charset()->number;
+      if (middle_id == id)
+        return &m_element[middle];
+      if (middle_id < id)
+        first= middle + 1;
+      else
+        last= middle - 1;
+    }
+    return NULL;
+  }
+
+  bool insert(const Elem_st &elem)
+  {
+    DBUG_ASSERT(elem.charset()->state & MY_CS_PRIMARY);
+    if (m_count >= array_elements(m_element))
+      return true;
+    m_element[m_count]= elem;
+    m_count++;
+    sort();
+    return false;
+  }
+
+  bool insert_or_replace(const Elem_st &elem)
+  {
+    DBUG_ASSERT(elem.charset()->state & MY_CS_PRIMARY);
+    const Elem_st *found= find_elem_by_charset_id(elem.charset()->number);
+    if (found)
+    {
+      const_cast<Elem_st*>(found)->set_collation(elem.collation());
+      return false;
+    }
+    return insert(elem);
+  }
+
+public:
+  void init()
+  {
+    m_count= 0;
+    m_version= 0;
+  }
+  uint count() const
+  {
+    return m_count;
+  }
+  uint version() const
+  {
+    return m_version;
+  }
+  void set(const Charset_collation_map_st &rhs, uint version_increment)
+  {
+    uint version= m_version;
+    *this= rhs;
+    m_version= version + version_increment;
+  }
+  const Elem_st & operator[](uint pos) const
+  {
+    DBUG_ASSERT(pos < m_count);
+    return m_element[pos];
+  }
+  bool insert_or_replace(const class Lex_exact_charset &cs,
+                         const class Lex_extended_collation &cl,
+                         bool error_on_conflicting_duplicate);
+  CHARSET_INFO *get_collation_for_charset(Used *used,
+                                          CHARSET_INFO *cs) const
+  {
+    DBUG_ASSERT(cs->state & MY_CS_PRIMARY);
+    const Elem_st *elem= find_elem_by_charset_id(cs->number);
+    if (elem)
+    {
+      used->add(Used::USED_MAPPED_COLLATION);
+      return elem->collation();
+    }
+    used->add(Used::USED_COMPILED_COLLATION);
+    return cs;
+  }
+  size_t text_format_nbytes_needed() const
+  {
+    return (Elem_st::text_size_max() + 1/* for ',' */) * m_count;
+  }
+  size_t print(char *dst, size_t nbytes_available) const
+  {
+    const char *dst0= dst;
+    const char *end= dst + nbytes_available;
+    for (uint i= 0; i < m_count; i++)
+    {
+      if (Elem_st::text_size_max() + 1/* for ',' */ > (size_t) (end - dst))
+        break;
+      if (i > 0)
+        *dst++= ',';
+      dst+= m_element[i].print(dst);
+    }
+    return dst - dst0;
+  }
+  static constexpr size_t binary_size_max()
+  {
+    return 1/*count*/ + 4 * array_elements(m_element);
+  }
+  size_t to_binary(char *dst) const
+  {
+    const char *dst0= dst;
+    *dst++= (char) (uchar) m_count;
+    for (uint i= 0; i < m_count; i++)
+    {
+      int2store(dst, (uint16) m_element[i].charset()->number);
+      dst+= 2;
+      int2store(dst, (uint16) m_element[i].collation()->number);
+      dst+= 2;
+    }
+    return (size_t) (dst - dst0);
+  }
+  size_t from_binary(const char *src, size_t srclen)
+  {
+    const char *src0= src;
+    init();
+    if (!srclen)
+      return 0; // Empty
+    uint count= (uchar) *src++;
+    if (srclen < 1 + 4 * count)
+      return 0;
+    for (uint i= 0; i < count; i++, src+= 4)
+    {
+      CHARSET_INFO *cs, *cl;
+      if (!(cs= get_charset(uint2korr(src), MYF(0))) ||
+          !(cl= get_charset(uint2korr(src + 2), MYF(0))))
+      {
+        /*
+          Unpacking from binary format happens on the slave side.
+          If for some reasons the slave does not know about a
+          character set or a collation, just skip the pair here.
+          This pair might not even be needed.
+        */
+        continue;
+      }
+      insert_or_replace(Elem(cs, cl));
+    }
+    return src - src0;
+  }
+  bool from_text(const LEX_CSTRING &str, myf utf8_flag);
+};
+
+
+#endif // LEX_CHARSET_COLLATIONS_INCLUDED
Regards,
Sergei
VP of MariaDB Server Engineering
and security@mariadb.org

    

Re: 8d51c6d234b: MDEV-30164 System variable for default collations

Sergei Golubchik

Alexander Barkov

Sergei Golubchik

Alexander Barkov

tags

participants (2)