Hi, Alexander, On Mar 14, Alexander Barkov wrote:
revision-id: 49ecf935415 (mariadb-10.6.1-335-g49ecf935415) parent(s): c67789f63c8 author: Alexander Barkov committer: Alexander Barkov timestamp: 2022-02-28 14:04:58 +0400 message:
MDEV-27009 Add UCA-14.0.0 collations
Please list all user-visible changes in the commit message. Mainly that collations are now decoupled from charsets, the new syntax in CREATE TABLE, changes in I_S tables, etc.
diff --git a/mysql-test/include/ctype_utf_uca1400_ids.inc b/mysql-test/include/ctype_utf_uca1400_ids.inc new file mode 100644 index 00000000000..09cf49fc0e7 --- /dev/null +++ b/mysql-test/include/ctype_utf_uca1400_ids.inc @@ -0,0 +1,17 @@
The file names are confusing. Better to rename ctype_ucs_uca1400_ids.inc to something like ctype_convert_uca1400_ids.inc and ctype_utf_uca1400_ids.inc to ctype_set_names_uca1400_ids.inc, or something like that, to show what they do.
+ +--disable_ps_protocol +--enable_metadata +DELIMITER $$; +FOR rec IN (SELECT COLLATION_NAME + FROM INFORMATION_SCHEMA.COLLATION_CHARACTER_SET_APPLICABILITY + WHERE CHARACTER_SET_NAME=@charset + AND COLLATION_NAME RLIKE 'uca1400' + ORDER BY ID) +DO + EXECUTE IMMEDIATE CONCAT('SET NAMES ',@charset,' COLLATE ', rec.COLLATION_NAME); + SELECT rec.COLLATION_NAME; +END FOR; +$$ +DELIMITER ;$$ +--disable_metadata +--enable_ps_protocol diff --git a/include/m_ctype.h b/include/m_ctype.h index 4c6628b72b3..706764ead2a 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -34,7 +34,9 @@ enum loglevel { extern "C" { #endif
-#define MY_CS_NAME_SIZE 32 +#define MY_CS_CHARACTER_SET_NAME_SIZE 32 +#define MY_CS_COLLATION_NAME_SIZE 64
That's FULL_COLLATION_NAME_SIZE, right?
+ #define MY_CS_CTYPE_TABLE_SIZE 257 #define MY_CS_TO_LOWER_TABLE_SIZE 256 #define MY_CS_TO_UPPER_TABLE_SIZE 256 @@ -240,6 +242,46 @@ typedef enum enum_repertoire_t } my_repertoire_t;
+/* ID compatibility */ +typedef enum enum_collation_id_type +{ + MY_COLLATION_ID_TYPE_PRECISE= 0, + MY_COLLATION_ID_TYPE_COMPAT_100800= 1 +} my_collation_id_type_t; + + +/* Collation name display modes */ +typedef enum enum_collation_name_mode +{ + MY_COLLATION_NAME_MODE_FULL= 0, + MY_COLLATION_NAME_MODE_CONTEXT= 1 +} my_collation_name_mode_t; + + +/* Level flags */ +#define MY_CS_LEVEL_BIT_PRIMARY 0x00 +#define MY_CS_LEVEL_BIT_SECONDARY 0x01 +#define MY_CS_LEVEL_BIT_TERTIARY 0x02 +#define MY_CS_LEVEL_BIT_QUATERNARY 0x03 + +#define MY_CS_COLL_LEVELS_S1 (1<<MY_CS_LEVEL_BIT_PRIMARY) + +#define MY_CS_COLL_LEVELS_AI_CS (1<<MY_CS_LEVEL_BIT_PRIMARY)| \ + (1<<MY_CS_LEVEL_BIT_TERTIARY) + +#define MY_CS_COLL_LEVELS_S2 (1<<MY_CS_LEVEL_BIT_PRIMARY)| \ + (1<<MY_CS_LEVEL_BIT_SECONDARY) + +#define MY_CS_COLL_LEVELS_S3 (1<<MY_CS_LEVEL_BIT_PRIMARY)| \ + (1<<MY_CS_LEVEL_BIT_SECONDARY) | \ + (1<<MY_CS_LEVEL_BIT_TERTIARY)
AI_CS and S3 don't seem to be used yet
+ +#define MY_CS_COLL_LEVELS_S4 (1<<MY_CS_LEVEL_BIT_PRIMARY)| \ + (1<<MY_CS_LEVEL_BIT_SECONDARY) | \ + (1<<MY_CS_LEVEL_BIT_TERTIARY) | \ + (1<<MY_CS_LEVEL_BIT_QUATERNARY) + + /* Flags for strxfrm */ #define MY_STRXFRM_LEVEL1 0x00000001 /* for primary weights */ #define MY_STRXFRM_LEVEL2 0x00000002 /* for secondary weights */ diff --git a/sql/sql_alter.cc b/sql/sql_alter.cc index 86c6e9a27f8..9ddd482ad57 100644 --- a/sql/sql_alter.cc +++ b/sql/sql_alter.cc @@ -546,6 +546,7 @@ bool Sql_cmd_alter_table::execute(THD *thd)
result= mysql_alter_table(thd, &select_lex->db, &lex->name, &create_info, + lex->create_info.default_charset_collation,
I don't see why you need a new argument here. It's create_info.default_charset_collation, so mysql_alter_table already gets it in create_info. All other mysql_alter_table invocations also take a create_info argument and can get default_charset_collation from there.
first_table, &alter_info, select_lex->order_list.elements, diff --git a/sql/sql_partition_admin.cc b/sql/sql_partition_admin.cc index fb1ae0d5fc7..4188dde252b 100644 --- a/sql/sql_partition_admin.cc +++ b/sql/sql_partition_admin.cc @@ -211,6 +211,7 @@ bool compare_table_with_partition(THD *thd, TABLE *table, TABLE *part_table, part_table->use_all_columns(); table->use_all_columns(); if (unlikely(mysql_prepare_alter_table(thd, part_table, &part_create_info, + Lex_maybe_default_charset_collation(),
Same. Can be in part_create_info
&part_alter_info, &part_alter_ctx))) { my_error(ER_TABLES_DIFFERENT_METADATA, MYF(0)); diff --git a/sql/sql_i_s.h b/sql/sql_i_s.h index bed2e886718..5ff06d32231 100644 --- a/sql/sql_i_s.h +++ b/sql/sql_i_s.h @@ -162,6 +162,11 @@ class Yesno: public Varchar { public: Yesno(): Varchar(3) { } + static LEX_CSTRING value(bool val) + { + return val ? Lex_cstring(STRING_WITH_LEN("Yes")) : + Lex_cstring(); + }
Eh... please rename the class from Yesno to something like Yesempty or Yes_or_empty, something that makes it clear that the second value is intentionally empty and should not be Lex_cstring(STRING_WITH_LEN("No")).
};
diff --git a/sql/table.cc b/sql/table.cc index a683a78ff49..c28cb2bd928 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -3491,6 +3493,16 @@ int TABLE_SHARE::init_from_sql_statement_string(THD *thd, bool write, else thd->set_n_backup_active_arena(arena, &backup);
+ /* + THD::reset_db() does not set THD::db_charset, + so it keeps pointing to the character set and collation + of the current database, rather than the database of the + new initialized table.
Hmm, is that correct? Could you check the other invocations of thd->reset_db()? Perhaps they all need to switch the charset? In that case it should be done inside THD::reset_db(). Or maybe they have to use mysql_change_db_impl() instead?
+ Let's call get_default_db_collation() before reset_db(). + This forces the db.opt file to be loaded. + */ + db_cs= get_default_db_collation(thd, db.str); + thd->reset_db(&db); lex_start(thd);
@@ -3498,6 +3510,11 @@ int TABLE_SHARE::init_from_sql_statement_string(THD *thd, bool write, sql_unusable_for_discovery(thd, hton, sql_copy)))) goto ret;
+ if (!(thd->lex->create_info.default_table_charset= + thd->lex->create_info.default_charset_collation. + resolved_to_character_set(db_cs, db_cs))) + DBUG_RETURN(true);
How could this (and similar if()'s in other files) fail?
+ thd->lex->create_info.db_type= hton; #ifdef WITH_PARTITION_STORAGE_ENGINE thd->work_part_info= 0; // For partitioning diff --git a/sql/mysys_charset.h b/sql/mysys_charset.h new file mode 100644 index 00000000000..86eaeedd432 --- /dev/null +++ b/sql/mysys_charset.h @@ -0,0 +1,44 @@ +#ifndef MYSYS_CHARSET +#define MYSYS_CHARSET + +/* Copyright (c) 2021, MariaDB Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + + +#include "my_sys.h" + + +class Charset_loader_mysys: public MY_CHARSET_LOADER +{ +public: + Charset_loader_mysys() + { + my_charset_loader_init_mysys(this); + } + void raise_unknown_collation_error(const char *name, + CHARSET_INFO *name_cs) const; + CHARSET_INFO *get_charset(const char *cs_name, uint cs_flags, myf my_flags); + CHARSET_INFO *get_exact_collation(const char *name, myf utf8_flag); + CHARSET_INFO *get_contextually_typed_collation(CHARSET_INFO *cs, + const char *name); + CHARSET_INFO *get_contextually_typed_collation(const char *name); + CHARSET_INFO *get_contextually_typed_collation_or_error(CHARSET_INFO *cs, + const char *name); + CHARSET_INFO *find_default_collation(CHARSET_INFO *cs); + CHARSET_INFO *find_bin_collation_or_error(CHARSET_INFO *cs); +};
you can have C++ code in mysys too, you know, no need to put it in sql/mysys*
+ +#endif // MYSYS_CHARSET + diff --git a/strings/ctype-simple.c b/strings/ctype-simple.c index b579f0af203..d09dfba86ed 100644 --- a/strings/ctype-simple.c +++ b/strings/ctype-simple.c @@ -1940,13 +1941,26 @@ my_bool my_propagate_complex(CHARSET_INFO *cs __attribute__((unused)), }
+void my_ci_set_strength(struct charset_info_st *cs, uint strength) +{ + DBUG_ASSERT(strength > 0 && strength <= MY_STRXFRM_NLEVELS);
don't use && in asserts, please create two separate asserts instead: DBUG_ASSERT(strength > 0); DBUG_ASSERT(strength <= MY_STRXFRM_NLEVELS);
+ cs->levels_for_order= ((1 << strength) - 1);
Why do you still use the old concept of "strength"? Why not use the bitmap consistently everywhere?
+} + + +void my_ci_set_level_flags(struct charset_info_st *cs, uint flags) +{ + DBUG_ASSERT(flags < (1<<MY_STRXFRM_NLEVELS)); + cs->levels_for_order= flags; +} + /* Normalize strxfrm flags
SYNOPSIS: my_strxfrm_flag_normalize() + cs - the CHARSET_INFO pointer flags - non-normalized flags - nlevels - number of levels
NOTES: If levels are omitted, then 1-maximum is assumed. diff --git a/sql/handler.h b/sql/handler.h index 8ad521e189a..1e82f37b1e7 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -2409,7 +2386,32 @@ struct Table_specification_st: public HA_CREATE_INFO, { HA_CREATE_INFO::options= 0; DDL_options_st::init(); + default_charset_collation.init(); + } + + bool + add_alter_list_item_convert_to_charset(const Lex_charset_collation_st &cl) + { + /* + cs cannot be NULL, as sql_yacc.yy translates + CONVERT TO CHARACTER SET DEFAULT + to + CONVERT TO CHARACTER SET <character-set-of-the-current-database> + TODO: Shouldn't we postpone resolution of DEFAULT until the + character set of the table owner database is loaded from its db.opt? + */ + DBUG_ASSERT(cl.charset_collation()); + DBUG_ASSERT(!cl.is_contextually_typed_collation()); + alter_table_convert_to_charset= cl.charset_collation(); + default_charset_collation.Lex_charset_collation_st::operator=(cl);
looks quite ugly. can you do, like, default_charset_collation.set(cl) ?
+ used_fields|= (HA_CREATE_USED_CHARSET | HA_CREATE_USED_DEFAULT_CHARSET); + return false; } + bool add_table_option_default_charset(CHARSET_INFO *cs); + bool add_table_option_default_collation(const Lex_charset_collation_st &cl); + bool resolve_db_charset_and_collation(THD *thd, + const LEX_CSTRING &db, + bool is_alter); };
diff --git a/strings/ctype-uca1400data.h b/strings/ctype-uca1400data.h new file mode 100644 index 00000000000..da95dcfde54 --- /dev/null +++ b/strings/ctype-uca1400data.h @@ -0,0 +1,44151 @@ +/* + Generated from allkeys.txt version '14.0.0' +*/
if it's generated, do you need to check it in? perhaps it should be generated during the build? you've checked in allkeys1400.txt anyway.
+static const uint16 uca1400_p000[]= { /* 0000 (4 weights per char) */ +0x0000,0x0000,0x0000,0x0000, 0x0000,0x0000,0x0000,0x0000, /* 0000 */ +0x0000,0x0000,0x0000,0x0000, 0x0000,0x0000,0x0000,0x0000, /* 0002 */ diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index 6ca10267187..d115401a855 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -542,6 +542,30 @@ bool LEX::add_alter_list(LEX_CSTRING name, LEX_CSTRING new_name, bool exists) }
+bool LEX::add_alter_list_item_convert_to_charset( + THD *thd, + CHARSET_INFO *cs, + const Lex_charset_collation_st &cl) +{ + if (!cs) + { + Lex_charset_collation_st tmp; + tmp.set_charset_collate_default(thd->variables.collation_database);
Hmm, what if one is doing ALTER TABLE db.test CHARSET DEFAULT and the current db is not `db` but `test`?
+ if (!(cs= tmp.charset_collation())) + return true; // Should not actually happen
assert?
+ } + + Lex_explicit_charset_opt_collate tmp(cs, false); + if (tmp.merge_opt_collate_or_error(cl) || + create_info.add_alter_list_item_convert_to_charset( + Lex_charset_collation(tmp))) + return true; + + alter_info.flags|= ALTER_CONVERT_TO; + return false; +} + + void LEX::init_last_field(Column_definition *field, const LEX_CSTRING *field_name) { @@ -11871,29 +11869,41 @@ CHARSET_INFO *Lex_collation_st::find_default_collation(CHARSET_INFO *cs) "def" is the upper level CHARACTER SET clause (e.g. of a table) */ CHARSET_INFO * -Lex_collation_st::resolved_to_character_set(CHARSET_INFO *def) const +Lex_charset_collation_st::resolved_to_character_set(CHARSET_INFO *def) const { DBUG_ASSERT(def); - if (m_type != TYPE_CONTEXTUALLY_TYPED) - { - if (!m_collation) - return def; // Empty - not typed at all - return m_collation; // Explicitly typed + + switch (m_type) { + case TYPE_EMPTY: + return def; + case TYPE_CHARACTER_SET: + DBUG_ASSERT(m_ci); + return m_ci; + case TYPE_COLLATE_EXACT: + DBUG_ASSERT(m_ci); + return m_ci; + case TYPE_COLLATE_CONTEXTUALLY_TYPED: + break; }
// Contextually typed - DBUG_ASSERT(m_collation); + DBUG_ASSERT(m_ci);
- if (m_collation == &my_charset_bin) // CHAR(10) BINARY - return find_bin_collation(def); + Charset_loader_mysys loader; + if (is_contextually_typed_binary_style()) // CHAR(10) BINARY + return loader.find_bin_collation_or_error(def);
- if (m_collation == &my_charset_latin1) // CHAR(10) COLLATE DEFAULT - return find_default_collation(def); + if (is_contextually_typed_collate_default()) // CHAR(10) COLLATE DEFAULT + return loader.find_default_collation(def); + + const LEX_CSTRING context_name= collation_name_context_suffix();
I'd rather put this in an assert, not in an if(). Like - if (!strncasecmp(context_name.str, STRING_WITH_LEN("uca1400_"))) + DBUG_ASSERT(!strncasecmp(context_name.str, STRING_WITH_LEN("uca1400_")));
+ if (!strncasecmp(context_name.str, STRING_WITH_LEN("uca1400_"))) + return loader.get_contextually_typed_collation_or_error(def, + context_name.str);
/* - Non-binary and non-default contextually typed collation. + Non-binary, non-default, non-uca1400 contextually typed collation. We don't have such yet - the parser cannot produce this. - But will have soon, e.g. "uca1400_as_ci". */ DBUG_ASSERT(0); return NULL; @@ -11944,58 +11972,106 @@ bool Lex_collation_st:: CHAR(10) BINARY .. COLLATE latin1_bin CHAR(10) COLLATE uca1400_as_ci .. COLLATE latin1_bin */ - if (collation() == &my_charset_latin1 && - !(cl.collation()->state & MY_CS_PRIMARY)) + if (is_contextually_typed_collate_default() && + !cl.charset_collation()->default_flag()) { - my_error(ER_CONFLICTING_DECLARATIONS, MYF(0), - "COLLATE ", "DEFAULT", "COLLATE ", - cl.collation()->coll_name.str); + error_conflicting_collations_or_styles(*this, cl); return true; } - if (collation() == &my_charset_bin && - !(cl.collation()->state & MY_CS_BINSORT)) + + if (is_contextually_typed_binary_style() && + !cl.charset_collation()->binsort_flag()) { - my_error(ER_CONFLICTING_DECLARATIONS, MYF(0), - "", "BINARY", "COLLATE ", cl.collation()->coll_name.str); + error_conflicting_collations_or_styles(*this, cl); return true; } *this= cl; return false; }
- if (cl.is_contextually_typed_collation()) - { + DBUG_ASSERT(0); + return false; +} + + +bool +Lex_explicit_charset_opt_collate:: + merge_collate_or_error(const Lex_charset_collation_st &cl) +{ + DBUG_ASSERT(cl.type() != Lex_charset_collation_st::TYPE_CHARACTER_SET); + + switch (cl.type()) { + case Lex_charset_collation_st::TYPE_EMPTY: + return false; + case Lex_charset_collation_st::TYPE_CHARACTER_SET: + DBUG_ASSERT(0); + return false; + case Lex_charset_collation_st::TYPE_COLLATE_EXACT: /* - EXPLICIT + CONTEXT - CHAR(10) COLLATE latin1_bin .. COLLATE DEFAULT - not supported - CHAR(10) COLLATE latin1_bin .. COLLATE uca1400_as_ci - not yet + EXPLICIT + EXPLICIT + CHAR(10) CHARACTER SET latin1 .. COLLATE latin1_bin + CHAR(10) CHARACTER SET latin1 COLLATE latin1_bin .. COLLATE latin1_bin + CHAR(10) COLLATE latin1_bin .. COLLATE latin1_bin + CHAR(10) COLLATE latin1_bin .. COLLATE latin1_bin + CHAR(10) CHARACTER SET latin1 BINARY .. COLLATE latin1_bin */ - DBUG_ASSERT(0); // Not possible yet + if (m_with_collate && m_ci != cl.charset_collation()) + { + my_error(ER_CONFLICTING_DECLARATIONS, MYF(0), + "COLLATE ", m_ci->coll_name.str, + "COLLATE ", cl.charset_collation()->coll_name.str); + return true; + } + if (!my_charset_same(m_ci, cl.charset_collation())) + { + my_error(ER_COLLATION_CHARSET_MISMATCH, MYF(0), + cl.charset_collation()->coll_name.str, m_ci->cs_name.str); + return true; + } + m_ci= cl.charset_collation(); + m_with_collate= true; return false; - }
- /* - EXPLICIT + EXPLICIT - CHAR(10) CHARACTER SET latin1 .. COLLATE latin1_bin - CHAR(10) CHARACTER SET latin1 COLLATE latin1_bin .. COLLATE latin1_bin - CHAR(10) COLLATE latin1_bin .. COLLATE latin1_bin - CHAR(10) COLLATE latin1_bin .. COLLATE latin1_bin - CHAR(10) CHARACTER SET latin1 BINARY .. COLLATE latin1_bin - */ - if (type() == TYPE_EXPLICIT && collation() != cl.collation()) - { - my_error(ER_CONFLICTING_DECLARATIONS, MYF(0), - "COLLATE ", collation()->coll_name.str, - "COLLATE ", cl.collation()->coll_name.str); - return true; - } - if (!my_charset_same(collation(), cl.collation())) - { - my_error(ER_COLLATION_CHARSET_MISMATCH, MYF(0), - cl.collation()->coll_name.str, collation()->cs_name.str); - return true; + case Lex_charset_collation_st::TYPE_COLLATE_CONTEXTUALLY_TYPED: + if (cl.is_contextually_typed_collate_default()) + { + /* + SET NAMES latin1 COLLATE DEFAULT; + ALTER TABLE t1 CONVERT TO CHARACTER SET latin1 COLLATE DEFAULT; + */ + CHARSET_INFO *tmp= Charset_loader_mysys().find_default_collation(m_ci); + if (!tmp) + return true; + m_ci= tmp; + m_with_collate= true; + return false; + } + else + { + /* + EXPLICIT + CONTEXT + CHAR(10) COLLATE latin1_bin .. COLLATE DEFAULT not possible yet + CHAR(10) COLLATE latin1_bin .. COLLATE uca1400_as_ci + */ + + const LEX_CSTRING context_cl_name= cl.collation_name_context_suffix(); + if (!strncasecmp(context_cl_name.str, STRING_WITH_LEN("uca1400_")))
Like above, better DBUG_ASSERT(!strncasecmp(context_cl_name.str, STRING_WITH_LEN("uca1400_")))
+ { + CHARSET_INFO *tmp; + Charset_loader_mysys loader; + if (!(tmp= loader.get_contextually_typed_collation_or_error(m_ci, + context_cl_name.str))) + return true; + m_with_collate= true; + m_ci= tmp; + return false; + } + + DBUG_ASSERT(0); // Not possible yet + return false; + } } - *this= cl; + DBUG_ASSERT(0); return false; }
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c index b89916f3b20..3e6b4e4ce43 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -30542,7 +30613,7 @@ static const char vietnamese[]= Myanmar, according to CLDR Revision 8900. http://unicode.org/cldr/trac/browser/trunk/common/collation/my.xml */ -static const char myanmar[]= "[shift-after-method expand][version 5.2.0]" +static const char myanmar[]= "[shift-after-method expand]"
What's going on with myanmar? You removed a version here and added &my_uca_v520 below in its charset_info_st. What does this change mean?
/* Tones */ "&\\u108C" "<\\u1037" @@ -37627,7 +37825,7 @@ struct charset_info_st my_charset_utf32_myanmar_uca_ci= NULL, /* to_lower */ NULL, /* to_upper */ NULL, /* sort_order */ - NULL, /* uca */ + &my_uca_v520, /* uca */
What does this change?
NULL, /* tab_to_uni */ NULL, /* tab_from_uni */ &my_unicase_unicode520,/* caseinfo */
Regards, Sergei VP of MariaDB Server Engineering and security@mariadb.org