[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (monty:2777)
#At lp:maria based on revid:knielsen@knielsen-hq.org-20091127132059-3su1w7xhsbbtpg6f 2777 Michael Widenius 2009-11-30 Added more general support for sorting 2 characters as one (contractions) Added support for Croatian sorting orders utf8_croatian_ci and ucs2_croatian_ci. Patch done by Alexander Barkov. See http://www.collation-charts.org/articles/croatian.htm modified: include/m_ctype.h mysql-test/r/ctype_uca.result mysql-test/t/ctype_uca.test mysys/charset-def.c strings/ctype-mb.c strings/ctype-uca.c strings/ctype-ucs2.c per-file messages: mysql-test/r/ctype_uca.result Added testing of Croatian sort order mysql-test/t/ctype_uca.test Added testing of Croatian sort order === modified file 'include/m_ctype.h' --- a/include/m_ctype.h 2009-09-07 20:50:10 +0000 +++ b/include/m_ctype.h 2009-11-30 12:42:24 +0000 @@ -49,6 +49,24 @@ typedef struct unicase_info_st extern MY_UNICASE_INFO *my_unicase_default[256]; extern MY_UNICASE_INFO *my_unicase_turkish[256]; +#define MY_UCA_MAX_CONTRACTION 4 +#define MY_UCA_MAX_WEIGHT_SIZE 8 + +typedef struct my_contraction_t +{ + my_wc_t ch[MY_UCA_MAX_CONTRACTION]; /* Character sequence */ + uint16 weight[MY_UCA_MAX_WEIGHT_SIZE];/* Its weight string, 0-terminated */ +} MY_CONTRACTION; + + +typedef struct my_contraction_list_t +{ + size_t nitems; /* Number of items in the list */ + MY_CONTRACTION *item; /* List of contractions */ + char *flags; /* Character flags, e.g. 
"is contraction head") */ +} MY_CONTRACTIONS; + + typedef struct uni_ctype_st { uchar pctype; @@ -262,7 +280,7 @@ typedef struct charset_info_st uchar *to_lower; uchar *to_upper; uchar *sort_order; - uint16 *contractions; + MY_CONTRACTIONS *contractions; uint16 **sort_order_big; uint16 *tab_to_uni; MY_UNI_IDX *tab_from_uni; @@ -475,6 +493,13 @@ my_bool my_charset_is_ascii_based(CHARSE my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs); uint my_charset_repertoire(CHARSET_INFO *cs); +my_bool my_uca_have_contractions(CHARSET_INFO *cs); +my_bool my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc); +my_bool my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc); +uint16 *my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2); + + + #define _MY_U 01 /* Upper case */ #define _MY_L 02 /* Lower case */ === modified file 'mysql-test/r/ctype_uca.result' --- a/mysql-test/r/ctype_uca.result 2008-03-26 09:51:16 +0000 +++ b/mysql-test/r/ctype_uca.result 2009-11-30 12:42:24 +0000 @@ -159,6 +159,7 @@ insert into t1 values (_ucs2 0x01fc),(_u insert into t1 values ('AA'),('Aa'),('aa'),('aA'); insert into t1 values ('CH'),('Ch'),('ch'),('cH'); insert into t1 values ('DZ'),('Dz'),('dz'),('dZ'); +insert into t1 values ('D��'),('D��'),('d��'),('d��'); insert into t1 values ('IJ'),('Ij'),('ij'),('iJ'); insert into t1 values ('LJ'),('Lj'),('lj'),('lJ'); insert into t1 values ('LL'),('Ll'),('ll'),('lL'); @@ -181,7 +182,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -286,7 +287,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� ��,�� �� @@ -400,6 +401,7 @@ CH,Ch,cH,ch ��,�� D,d,��,�� DZ,Dz,dZ,dz,��,��,��,��,��,�� +D��,D��,d��,d�� ��,�� �� �� @@ -513,7 +515,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� 
-DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -622,6 +624,7 @@ CH,Ch,cH,ch ��,�� D,d,��,�� DZ,Dz,dZ,dz,��,��,��,��,��,�� +D��,D��,d��,d�� ��,�� �� �� @@ -729,7 +732,7 @@ CH,Ch,cH,ch ��,�� ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -840,6 +843,7 @@ CH,Ch,cH,ch ��,�� D,d,��,�� DZ,Dz,dZ,dz +D��,D��,d��,d�� ��,��,��,��,��,�� ��,�� �� @@ -951,7 +955,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -1056,7 +1060,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -1164,7 +1168,7 @@ CH,Ch,cH,ch ��,�� ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -1275,6 +1279,7 @@ cH ��,�� D,d,��,�� DZ,Dz,dZ,dz,��,��,��,��,��,�� +D��,D��,d��,d�� ��,�� �� �� @@ -1382,7 +1387,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -1491,6 +1496,7 @@ cH ��,�� D,d,��,�� DZ,Dz,dZ,dz,��,��,��,��,��,�� +D��,D��,d��,d�� ��,�� �� �� @@ -1599,6 +1605,7 @@ cH ��,�� D,d,��,�� DZ,Dz,dZ,dz,��,��,��,��,��,�� +D��,D��,d��,d�� ��,�� �� �� @@ -1707,7 +1714,7 @@ cH CH,Ch,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -1813,7 +1820,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -1921,7 +1928,7 @@ CH,Ch,cH,ch ��,�� ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� +DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -2030,7 +2037,7 @@ C,c,��,��,��,��,��,��,��,��,��,�� CH,Ch,cH,ch ��,�� D,d,��,�� -DZ,Dz,dZ,dz,��,��,��,��,��,�� 
+DZ,Dz,D��,D��,dZ,dz,d��,d��,��,��,��,��,��,�� ��,�� �� �� @@ -2121,6 +2128,118 @@ Z,z,��,��,��,��,��,�� �� �� �� +select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci; +group_concat(c1 order by c1) +�� +�� +A,a,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,�� +AA,Aa,aA,aa +��,��,��,��,��,�� +B,b +�� +�� +��,�� +C,c,��,��,��,��,��,�� +CH,Ch,cH,ch +��,�� +��,�� +��,�� +D,d,��,�� +DZ,Dz,dZ,dz,��,��,�� +d�� +D��,D��,d��,��,��,�� +��,�� +�� +�� +��,�� +��,�� +E,e,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,�� +��,�� +�� +�� +F,f +��,�� +G,g,��,��,��,��,��,��,��,��,��,��,��,�� +��,�� +�� +�� +��,�� +H,h,��,�� +��,�� +��,�� +I,i,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,�� +IJ,Ij,iJ,ij,��,�� +�� +�� +�� +J,j,��,��,�� +K,k,��,��,��,�� +��,�� +L,l,��,��,��,��,��,�� +��,�� +lJ +LL,Ll,lL,ll +LJ,Lj,lj,��,��,�� +��,�� +�� +�� +M,m +N,n,��,��,��,��,��,��,��,��,��,�� +nJ +NJ,Nj,nj,��,��,�� +�� +�� +��,�� +O,o,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,�� +OE,Oe,oE,oe,��,�� +��,��,��,�� +�� +�� +P,p +��,�� +Q,q +�� +R,r,��,��,��,��,��,�� +RR,Rr,rR,rr +�� +S,s,��,��,��,��,��,��,�� +SS,Ss,sS,ss,�� +��,�� +�� +�� +T,t,��,��,��,�� +�� +��,�� +�� +��,�� +�� +U,u,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,��,�� +�� +�� +V,v +�� +W,w,��,�� +X,x +Y,y,��,��,��,��,��,�� +��,�� +Z,z,��,��,��,�� +�� +��,�� +��,�� +��,��,�� +��,�� +�� +��,�� +��,�� +�� +��,�� +��,�� +��,�� +�� +�� +�� +�� +�� drop table t1; SET NAMES utf8; CREATE TABLE t1 (c varchar(255) NOT NULL COLLATE utf8_general_ci, INDEX (c)); === modified file 'mysql-test/t/ctype_uca.test' --- a/mysql-test/t/ctype_uca.test 2008-02-20 18:49:26 +0000 +++ b/mysql-test/t/ctype_uca.test 2009-11-30 12:42:24 +0000 @@ -186,6 +186,7 @@ insert into t1 values (_ucs2 0x01fc),(_u insert into t1 values ('AA'),('Aa'),('aa'),('aA'); insert into t1 values ('CH'),('Ch'),('ch'),('cH'); insert into t1 values 
('DZ'),('Dz'),('dz'),('dZ'); +insert into t1 values ('DŽ'),('Dž'),('dž'),('dŽ'); insert into t1 values ('IJ'),('Ij'),('ij'),('iJ'); insert into t1 values ('LJ'),('Lj'),('lj'),('lJ'); insert into t1 values ('LL'),('Ll'),('ll'),('lL'); @@ -213,6 +214,7 @@ select group_concat(c1 order by c1) from select group_concat(c1 order by c1) from t1 group by c1 collate utf8_roman_ci; select group_concat(c1 order by c1) from t1 group by c1 collate utf8_esperanto_ci; select group_concat(c1 order by c1) from t1 group by c1 collate utf8_hungarian_ci; +select group_concat(c1 order by c1) from t1 group by c1 collate utf8_croatian_ci; drop table t1; === modified file 'mysys/charset-def.c' --- a/mysys/charset-def.c 2007-06-21 20:10:40 +0000 +++ b/mysys/charset-def.c 2009-11-30 12:42:24 +0000 @@ -42,6 +42,7 @@ extern CHARSET_INFO my_charset_ucs2_roma extern CHARSET_INFO my_charset_ucs2_persian_uca_ci; extern CHARSET_INFO my_charset_ucs2_esperanto_uca_ci; extern CHARSET_INFO my_charset_ucs2_hungarian_uca_ci; +extern CHARSET_INFO my_charset_ucs2_croatian_uca_ci; #endif #ifdef HAVE_CHARSET_utf8 @@ -63,6 +64,7 @@ extern CHARSET_INFO my_charset_utf8_roma extern CHARSET_INFO my_charset_utf8_persian_uca_ci; extern CHARSET_INFO my_charset_utf8_esperanto_uca_ci; extern CHARSET_INFO my_charset_utf8_hungarian_uca_ci; +extern CHARSET_INFO my_charset_utf8_croatian_uca_ci; #ifdef HAVE_UTF8_GENERAL_CS extern CHARSET_INFO my_charset_utf8_general_cs; #endif @@ -152,6 +154,7 @@ my_bool init_compiled_charsets(myf flags add_compiled_collation(&my_charset_ucs2_persian_uca_ci); add_compiled_collation(&my_charset_ucs2_esperanto_uca_ci); add_compiled_collation(&my_charset_ucs2_hungarian_uca_ci); + add_compiled_collation(&my_charset_ucs2_croatian_uca_ci); #endif #endif @@ -186,6 +189,7 @@ my_bool init_compiled_charsets(myf flags add_compiled_collation(&my_charset_utf8_persian_uca_ci); add_compiled_collation(&my_charset_utf8_esperanto_uca_ci); add_compiled_collation(&my_charset_utf8_hungarian_uca_ci); + 
add_compiled_collation(&my_charset_utf8_croatian_uca_ci); #endif #endif === modified file 'strings/ctype-mb.c' --- a/strings/ctype-mb.c 2009-02-13 16:41:47 +0000 +++ b/strings/ctype-mb.c 2009-11-30 12:42:24 +0000 @@ -567,8 +567,7 @@ my_bool my_like_range_mb(CHARSET_INFO *c char *min_end= min_str + res_length; char *max_end= max_str + res_length; size_t maxcharlen= res_length / cs->mbmaxlen; - const char *contraction_flags= cs->contractions ? - ((const char*) cs->contractions) + 0x40*0x40 : NULL; + my_bool have_contractions= my_uca_have_contractions(cs); for (; ptr != end && min_str != min_end && maxcharlen ; maxcharlen--) { @@ -636,8 +635,8 @@ fill_max_and_min: 'ab\min\min\min\min' and 'ab\max\max\max\max'. */ - if (contraction_flags && ptr + 1 < end && - contraction_flags[(uchar) *ptr]) + if (have_contractions && ptr + 1 < end && + my_uca_can_be_contraction_head(cs, (uchar) *ptr)) { /* Ptr[0] is a contraction head. */ @@ -659,8 +658,8 @@ fill_max_and_min: is not a contraction, then we put only ptr[0], and continue with ptr[1] on the next loop. 
*/ - if (contraction_flags[(uchar) ptr[1]] && - cs->contractions[(*ptr-0x40)*0x40 + ptr[1] - 0x40]) + if (my_uca_can_be_contraction_tail(cs, (uchar) ptr[1]) && + my_uca_contraction2_weight(cs, (uchar) ptr[0], (uchar) ptr[1])) { /* Contraction found */ if (maxcharlen == 1 || min_str + 1 >= min_end) === modified file 'strings/ctype-uca.c' --- a/strings/ctype-uca.c 2009-11-16 20:49:51 +0000 +++ b/strings/ctype-uca.c 2009-11-30 12:42:24 +0000 @@ -6713,6 +6713,16 @@ static const char hungarian[]= "&U < \\u00FC <<< \\u00DC << \\u0171 <<< \\u0170"; +static const char croatian[]= + +"&C < \\u010D <<< \\u010C < \\u0107 <<< \\u0106 " +"&D < d\\u017E <<< \\u01C6 <<< D\\u017E <<< \\u01C5 <<< D\\u017D <<< \\u01C4 " +" < \\u0111 <<< \\u0110 " +"&L < lj <<< \\u01C9 <<< Lj <<< \\u01C8 <<< LJ <<< \\u01C7 " +"&N < nj <<< \\u01CC <<< Nj <<< \\u01CB <<< NJ <<< \\u01CA " +"&S < \\u0161 <<< \\u0160 " +"&Z < \\u017E <<< \\u017D"; + /* Unicode Collation Algorithm: Collation element (weight) scanner, @@ -6726,7 +6736,7 @@ typedef struct my_uca_scanner_st const uchar *send; /* End of the input string */ uchar *uca_length; uint16 **uca_weight; - uint16 *contractions; + MY_CONTRACTIONS *contractions; uint16 implicit[2]; int page; int code; @@ -6747,6 +6757,164 @@ typedef struct my_uca_scanner_handler_st static uint16 nochar[]= {0,0}; +#define MY_UCA_CNT_FLAG_SIZE 4096 +#define MY_UCA_CNT_FLAG_MASK 4095 + +#define MY_UCA_CNT_HEAD 1 +#define MY_UCA_CNT_TAIL 2 + + + + +/********** Helper functions to handle contraction ************/ + + +/** + Mark a character as a contraction part + + @cs Pointer to CHARSET_INFO data + @wc Unicode code point + @flag flag: "is contraction head", "is contraction tail" +*/ + +static void +my_uca_add_contraction_flag(CHARSET_INFO *cs, my_wc_t wc, int flag) +{ + cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK]|= flag; +} + + +/** + Add a new contraction into contraction list + + @cs Pointer to CHARSET_INFO data + @wc Unicode code points of the characters + @len 
Number of characters + + @return New contraction + @retval Pointer to a newly added contraction +*/ + +static MY_CONTRACTION * +my_uca_add_contraction(CHARSET_INFO *cs, + my_wc_t *wc, int len __attribute__((unused))) +{ + MY_CONTRACTIONS *list= cs->contractions; + MY_CONTRACTION *next= &list->item[list->nitems]; + DBUG_ASSERT(len == 2); /* We currently support only contraction2 */ + next->ch[0]= wc[0]; + next->ch[1]= wc[1]; + list->nitems++; + return next; +} + + +/** + Allocate and initialize memory for contraction list and flags + + @cs Pointer to CHARSET_INFO data + @alloc Memory allocation function (typically points to my_alloc_once) + @n Number of contractions + + @return Error code + @retval 0 - memory allocated successfully + @retval 1 - not enough memory +*/ + +static my_bool +my_uca_alloc_contractions(CHARSET_INFO *cs, void *(*alloc)(size_t), size_t n) +{ + uint size= n * sizeof(MY_CONTRACTION); + if (!(cs->contractions= (*alloc)(sizeof(MY_CONTRACTIONS)))) + return 1; + bzero(cs->contractions, sizeof(MY_CONTRACTIONS)); + if (!(cs->contractions->item= (*alloc)(size)) || + !(cs->contractions->flags= (char*) (*alloc)(MY_UCA_CNT_FLAG_SIZE))) + return 1; + bzero((void*) cs->contractions->item, size); + bzero((void*) cs->contractions->flags, MY_UCA_CNT_FLAG_SIZE); + return 0; +} + + +/** + Check if UCA data has contractions (public version) + + @cs Pointer to CHARSET_INFO data + @retval 0 - no contraction, 1 - have contractions. 
+*/ + +my_bool +my_uca_have_contractions(CHARSET_INFO *cs) +{ + return cs->contractions != NULL; +} + + +/** + Check if a character can be contraction head + + @cs Pointer to CHARSET_INFO data + @wc Code point + + @retval 0 - cannot be contraction head + @retval 1 - can be contraction head +*/ + +my_bool +my_uca_can_be_contraction_head(CHARSET_INFO *cs, my_wc_t wc) +{ + return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD; +} + + +/** + Check if a character can be contraction tail + + @cs Pointer to CHARSET_INFO data + @wc Code point + + @retval 0 - cannot be contraction tail + @retval 1 - can be contraction tail +*/ + +my_bool +my_uca_can_be_contraction_tail(CHARSET_INFO *cs, my_wc_t wc) +{ + return cs->contractions->flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL; +} + + +/** + Find a contraction and return its weight array + + @cs Pointer to CHARSET data + @wc1 First character + @wc2 Second character + + @return Weight array + @retval NULL - no contraction found + @retval ptr - contraction weight array +*/ + +uint16 * +my_uca_contraction2_weight(CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2) +{ + MY_CONTRACTIONS *list= cs->contractions; + MY_CONTRACTION *c, *last; + for (c= list->item, last= &list->item[list->nitems]; c < last; c++) + { + if (c->ch[0] == wc1 && c->ch[1] == wc2) + { + return c->weight; + } + } + return NULL; +} + + + + #ifdef HAVE_CHARSET_ucs2 /* Initialize collation weight scanner @@ -6766,7 +6934,7 @@ static uint16 nochar[]= {0,0}; */ static void my_uca_scanner_init_ucs2(my_uca_scanner *scanner, - CHARSET_INFO *cs __attribute__((unused)), + CHARSET_INFO *cs, const uchar *str, size_t length) { scanner->wbeg= nochar; @@ -6777,6 +6945,7 @@ static void my_uca_scanner_init_ucs2(my_ scanner->uca_length= cs->sort_order; scanner->uca_weight= cs->sort_order_big; scanner->contractions= cs->contractions; + scanner->cs= cs; return; } @@ -6865,18 +7034,23 @@ static int my_uca_scanner_next_ucs2(my_u if (scanner->contractions && 
(scanner->sbeg <= scanner->send)) { - int cweight; + my_wc_t wc1= ((scanner->page << 8) | scanner->code); - if (!scanner->page && !scanner->sbeg[0] && - (scanner->sbeg[1] > 0x40) && (scanner->sbeg[1] < 0x80) && - (scanner->code > 0x40) && (scanner->code < 0x80) && - (cweight= scanner->contractions[(scanner->code-0x40)*0x40+scanner->sbeg[1]-0x40])) + if (my_uca_can_be_contraction_head(scanner->cs, wc1)) + { + uint16 *cweight; + my_wc_t wc2= (((my_wc_t) scanner->sbeg[0]) << 8) | scanner->sbeg[1]; + if (my_uca_can_be_contraction_tail(scanner->cs, wc2) && + (cweight= my_uca_contraction2_weight(scanner->cs, + scanner->code, + scanner->sbeg[1]))) { scanner->implicit[0]= 0; scanner->wbeg= scanner->implicit; scanner->sbeg+=2; - return cweight; + return *cweight; } + } } if (!ucaw[scanner->page]) @@ -6959,23 +7133,22 @@ static int my_uca_scanner_next_any(my_uc scanner->code= wc & 0xFF; scanner->sbeg+= mb_len; - if (scanner->contractions && !scanner->page && - (scanner->code > 0x40) && (scanner->code < 0x80)) + if (my_uca_have_contractions(scanner->cs) && + my_uca_can_be_contraction_head(scanner->cs, wc)) { - uint page1, code1, cweight; + my_wc_t wc2; + uint16 *cweight; - if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc, + if (((mb_len= scanner->cs->cset->mb_wc(scanner->cs, &wc2, scanner->sbeg, scanner->send)) >=0) && - (!(page1= (wc >> 8))) && - ((code1= (wc & 0xFF)) > 0x40) && - (code1 < 0x80) && - (cweight= scanner->contractions[(scanner->code-0x40)*0x40 + code1-0x40])) + my_uca_can_be_contraction_tail(scanner->cs, wc2) && + (cweight= my_uca_contraction2_weight(scanner->cs, wc, wc2))) { scanner->implicit[0]= 0; scanner->wbeg= scanner->implicit; scanner->sbeg+= mb_len; - return cweight; + return *cweight; } } @@ -7012,6 +7185,33 @@ static my_uca_scanner_handler my_any_uca my_uca_scanner_next_any }; + + +/** + Helper function: + Find address of weights of the given character. 
+ + @cs Pointer to CHARSET_INFO data + @wc Unicode code point + + @return Weight array + @retval pointer to weight array for the given character, + or NULL if the character's page has no weight table. +*/ + +static inline uint16 * +my_char_weight_addr(CHARSET_INFO *cs, uint wc) +{ + uint page= (wc >> 8); + uint ofst= wc & 0xFF; + return cs->sort_order_big[page] ? + cs->sort_order_big[page] + ofst * cs->sort_order[page] : + NULL; +} + + + /* Compares two strings according to the collation @@ -7683,8 +7883,8 @@ ex: typedef struct my_coll_rule_item_st { - uint base; /* Base character */ - uint curr[2]; /* Current character */ + my_wc_t base; /* Base character */ + my_wc_t curr[2]; /* Current character */ int diff[3]; /* Primary, Secondary and Tertiary difference */ } MY_COLL_RULE; @@ -7834,6 +8034,7 @@ static int my_coll_rule_parse(MY_COLL_RU static my_bool create_tailoring(CHARSET_INFO *cs, void *(*alloc)(size_t)) { MY_COLL_RULE rule[MY_MAX_COLL_RULE]; + MY_COLL_RULE *r, *rfirst, *rlast; char errstr[128]; uchar *newlengths; uint16 **newweights; @@ -7858,6 +8059,9 @@ static my_bool create_tailoring(CHARSET_ return 1; } + rfirst= rule; + rlast= rule + rc; + if (!cs->caseinfo) cs->caseinfo= my_unicase_default; @@ -7941,44 +8145,21 @@ static my_bool create_tailoring(CHARSET_ /* Now process contractions */ if (ncontractions) { - /* - 8K for weights for basic latin letter pairs, - plus 256 bytes for "is contraction part" flags. 
- */ - uint size= 0x40*0x40*sizeof(uint16) + 256; - char *contraction_flags; - if (!(cs->contractions= (uint16*) (*alloc)(size))) - return 1; - bzero((void*)cs->contractions, size); - contraction_flags= ((char*) cs->contractions) + 0x40*0x40; - for (i=0; i < rc; i++) + if (my_uca_alloc_contractions(cs, alloc, ncontractions)) + return 1; + for (r= rfirst; r < rlast; r++) { - if (rule[i].curr[1]) + uint16 *to; + if (r->curr[1]) /* Contraction */ { - uint pageb= (rule[i].base >> 8) & 0xFF; - uint chb= rule[i].base & 0xFF; - uint16 *offsb= defweights[pageb] + chb*deflengths[pageb]; - uint offsc; - - if (offsb[1] || - rule[i].curr[0] < 0x40 || rule[i].curr[0] > 0x7f || - rule[i].curr[1] < 0x40 || rule[i].curr[1] > 0x7f) - { - /* - TODO: add error reporting; - We support only basic latin letters contractions at this point. - Also, We don't support contractions with weight longer than one. - Otherwise, we'd need much more memory. - */ - return 1; - } - offsc= (rule[i].curr[0]-0x40)*0x40+(rule[i].curr[1]-0x40); - - /* Copy base weight applying primary difference */ - cs->contractions[offsc]= offsb[0] + rule[i].diff[0]; - /* Mark both letters as "is contraction part */ - contraction_flags[rule[i].curr[0]]= 1; - contraction_flags[rule[i].curr[1]]= 1; + /* Mark both letters as "is contraction part" */ + my_uca_add_contraction_flag(cs, r->curr[0], MY_UCA_CNT_HEAD); + my_uca_add_contraction_flag(cs, r->curr[1], MY_UCA_CNT_TAIL); + to= my_uca_add_contraction(cs, r->curr, 2)->weight; + /* Copy weight from the reset character */ + to[0]= my_char_weight_addr(cs, r->base)[0]; + /* Apply primary difference */ + to[0]+= r->diff[0]; } } } @@ -8701,6 +8882,39 @@ CHARSET_INFO my_charset_ucs2_hungarian_u }; +CHARSET_INFO my_charset_ucs2_croatian_uca_ci= +{ + 149,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "ucs2", /* cs name */ + "ucs2_croatian_ci", /* name */ + "", /* comment */ + croatian, /* tailoring */ + NULL, /* ctype */ + NULL, /* to_lower */ + NULL, /* 
to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 2, /* mbminlen */ + 2, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_ucs2_handler, + &my_collation_ucs2_uca_handler +}; + + #endif @@ -9358,6 +9572,38 @@ CHARSET_INFO my_charset_utf8_hungarian_u &my_collation_any_uca_handler }; +CHARSET_INFO my_charset_utf8_croatian_uca_ci= +{ + 213,0,0, /* number */ + MY_CS_COMPILED|MY_CS_STRNXFRM|MY_CS_UNICODE, + "utf8", /* cs name */ + "utf8_croatian_ci", /* name */ + "", /* comment */ + croatian, /* tailoring */ + ctype_utf8, /* ctype */ + NULL, /* to_lower */ + NULL, /* to_upper */ + NULL, /* sort_order */ + NULL, /* contractions */ + NULL, /* sort_order_big*/ + NULL, /* tab_to_uni */ + NULL, /* tab_from_uni */ + my_unicase_default, /* caseinfo */ + NULL, /* state_map */ + NULL, /* ident_map */ + 8, /* strxfrm_multiply */ + 1, /* caseup_multiply */ + 1, /* casedn_multiply */ + 1, /* mbminlen */ + 3, /* mbmaxlen */ + 9, /* min_sort_char */ + 0xFFFF, /* max_sort_char */ + ' ', /* pad char */ + 0, /* escape_with_backslash_is_dangerous */ + &my_charset_utf8_handler, + &my_collation_any_uca_handler +}; + #endif /* HAVE_CHARSET_utf8 */ #endif /* HAVE_UCA_COLLATIONS */ === modified file 'strings/ctype-ucs2.c' --- a/strings/ctype-ucs2.c 2009-10-15 21:38:29 +0000 +++ b/strings/ctype-ucs2.c 2009-11-30 12:42:24 +0000 @@ -1526,8 +1526,7 @@ my_bool my_like_range_ucs2(CHARSET_INFO char *min_org=min_str; char *min_end=min_str+res_length; size_t charlen= res_length / cs->mbmaxlen; - const char *contraction_flags= cs->contractions ? 
- ((const char*) cs->contractions) + 0x40*0x40 : NULL; + my_bool have_contractions= my_uca_have_contractions(cs); for ( ; ptr + 1 < end && min_str + 1 < min_end && charlen > 0 ; ptr+=2, charlen--) @@ -1567,8 +1566,9 @@ fill_max_and_min: return 0; } - if (contraction_flags && ptr + 3 < end && - ptr[0] == '\0' && contraction_flags[(uchar) ptr[1]]) + if (have_contractions && ptr + 3 < end && + ptr[0] == '\0' && + my_uca_can_be_contraction_head(cs, (uchar) ptr[1])) { /* Contraction head found */ if (ptr[2] == '\0' && (ptr[3] == w_one || ptr[3] == w_many)) @@ -1581,8 +1581,9 @@ fill_max_and_min: Check if the second letter can be contraction part, and if two letters really produce a contraction. */ - if (ptr[2] == '\0' && contraction_flags[(uchar) ptr[3]] && - cs->contractions[(ptr[1]-0x40)*0x40 + ptr[3] - 0x40]) + if (ptr[2] == '\0' && + my_uca_can_be_contraction_tail(cs, (uchar) ptr[3]) && + my_uca_contraction2_weight(cs,(uchar) ptr[1], (uchar) ptr[3])) { /* Contraction found */ if (charlen == 1 || min_str + 2 >= min_end)
participants (1)
-
Michael Widenius