Re: [Maria-developers] e3f45b2f9ea: MDEV-10267 Add ngram fulltext parser plugin
Hi, Rinat! On Oct 19, Rinat Ibragimov wrote:
revision-id: e3f45b2f9ea (mariadb-10.2.31-368-ge3f45b2f9ea) parent(s): 44c48cefdbe author: Rinat Ibragimov <ri@tempesta-tech.com> committer: Rinat Ibragimov <ri@tempesta-tech.com> timestamp: 2020-08-25 01:59:06 +0300 message:
MDEV-10267 Add ngram fulltext parser plugin
Ngram is a fulltext parser plugin that splits words into overlapping segments of fixed lengths. For example, if 3-grams are in use, string "abcdef" is split into "abc", "bcd", "cde", "def", thus allowing an efficient substring search for CJK texts where splitting by words in not feasible. Only word characters are going into n-grams. Spaces and punctuation characters are treated as separators.
diff --git a/plugin/fulltext/CMakeLists.txt b/plugin/fulltext/CMakeLists.txt index 3c37666cb3e..9704733a9aa 100644 --- a/plugin/fulltext/CMakeLists.txt +++ b/plugin/fulltext/CMakeLists.txt @@ -18,3 +18,7 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG")
MYSQL_ADD_PLUGIN(ftexample plugin_example.c MODULE_ONLY MODULE_OUTPUT_NAME "mypluglib" COMPONENT Test) + +# n-gram parser +MYSQL_ADD_PLUGIN(ftngram ngram_parser/plugin_ngram.cc + MODULE_ONLY MODULE_OUTPUT_NAME "fulltext_ngram")
why to put the plugin under plugin/fulltext/ngram_parser/ ? it's only one file, just put it in plugin/fulltext/
diff --git a/plugin/fulltext/mysql-test/ngram/disabled.def b/plugin/fulltext/mysql-test/ngram/disabled.def new file mode 100644 index 00000000000..0ca9dd33ef7 --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/disabled.def @@ -0,0 +1 @@ +ngram_gb18030 : GB18030 character set is not supported yet.
I don't think there's any need to add tests for unsupported charsets. The plugin is trivial, no need to test it for every possible charset that we have and don't have.
diff --git a/plugin/fulltext/mysql-test/ngram/ngram_basic.inc b/plugin/fulltext/mysql-test/ngram/ngram_basic.inc --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/ngram_basic.inc @@ -0,0 +1,25 @@ +--echo +--eval CREATE TABLE t (lang TEXT, str TEXT, FULLTEXT(str) WITH PARSER ngram) ENGINE=$engine CHARSET $charset + +# Some character are not possible to represent in some encodings. Ignore them. +--disable_warnings +--disable_query_log +# http://www.gerolf.org/doc/polyglot/polyglot.htm. +INSERT IGNORE INTO t VALUES + ('Chinese', '他们为什么不说中文?'), + ('English', 'Why can''t they just speak English?'), + ('Japanese', 'なぜ、みんな日本語を話してくれないのか?'), + ('Korean', '세계의 모든 사람들이 한국어를 이해한다면 얼마나 좋을까?'); +--enable_query_log +--enable_warnings + +SELECT * FROM t ORDER BY lang; + +SELECT * FROM t WHERE MATCH(str) AGAINST('之一' IN BOOLEAN MODE) ORDER BY lang; +SELECT * FROM t WHERE MATCH(str) AGAINST('数据' IN BOOLEAN MODE) ORDER BY lang; +SELECT * FROM t WHERE MATCH(str) AGAINST('zzz' IN BOOLEAN MODE) ORDER BY lang; +SELECT * FROM t WHERE MATCH(str) AGAINST('gli' IN BOOLEAN MODE) ORDER BY lang; +SELECT * FROM t WHERE MATCH(str) AGAINST('ない' IN BOOLEAN MODE) ORDER BY lang; +SELECT * FROM t WHERE MATCH(str) AGAINST('람들' IN BOOLEAN MODE) ORDER BY lang; + +DROP TABLE t; diff --git a/plugin/fulltext/mysql-test/ngram/ngram_basic.test b/plugin/fulltext/mysql-test/ngram/ngram_basic.test new file mode 100644 index 00000000000..eaa8829c3ee --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/ngram_basic.test @@ -0,0 +1,25 @@ +# Ngram fulltext parser basic functionality. + +SET NAMES utf8mb4; +SELECT @@ngram_token_size; + +let $engine_index= 3; +while ($engine_index) { + if ($engine_index == 3) { let $engine= InnoDB; } + if ($engine_index == 2) { let $engine= MyISAM; } + if ($engine_index == 1) { let $engine= Aria; } + + let $charset_index= 6; + while ($charset_index) { + if ($charset_index == 6) { let $charset= utf8mb4; } + if ($charset_index == 5) { let $charset= big5; } + if ($charset_index == 4) { let $charset= gbk; } + if ($charset_index == 3) { let $charset= gb2312; } + if ($charset_index == 2) { let $charset= sjis; } + if ($charset_index == 1) { let $charset= ujis; } + + source ngram_basic.inc; + dec $charset_index; + }
eh. unroll the loop please, write let $engine= InnoDB; let $charset= utf8mb4; source ngram_basic.inc; let $charset= big5; source ngram_basic.inc; ... Or, better: CREATE TABLE t (lang TEXT, str TEXT, FULLTEXT(str) WITH PARSER ngram) ENGINE=InnoDB CHARSET utf8mb4; source ngram_basic.inc; CREATE TABLE t (lang TEXT, str TEXT, FULLTEXT(str) WITH PARSER ngram) ENGINE=InnoDB CHARSET big5; source ngram_basic.inc; ...
+ dec $engine_index; +} diff --git a/plugin/fulltext/mysql-test/ngram/ngram_missing_plugin.result b/plugin/fulltext/mysql-test/ngram/ngram_missing_plugin.result new file mode 100644 index 00000000000..aa17b1d6053 --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/ngram_missing_plugin.result @@ -0,0 +1,23 @@
there are tests for the missing fulltest parser plugin. they don't depend on the individial plugin (that is, btw, *not installed*). No need to repeat them for every plugin, remove this test.
+SET NAMES utf8mb4; +CREATE TABLE t (str TEXT, FULLTEXT(str) WITH PARSER ngram) +ENGINE=InnoDB; +INSERT INTO t VALUES ('Hello World'), ('abcdef ghijklm'); +# Restart with ngram plugin disabled. +SET NAMES utf8mb4; +SELECT * FROM t WHERE MATCH(str) AGAINST('bcd') ORDER BY str; +ERROR HY000: Plugin 'ngram' is not loaded +SELECT * FROM t ORDER BY str; +ERROR HY000: Plugin 'ngram' is not loaded +INSERT INTO t VALUES ('something'); +ERROR HY000: Plugin 'ngram' is not loaded +# Restart with ngram plugin enabled again. +SET NAMES utf8mb4; +SELECT * FROM t WHERE MATCH(str) AGAINST('bcd') ORDER BY str; +str +abcdef ghijklm +SELECT * FROM t ORDER BY str; +str +abcdef ghijklm +Hello World +INSERT INTO t VALUES ('something'); +DROP TABLE t; diff --git a/plugin/fulltext/mysql-test/ngram/ngram_token_size.result b/plugin/fulltext/mysql-test/ngram/ngram_token_size.result new file mode 100644 index 00000000000..7043e147d5b --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/ngram_token_size.result @@ -0,0 +1,109 @@ +# Check 'ngram_token_size' variable. +# +# It's a global variable. +SELECT @@global.ngram_token_size; +@@global.ngram_token_size +2 +SHOW GLOBAL VARIABLES LIKE 'ngram_token_size'; +Variable_name Value +ngram_token_size 2 +SELECT * FROM information_schema.global_variables WHERE variable_name='ngram_token_size'; +VARIABLE_NAME VARIABLE_VALUE +NGRAM_TOKEN_SIZE 2 +# +# Not a session variable. +SELECT @@session.ngram_token_size; +ERROR HY000: Variable 'ngram_token_size' is a GLOBAL variable +SHOW SESSION VARIABLES LIKE 'ngram_token_size'; +Variable_name Value +ngram_token_size 2 +SELECT * FROM information_schema.session_variables WHERE variable_name='ngram_token_size'; +VARIABLE_NAME VARIABLE_VALUE +NGRAM_TOKEN_SIZE 2 +# +# 0 is outside allowed range [1, 10], so it's changed to 1. +SET GLOBAL ngram_token_size= 0; +Warnings: +Warning 1292 Truncated incorrect ngram_token_size value: '0' +SELECT * FROM information_schema.global_variables WHERE variable_name='ngram_token_size'; +VARIABLE_NAME VARIABLE_VALUE +NGRAM_TOKEN_SIZE 1 +# +# 10 is an acceptable value. +SET GLOBAL ngram_token_size= 10; +SELECT * FROM information_schema.global_variables WHERE variable_name='ngram_token_size'; +VARIABLE_NAME VARIABLE_VALUE +NGRAM_TOKEN_SIZE 10 +# +# 1 is an acceptable value. +SET GLOBAL ngram_token_size= 1; +SELECT * FROM information_schema.global_variables WHERE variable_name='ngram_token_size'; +VARIABLE_NAME VARIABLE_VALUE +NGRAM_TOKEN_SIZE 1 +# +# 11 is outside allowed range [1, 10], so it's changed to 10. +SET GLOBAL ngram_token_size= 11; +Warnings: +Warning 1292 Truncated incorrect ngram_token_size value: '11' +SELECT * FROM information_schema.global_variables WHERE variable_name='ngram_token_size'; +VARIABLE_NAME VARIABLE_VALUE +NGRAM_TOKEN_SIZE 10
^^^^ all tests above aren't needed, it's enough to do query_vertical select * from information_schema.system_variables where variable_name='ngram_token_size'; all the behavior above is implemented outside of the plugin, no need to test it.
+# +# Try 3-grams. +SET GLOBAL ngram_token_size= 3; +CREATE TABLE t (str TEXT, FULLTEXT INDEX (str) WITH PARSER ngram) Engine=InnoDB; +INSERT INTO t VALUES +('Lorem ipsum dolor sit amet, consectetur adipiscing elit.'), +('Maecenas non pellentesque odio.'), +('Sed fermentum egestas sapien vel consequat.'); +# Only one row contains substring 'api'. +SELECT str FROM t WHERE MATCH(str) AGAINST('api'); +str +Sed fermentum egestas sapien vel consequat. +# 'lent' is parsed into ('len', 'ent') list. Two rows contain at least one of the tokens. +SELECT str FROM t WHERE MATCH(str) AGAINST('lent*'); +str +Maecenas non pellentesque odio. +Sed fermentum egestas sapien vel consequat. +# 'zzz' is not present anywhere, so result is empty. +SELECT str FROM t WHERE MATCH(str) AGAINST('zzzz'); +str +# 'piz' cannot be found anywhere, but 'api' is in one of the rows. +SELECT str FROM t WHERE MATCH(str) AGAINST('apiz'); +str +Sed fermentum egestas sapien vel consequat. +# +# Try switching to 2-grams while keeping the index. +SET GLOBAL ngram_token_size= 2; +# 'api' is now parsed into ('ap', 'pi'). Since index only contains 3-grams, nothing is found. +SELECT str FROM t WHERE MATCH(str) AGAINST('api'); +str +# +# Add some data while ngram size is set to 2. +INSERT INTO t VALUES('1234 56789'); +# +# Now index contains 3-grams from the rows added before and a number of +# 2-grams from the just added row. +# Querying for 'api' still returns nothing, but line with '678' is found. +SELECT str from t WHERE MATCH(str) AGAINST('api'); +str +SELECT str FROM t WHERE MATCH(str) AGAINST('678'); +str +1234 56789 +# +# Switching back to 3-grams make intitially added rows searchable. +SET GLOBAL ngram_token_size= 3; +SELECT str FROM t WHERE MATCH(str) AGAINST('api'); +str +Sed fermentum egestas sapien vel consequat. +# +# But row with '678' in it cannot be found, as it was added with 2-grams. +SELECT str FROM t WHERE MATCH(str) AGAINST('678'); +str +# +# Try matching in boolean mode. +SELECT str FROM t WHERE MATCH(str) AGAINST('+ent -api' IN BOOLEAN MODE); +str +Maecenas non pellentesque odio. +# +DROP TABLE t; diff --git a/plugin/fulltext/mysql-test/ngram/ngram_ucs2.result b/plugin/fulltext/mysql-test/ngram/ngram_ucs2.result new file mode 100644 index 00000000000..9d82743bd24 --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/ngram_ucs2.result @@ -0,0 +1,8 @@ +CREATE TABLE t (str TEXT, FULLTEXT INDEX (str) WITH PARSER ngram) Engine=InnoDB CHARSET=UCS2; +ERROR HY000: Column 'str' cannot be part of FULLTEXT index +CREATE TABLE t (str TEXT, FULLTEXT INDEX (str) WITH PARSER ngram) Engine=InnoDB CHARSET=UTF16; +ERROR HY000: Column 'str' cannot be part of FULLTEXT index +CREATE TABLE t (str TEXT, FULLTEXT INDEX (str) WITH PARSER ngram) Engine=InnoDB CHARSET=UTF16LE; +ERROR HY000: Column 'str' cannot be part of FULLTEXT index +CREATE TABLE t (str TEXT, FULLTEXT INDEX (str) WITH PARSER ngram) Engine=InnoDB CHARSET=UTF32; +ERROR HY000: Column 'str' cannot be part of FULLTEXT index
same as above. This is fulltext index limitation, it does not depend on the parser plugin. The plugin doesn't need to test for it.
diff --git a/plugin/fulltext/mysql-test/ngram/suite.opt b/plugin/fulltext/mysql-test/ngram/suite.opt new file mode 100644 index 00000000000..ab37cea996e --- /dev/null +++ b/plugin/fulltext/mysql-test/ngram/suite.opt @@ -0,0 +1,4 @@ +--plugin-load-add=$FULLTEXT_NGRAM_SO +--loose-innodb
no need to enable innodb explicitly, `source have_innodb.inc` does that.
+--loose-enable-innodb_ft_index_table +--loose-innodb_ft_index_cache
why did you enable these two plugins if you don't use them in your tests?
diff --git a/plugin/fulltext/ngram_parser/plugin_ngram.cc b/plugin/fulltext/ngram_parser/plugin_ngram.cc new file mode 100644 index 00000000000..80498c3299b --- /dev/null +++ b/plugin/fulltext/ngram_parser/plugin_ngram.cc @@ -0,0 +1,256 @@ +/* + Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved. + Copyright (c) 2020, MariaDB. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is also distributed with certain software (including + but not limited to OpenSSL) that is licensed under separate terms, + as designated in a particular file or component or in included license + documentation. The authors of MySQL hereby grant you an additional + permission to link the program and your derivative works with the + separately licensed software that they have included with MySQL. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include <stddef.h> +#include <limits.h> +#include <my_attribute.h> +#include <my_global.h> +#include "../storage/innobase/include/fts0tokenize.h"
No, the plugin should be self-contained and not include anything from other plugins. At least if a plugin is as trivial as that one.
+ +#define U_LOW_LINE 0x5f // Code point for underscore '_'. +#define is_alnum(ct) ((ct) & (_MY_U | _MY_L | _MY_NMR)) + +static int ngram_token_size; + +static MYSQL_SYSVAR_INT(token_size, ngram_token_size, + 0, + "Ngram full text plugin parser token size in characters", + NULL, NULL, 2, 1, 10, 0); + +static int my_ci_charlen(CHARSET_INFO *cs, const uchar *str, const uchar *end) +{ + return my_charlen(cs, reinterpret_cast<const char *>(str), + reinterpret_cast<const char *>(end)); +} + +static int my_ci_ctype(CHARSET_INFO *cs, int *char_type, const uchar *str, + const uchar *end) +{ + return cs->cset->ctype(cs, char_type, str, end); +} + +static bool is_underscore(CHARSET_INFO *cs, const uchar *str, const uchar *end) +{ + my_wc_t wc; + cs->cset->mb_wc(cs, &wc, str, end); + return wc == U_LOW_LINE; +} + +// Splits a string into ngrams and emits them. +static int split_into_ngrams(MYSQL_FTPARSER_PARAM *param, const uchar *doc, + int len, MYSQL_FTPARSER_BOOLEAN_INFO *bool_info) +{ + const CHARSET_INFO *cs= param->cs; + const uchar *start= doc; + const uchar *end= doc + len; + const uchar *next= start; + int n_chars= 0; + int ret= 0; + bool is_first= true; + + while (next < end) { + int char_type; + int char_len= my_ci_ctype(cs, &char_type, next, end); + + // Broken data? + if (next + char_len > end || char_len == 0) + break; + + // Avoid creating partial n-grams by stopping at word boundaries. + // Underscore is treated as a word character too, since identifiers in + // programming languages often contain them. + if (!is_alnum(char_type) && !is_underscore(cs, next, end)) { + start= next + char_len; + next= start; + n_chars= 0; + continue;
I don't think it's a good idea. On the opposite, I would rather create ngrams over word boundaries, so that, say, I'd split "n-gram plugin" to "n-g", "-gr", "gra", "ram", "am ", "m p", " pl", "plu", "lug", "ugi", "gin". May be better would be to split it into "n-gr", "gra", "ram", "am p", "m pl", "plu", "lug", "ugi", "gin". (that is, always three letters in an n-gram, not three characters). This needs testing to find out what approach works better.
+ } + + next += char_len; + n_chars++; + + if (n_chars == ngram_token_size) { + bool_info->position= static_cast<uint>(start - doc);
nope, we don't have MYSQL_FTPARSER_FULL_BOOLEAN_INFO::position and this plugin is definitely not the reason to break the API and add it.
+ ret= param->mysql_add_word(param, reinterpret_cast<const char *>(start), + static_cast<int>(next - start), bool_info); + if (ret != 0) + return ret; + + start += my_ci_charlen(cs, start, end);
why do you scan the string again here, you've already counted character by character above?
+ n_chars= ngram_token_size - 1; + is_first= false; + } + } + + // If nothing was emitted yet, emit the remainder.
what reminder, what is it for?
+ switch (param->mode) { + case MYSQL_FTPARSER_FULL_BOOLEAN_INFO: + case MYSQL_FTPARSER_WITH_STOPWORDS: + if (n_chars > 0 && is_first) { + assert(next > start); + assert(n_chars < ngram_token_size); + + bool_info->position= static_cast<uint>(start - doc); + ret= param->mysql_add_word(param, reinterpret_cast<const char *>(start), + static_cast<int>(next - start), bool_info); + } + break; + + default: + break; + } + + return ret; +} + +static int number_of_chars(const CHARSET_INFO *cs, const uchar *buf, int len) +{ + const uchar *ptr= buf; + const uchar *end= buf + len; + int size= 0; + + while (ptr < end) { + ptr += my_ci_charlen(cs, ptr, end); + size += 1; + } + + return size; +} + +// Convert term into phrase and handle wildcard. +static int ngram_term_convert(MYSQL_FTPARSER_PARAM *param, const uchar *token, + int len, MYSQL_FTPARSER_BOOLEAN_INFO *bool_info) +{ + MYSQL_FTPARSER_BOOLEAN_INFO token_info= + { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0 }; + const CHARSET_INFO *cs= param->cs; + int ret= 0; + + assert(bool_info->type == FT_TOKEN_WORD); + assert(bool_info->quot == NULL); + + /* Convert rules: + * 1. if term with wildcard and term length is less than ngram_token_size, + * we keep it as normal term search. + * 2. otherwise, term is converted to phrase and wildcard is ignored. + * e.g. 'abc' and 'abc*' are both equivalent to '"ab bc"'. + */ + + if (bool_info->trunc && number_of_chars(cs, token, len) < ngram_token_size) { + ret= param->mysql_add_word(param, reinterpret_cast<const char *>(token), + len, bool_info); + } else { + bool_info->type= FT_TOKEN_LEFT_PAREN; + bool_info->quot= reinterpret_cast<char*>(1); + + ret= param->mysql_add_word(param, NULL, 0, bool_info); + if (ret != 0) + return ret; + + ret= split_into_ngrams(param, token, len, &token_info); + if (ret != 0) + return ret; + + bool_info->type= FT_TOKEN_RIGHT_PAREN; + ret= param->mysql_add_word(param, NULL, 0, bool_info); + + assert(bool_info->quot == NULL); + bool_info->type= FT_TOKEN_WORD; + } + + return ret; +} + +static int ngram_parser_parse(MYSQL_FTPARSER_PARAM *param) +{ + MYSQL_FTPARSER_BOOLEAN_INFO bool_info= + { FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0 }; + const CHARSET_INFO *cs= param->cs; + uchar *start= reinterpret_cast<uchar *>(const_cast<char *>(param->doc)); + uchar *end= start + param->length; + FT_WORD word= {NULL, 0, 0}; + int ret= 0; + + switch (param->mode) { + case MYSQL_FTPARSER_SIMPLE_MODE: + case MYSQL_FTPARSER_WITH_STOPWORDS: + ret= split_into_ngrams(param, start, static_cast<int>(end - start), + &bool_info); + break; + + case MYSQL_FTPARSER_FULL_BOOLEAN_INFO: + // Reusing InnoDB's boolean mode parser to handle all special characters. + while (fts_get_word(cs, &start, end, &word, &bool_info)) {
nope. InnoDB might not even be compiled it. Use param->mysql_parse()
+ if (bool_info.type == FT_TOKEN_WORD) { + if (bool_info.quot != NULL) { + ret= split_into_ngrams(param, word.pos, word.len, &bool_info); + } else { + ret= ngram_term_convert(param, word.pos, word.len, &bool_info); + } + } else { + ret= param->mysql_add_word(param, reinterpret_cast<char*>(word.pos), + word.len, &bool_info); + } + + if (ret != 0) + return ret; + } + + break; + } + + return ret; +} + +static struct st_mysql_ftparser ngram_parser_descriptor= +{ + MYSQL_FTPARSER_INTERFACE_VERSION, + ngram_parser_parse, + NULL, + NULL, +}; + +static struct st_mysql_sys_var *ngram_system_variables[]= +{ + MYSQL_SYSVAR(token_size), + NULL +}; + +maria_declare_plugin(ngram_parser) +{ + MYSQL_FTPARSER_PLUGIN, // Type. + &ngram_parser_descriptor, // Descriptor. + "ngram", // Name. + "Oracle Corp", // Author. + "Ngram Full-Text Parser", // Description. + PLUGIN_LICENSE_GPL, // License. + NULL, // Initialization function. + NULL, // Deinitialization function. + 0x0100, // Numeric version. + NULL, // Status variables. + ngram_system_variables, // System variables. + "1.0", // String version representation. + MariaDB_PLUGIN_MATURITY_EXPERIMENTAL, // Maturity. +} +maria_declare_plugin_end;
Regards, Sergei VP of MariaDB Server Engineering and security@mariadb.org
participants (1)
-
Sergei Golubchik