[Commits] d187214b884: MDEV-21955: Alternative way of doing packed-sort keys
revision-id: d187214b884f3f37eda052dacc701eece6284183 (mariadb-10.5.0-329-gd187214b884) parent(s): b753ac066bc26acda9deb707a31c112f1bbf9ec2 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2020-03-16 22:43:23 +0300 message: MDEV-21955: Alternative way of doing packed-sort keys Use "Variable-Length Space Padded Encoding" that MyRocks uses. The value comparison function is memcmp(). A very crude just to evaluate the performance of this. --- sql/filesort.cc | 185 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- sql/sql_class.h | 5 ++ 2 files changed, 188 insertions(+), 2 deletions(-) diff --git a/sql/filesort.cc b/sql/filesort.cc index 56f060b8c26..e7c8c4c19bd 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -103,7 +103,6 @@ void Sort_param::init_for_filesort(uint sortlen, TABLE *table, max_rows= maxrows; } - void Sort_param::try_to_pack_addons(ulong max_length_for_sort_data) { if (!using_addon_fields() || // no addons, or @@ -2165,6 +2164,9 @@ Type_handler_decimal_result::sort_length(THD *thd, @return Total length of sort buffer in bytes */ +static int get_segment_size_from_collation(const CHARSET_INFO *const cs); +static void rdb_get_mem_comparable_space(const CHARSET_INFO *const cs, + std::vector<uchar> **xfrm); static uint sortlength(THD *thd, Sort_keys *sort_keys, bool *multi_byte_charset, @@ -2195,6 +2197,7 @@ sortlength(THD *thd, Sort_keys *sort_keys, bool *multi_byte_charset, SORT_FIELD_ATTR::VARIABLE_SIZE : SORT_FIELD_ATTR::FIXED_SIZE; sortorder->cs= cs; + sortorder->space_xfrm= NULL; if (use_strnxfrm((cs=sortorder->field->sort_charset()))) { @@ -2207,6 +2210,10 @@ sortlength(THD *thd, Sort_keys *sort_keys, bool *multi_byte_charset, sortorder->length_bytes= number_storage_requirement(MY_MIN(sortorder->original_length, thd->variables.max_sort_length)); + //psergey-try: + sortorder->m_segment_size= get_segment_size_from_collation(cs); + rdb_get_mem_comparable_space(cs, &sortorder->space_xfrm); + sortorder->m_n_weights= sortorder->field->char_length(); } if ((sortorder->maybe_null= sortorder->field->maybe_null())) @@ -2215,6 +2222,7 @@ sortlength(THD *thd, Sort_keys *sort_keys, bool *multi_byte_charset, else { CHARSET_INFO *cs; + sortorder->space_xfrm= NULL; sortorder->item->type_handler()->sort_length(thd, sortorder->item, sortorder); sortorder->type= sortorder->item->type_handler()->is_packable() ? @@ -2228,6 +2236,7 @@ sortlength(THD *thd, Sort_keys *sort_keys, bool *multi_byte_charset, if (sortorder->is_variable_sized() && allow_packing_for_keys) { allow_packing_for_keys= sortorder->check_if_packing_possible(thd); + allow_packing_for_keys= false; // psergey-new sortorder->length_bytes= number_storage_requirement(MY_MIN(sortorder->original_length, thd->variables.max_sort_length)); @@ -2479,7 +2488,6 @@ SORT_INFO::~SORT_INFO() DBUG_VOID_RETURN; } - void Sort_param::try_to_pack_sortkeys() { #ifdef WITHOUT_PACKED_SORT_KEYS @@ -2850,7 +2858,27 @@ int SORT_FIELD_ATTR::compare_packed_fixed_size_vals(uchar *a, size_t *a_len, <0 key a_ptr less than b_ptr */ +int compare_packed_sort_keys(void *sort_param, + unsigned char **a_ptr, unsigned char **b_ptr) +{ + uchar *a= *a_ptr; + uchar *b= *b_ptr; + // length was saved with this: + // Sort_keys::store_sortkey_length(orig_to, length); + int a_len= uint4korr(a); + int b_len= uint4korr(b); + a+= Sort_keys::size_of_length_field; + b+= Sort_keys::size_of_length_field; + + int size= std::min(a_len, b_len); + int res; + if ((res= memcmp(a, b, size))) + return res; + return (a_len < b_len)? -1 : ( (a_len == b_len)? 0 : 1); + +} +#if 0 int compare_packed_sort_keys(void *sort_param, unsigned char **a_ptr, unsigned char **b_ptr) { @@ -2886,7 +2914,106 @@ int compare_packed_sort_keys(void *sort_param, retval= memcmp(a, b, param->res_length); return retval; } +#endif + + +///////////////////////////////////////////////////////////////////////////// +// psergey-try: +///////////////////////////////////////////////////////////////////////////// + +/* + Compare the string in [buf..buf_end) with a string that is an infinite + sequence of strings in space_xfrm +*/ + +static int rdb_compare_string_with_spaces( + const uchar *buf, const uchar *const buf_end, + const std::vector<uchar> *const space_xfrm) { + int cmp = 0; + while (buf < buf_end) { + size_t bytes = std::min((size_t)(buf_end - buf), space_xfrm->size()); + if ((cmp = memcmp(buf, space_xfrm->data(), bytes)) != 0) break; + buf += bytes; + } + return cmp; +} + +static int get_segment_size_from_collation(const CHARSET_INFO *const cs) { + int ret; + + enum collations_used { + COLLATION_UTF8MB4_BIN = 46, + COLLATION_LATIN1_BIN = 47, + COLLATION_UTF16LE_BIN = 55, + COLLATION_UTF32_BIN = 61, + COLLATION_UTF16_BIN = 62, + COLLATION_BINARY = 63, + COLLATION_UTF8_BIN = 83 + }; + + if (cs->number == COLLATION_UTF8MB4_BIN || cs->number == COLLATION_UTF16_BIN || + cs->number == COLLATION_UTF16LE_BIN || cs->number == COLLATION_UTF32_BIN) { + /* + In these collations, a character produces one weight, which is 3 bytes. + Segment has 3 characters, add one byte for VARCHAR_CMP_* marker, and we + get 3*3+1=10 + */ + ret = 10; + } else { + /* + All other collations. There are two classes: + - Unicode-based, except for collations mentioned in the if-condition. + For these all weights are 2 bytes long, a character may produce 0..8 + weights. + in any case, 8 bytes of payload in the segment guarantee that the last + space character won't span across segments. + + - Collations not based on unicode. These have length(strxfrm(' '))=1, + there nothing to worry about. + + In both cases, take 8 bytes payload + 1 byte for VARCHAR_CMP* marker. + */ + ret = 9; + } + return ret; +} + +const int RDB_SPACE_XFRM_SIZE = 20; + +static void rdb_get_mem_comparable_space(const CHARSET_INFO *const cs, + std::vector<uchar> **xfrm) { + //size_t *const xfrm_len, + //size_t *const mb_len) { + + // Upper bound of how many bytes can be occupied by multi-byte form of a + // character in any charset. + const int MAX_MULTI_BYTE_CHAR_SIZE = 4; + DBUG_ASSERT(cs->mbmaxlen <= MAX_MULTI_BYTE_CHAR_SIZE); + + // multi-byte form of the ' ' (space) character + uchar space_mb[MAX_MULTI_BYTE_CHAR_SIZE]; + + const size_t space_mb_len = cs->wc_mb( + (my_wc_t)cs->pad_char, space_mb, space_mb + sizeof(space_mb)); + // mem-comparable image of the space character + std::array<uchar, 20> space; + + const size_t space_len = cs->strnxfrm( + space.data(), sizeof(space), 1, space_mb, space_mb_len, 0); + + std::vector<uchar> *p= new std::vector<uchar>(space.data(), + space.data() + space_len); + while (p->size() < RDB_SPACE_XFRM_SIZE) { + p->insert(p->end(), space.data(), + space.data() + space_len); + } + + *xfrm = p; + //*xfrm = &rdb_mem_comparable_space[cs->number]->spaces_xfrm; + //*xfrm_len = rdb_mem_comparable_space[cs->number]->space_xfrm_len; + //*mb_len = rdb_mem_comparable_space[cs->number]->space_mb_len; +} /* @brief @@ -2911,6 +3038,59 @@ uint SORT_FIELD_ATTR::pack_sort_string(uchar *to, const LEX_CSTRING &str, CHARSET_INFO *cs) const { + const int VARCHAR_CMP_LESS_THAN_SPACES = 1; + const int VARCHAR_CMP_EQUAL_TO_SPACES = 2; + const int VARCHAR_CMP_GREATER_THAN_SPACES = 3; + + const size_t trimmed_len = cs->lengthsp(str.str, str.length); + uchar buf_space[1000]; + uchar *buf=buf_space; + const size_t xfrm_len = cs->strnxfrm((char*)buf, sizeof(buf_space), + m_n_weights, + str.str, trimmed_len, + 0/*flags*/); + + /* Got a mem-comparable image in 'buf'. Now, produce varlength encoding */ + uchar *const buf_end = buf + xfrm_len; + + size_t encoded_size = 0; + uchar *ptr = to; + size_t padding_bytes; + while (true) { + const size_t copy_len = + std::min<size_t>(m_segment_size - 1, buf_end - buf); + padding_bytes = m_segment_size - 1 - copy_len; + memcpy(ptr, buf, copy_len); + ptr += copy_len; + buf += copy_len; + + if (padding_bytes) { + memcpy(ptr, space_xfrm->data(), padding_bytes); + ptr += padding_bytes; + *ptr = VARCHAR_CMP_EQUAL_TO_SPACES; // last segment + } else { + // Compare the string suffix with a hypothetical infinite string of + // spaces. It could be that the first difference is beyond the end of + // current chunk. + const int cmp = + rdb_compare_string_with_spaces(buf, buf_end, space_xfrm); + + if (cmp < 0) { + *ptr = VARCHAR_CMP_LESS_THAN_SPACES; + } else if (cmp > 0) { + *ptr = VARCHAR_CMP_GREATER_THAN_SPACES; + } else { + // It turns out all the rest are spaces. + *ptr = VARCHAR_CMP_EQUAL_TO_SPACES; + } + } + encoded_size += m_segment_size; + + if (*(ptr++) == VARCHAR_CMP_EQUAL_TO_SPACES) break; + } + + return encoded_size; +#if 0 uchar *orig_to= to; size_t length, data_length; length= str.length; @@ -2934,6 +3114,7 @@ SORT_FIELD_ATTR::pack_sort_string(uchar *to, const LEX_CSTRING &str, to+= suffix_length; } return static_cast<uint>(to - orig_to); +#endif } diff --git a/sql/sql_class.h b/sql/sql_class.h index d756b047ffe..957d8d26a7c 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -6315,6 +6315,11 @@ struct SORT_FIELD_ATTR */ bool maybe_null; CHARSET_INFO *cs; + + uint m_segment_size; //psergey-try: + uint m_n_weights; // psergey-try + std::vector<uchar> *space_xfrm; + uint pack_sort_string(uchar *to, const LEX_CSTRING &str, CHARSET_INFO *cs) const; int compare_packed_fixed_size_vals(uchar *a, size_t *a_len,
participants (1)
-
psergey