developers
Threads by month
- ----- 2025 -----
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
November 2018
- 11 participants
- 37 discussions
23 Jul '19
Hello!
While trying to release 10.0.26 for Debian and Ubuntu I noticed git
chokes again on Windows line endings. As our .gitattributes states
that by default git should clean up the line endings, why do we keep
getting new Windows line endings in new code?
mariadb-10.0$ cat .gitattributes
# Set the default behavior, in case people don't have core.autocrlf set.
* text=auto
...
storage/connect/mysql-test/connect/std_data/*.txt -text
mariadb-10.0.26$ find . | xargs file | grep CRLF
./storage/connect/mysql-test/connect/std_data/boyswin.txt:
ASCII text, with CRLF line terminators
./storage/connect/mysql-test/connect/std_data/expenses.txt:
ASCII text, with CRLF line terminators
./storage/connect/mysql-test/connect/std_data/emp.txt:
ASCII text, with CRLF line terminators
./mysql-test/std_data/loaddata7.dat:
ASCII text, with
CRLF line terminators
./mysql-test/r/loadxml.result:
ASCII text, with
CRLF, LF line terminators
./mysql-test/r/perror-win.result:
ASCII text, with
CRLF, LF line terminators
./mysql-test/r/mysql_binary_mode.result:
ASCII English
text, with CRLF, LF line terminators
./mysql-test/r/func_regexp_pcre.result:
UTF-8 Unicode C++
program text, with CRLF, CR, LF line terminators
./pcre/testdata/grepoutputN:
ASCII text, with CRLF, CR, LF line terminators
./pcre/testdata/greppatN4:
ASCII text, with CRLF line terminators
This leads for me to:
dpkg-source: info: building mariadb-10.0 using existing
./mariadb-10.0_10.0.26.orig.tar.gz
dpkg-source: info: local changes detected, the modified files are:
mariadb-10.0/storage/connect/mysql-test/connect/std_data/boyswin.txt
mariadb-10.0/storage/connect/mysql-test/connect/std_data/emp.txt
mariadb-10.0/storage/connect/mysql-test/connect/std_data/expenses.txt
dpkg-source: error: aborting due to unexpected upstream changes, see
/tmp/mariadb-10.0_10.0.26-1.diff.Yzwwhw
Which means I have to do extra work with importing the upstream sources..
- Otto
2
4
Hello,
So, MDEV-16934 introduced eq_range_index_dives_limit into 10.2.8 and 10.3.0.
The default was set to 0 (which means no limit) in order to not introduce
optimizer behavior change into stable versions.
The question is: should 10.4 also have 0 by default or we can set it to some
finite limit? MySQL's default value is 10.
BR
Sergei
--
Sergei Petrunia, Software Developer
MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog
3
2
Re: [Maria-developers] d282f5c5560: MDEV-10963 Fragmented BINLOG query
by Sergei Golubchik 18 Dec '18
by Sergei Golubchik 18 Dec '18
18 Dec '18
Hi, Andrei!
Looks better!
There are no major problems, but see comments below. There're few
suggestions how to simplify the code.
On Nov 05, Andrei Elkin wrote:
> revision-id: d282f5c55609469cd74d7390f70c7d922c778711 (mariadb-10.1.35-93-gd282f5c5560)
> parent(s): 2a576f71c5d3c7aacef564e5b1251f83bde48f51
> author: Andrei Elkin <andrei.elkin(a)mariadb.com>
> committer: Andrei Elkin <andrei.elkin(a)mariadb.com>
> timestamp: 2018-10-21 23:42:00 +0300
> message:
>
> MDEV-10963 Fragmented BINLOG query
>
> diff --git a/mysql-test/suite/binlog/t/binlog_mysqlbinlog_row_frag.test b/mysql-test/suite/binlog/t/binlog_mysqlbinlog_row_frag.test
> new file mode 100644
> index 00000000000..bdf41c94c76
> --- /dev/null
> +++ b/mysql-test/suite/binlog/t/binlog_mysqlbinlog_row_frag.test
> @@ -0,0 +1,50 @@
> +--source include/have_debug.inc
> +--source include/have_log_bin.inc
> +--source include/have_binlog_format_row.inc
you don't need to include have_log_bin, if you include
have_binlog_format_row.
> +
> +--let $MYSQLD_DATADIR= `select @@datadir`
> +--let $max_size=1024
> +
> +CREATE TABLE t (a TEXT);
> +# events of interest are guaranteed to stay in 000001 log
> +RESET MASTER;
> +--eval INSERT INTO t SET a=repeat('a', $max_size)
eh? why did you do it with let/eval instead of a simple sql statement?
you don't use $max_size anywhere else.
> +SELECT a from t into @a;
> +FLUSH LOGS;
> +DELETE FROM t;
> +
> +--exec $MYSQL_BINLOG --debug-binlog-row-event-max-encoded-size=256 $MYSQLD_DATADIR/master-bin.000001 > $MYSQLTEST_VARDIR/tmp/mysqlbinlog.sql
> +
> +--let $assert_text= BINLOG is fragmented
> +--let $assert_select= BINLOG @binlog_fragment_0, @binlog_fragment_1
> +--let $assert_count= 1
> +--let $assert_file= $MYSQLTEST_VARDIR/tmp/mysqlbinlog.sql
> +--source include/assert_grep.inc
no, please, use search_pattern_in_file.inc instead.
> +
> +--exec $MYSQL test < $MYSQLTEST_VARDIR/tmp/mysqlbinlog.sql
> +
> +SELECT a LIKE @a as 'true' FROM t;
> +SELECT @binlog_fragment_0, @binlog_fragment_1 as 'NULL';
that makes no sense, @binlog_fragment_0 and _1 were set in a separate
client session. You cannot test whether they were cleared or not there,
by looking at the values here
> +
> +# improper syntax error
> +--echo BINLOG number-of-fragments must be exactly two
> +--error ER_PARSE_ERROR
> +BINLOG @binlog_fragment;
> +--error ER_PARSE_ERROR
> +BINLOG @binlog_fragment, @binlog_fragment, @binlog_fragment;
> +
> +# corrupted fragments error check (to the expected error code notice,
> +# the same error code occurs in a similar unfragmented case)
> +SET @binlog_fragment_0='012345';
> +SET @binlog_fragment_1='012345';
> +--error ER_SYNTAX_ERROR
> +BINLOG @binlog_fragment_0, @binlog_fragment_1;
> +
> +# Not existing fragment is not allowed
> +SET @binlog_fragment_0='012345';
> +--error ER_WRONG_TYPE_FOR_VAR
> +BINLOG @binlog_fragment_0, @binlog_fragment_not_exist;
> +
> +--echo # Cleanup
> +--remove_file $MYSQLTEST_VARDIR/tmp/mysqlbinlog.sql
> +DROP TABLE t;
> diff --git a/mysys/mf_iocache2.c b/mysys/mf_iocache2.c
> --- a/mysys/mf_iocache2.c
> +++ b/mysys/mf_iocache2.c
> @@ -22,52 +22,53 @@
> #include <stdarg.h>
> #include <m_ctype.h>
>
> -/*
> - Copy contents of an IO_CACHE to a file.
> -
> - SYNOPSIS
> - my_b_copy_to_file()
> - cache IO_CACHE to copy from
> - file File to copy to
> -
> - DESCRIPTION
> - Copy the contents of the cache to the file. The cache will be
> - re-inited to a read cache and will read from the beginning of the
> - cache.
> -
> - If a failure to write fully occurs, the cache is only copied
> - partially.
> +/**
> + Copy the cache to the file. Copying can be constrained to @c count
> + number of bytes when the parameter is less than SIZE_T_MAX. The
> + cache will be optionally re-inited to a read cache and will read
> + from the beginning of the cache. If a failure to write fully
> + occurs, the cache is only copied partially.
>
> TODO
> - Make this function solid by handling partial reads from the cache
> - in a correct manner: it should be atomic.
> -
> - RETURN VALUE
> - 0 All OK
> - 1 An error occurred
> + Make this function solid by handling partial reads from the cache
> + in a correct manner: it should be atomic.
> +
> + @param cache IO_CACHE to copy from
> + @param file File to copy to
> + @param do_reinit whether to turn the cache to read mode
> + @param count the copied size or the max of the type
> + when the whole cache is to be copied.
> + @return
> + 0 All OK
> + 1 An error occurred
> */
> int
> -my_b_copy_to_file(IO_CACHE *cache, FILE *file)
> +my_b_copy_to_file(IO_CACHE *cache, FILE *file,
> + my_bool do_reinit,
> + size_t count)
> {
> - size_t bytes_in_cache;
> + size_t curr_write, bytes_in_cache;
> DBUG_ENTER("my_b_copy_to_file");
>
> /* Reinit the cache to read from the beginning of the cache */
> - if (reinit_io_cache(cache, READ_CACHE, 0L, FALSE, FALSE))
> + if (do_reinit && reinit_io_cache(cache, READ_CACHE, 0L, FALSE, FALSE))
generally, when there's a function that is always called with a
constant (compile-time) argument, I prefer to split the code
compile-time too, if it isn't too much trouble. In this case it would
mean a new function like
int my_b_copy_all_to_file(IO_CACHE *cache, FILE *file)
{
if (reinit_io_cache(cache, READ_CACHE, 0L, FALSE, FALSE)
return 1;
return my_b_copy_to_file(cache, file, SIZE_T_MAX);
}
and all old code will be changed to use my_b_copy_all_to_file().
Old my_b_copy_to_file() won't need to do reinit_io_cache() anymore and
your code will use it directly.
> DBUG_RETURN(1);
> bytes_in_cache= my_b_bytes_in_cache(cache);
> do
> {
> - if (my_fwrite(file, cache->read_pos, bytes_in_cache,
> + curr_write= MY_MIN(bytes_in_cache, count);
> + if (my_fwrite(file, cache->read_pos, curr_write,
> MYF(MY_WME | MY_NABP)) == (size_t) -1)
> DBUG_RETURN(1);
> - } while ((bytes_in_cache= my_b_fill(cache)));
> +
> + cache->read_pos += curr_write;
> + count -= curr_write;
> + } while (count && (bytes_in_cache= my_b_fill(cache)));
> if(cache->error == -1)
> DBUG_RETURN(1);
> DBUG_RETURN(0);
> }
>
> -
> my_off_t my_b_append_tell(IO_CACHE* info)
> {
> /*
> diff --git a/sql/log_event.cc b/sql/log_event.cc
> index e07b7002398..aeca794f0cd 100644
> --- a/sql/log_event.cc
> +++ b/sql/log_event.cc
> @@ -10474,12 +10488,151 @@ void Rows_log_event::pack_info(Protocol *protocol)
> #endif
>
> #ifdef MYSQL_CLIENT
> +/**
> + Print an event "body" cache to @c file possibly in multiple fragements.
> + Each fragement is optionally per @c do_wrap to procude an SQL statement.
> +
> + @param file a file to print to
> + @param body the "body" IO_CACHE of event
> + @param do_wrap whether to wrap base64-encoded strings with
> + SQL cover.
> + The function signals on any error through setting @c body->error to -1.
> +*/
> +void copy_cache_to_file_wrapped(FILE *file,
> + IO_CACHE *body,
> + bool do_wrap,
> + const char *delimiter)
> +{
> + uint n_frag= 1;
> + const char* before_frag= NULL;
> + char* after_frag= NULL;
> + char* after_last= NULL;
> + /*
> + 2 fragments can always represent near 1GB row-based
> + base64-encoded event as two strings each of size less than
> + max(max_allowed_packet). Greater number of fragments does not
> + save from potential need to tweak (increase) @@max_allowed_packet
> + before to process the fragments. So 2 is safe and enough.
> + */
> + const char fmt_last_frag2[]=
> + "\nBINLOG @binlog_fragment_0, @binlog_fragment_1%s\n";
> + const char fmt_before_frag[]= "\nSET /* ONE_SHOT */ @binlog_fragment_%d ='\n";
this ONE_SHOT is confusing, even if in a comment. Better not to do it :)
> + /*
> + Buffer to hold computed formatted strings according to specifiers.
> + The sizes may depend on an actual fragment number size in terms of decimal
> + signs so its maximum is estimated (not precisely yet safely) below.
> + */
> + char buf[sizeof(fmt_before_frag) + sizeof(fmt_last_frag2)
> + + ((sizeof(n_frag) * 8)/3 + 1) // max of decimal index
> + + sizeof(PRINT_EVENT_INFO::max_delimiter_len) + 3]; // delim, \n and 0
sizeof(max_delimiter_len) ? it's sizeof(uint), right? Did you mean
sizeof(PRINT_EVENT_INFO::delimiter)
or simply
PRINT_EVENT_INFO::max_delimiter_len
without sizeof?
> +
> + if (do_wrap)
> + {
> + after_frag= (char*) my_malloc(sizeof(buf), MYF(MY_WME));
> + sprintf(after_frag, "'%s\n", delimiter);
> + if (my_b_tell(body) > opt_binlog_rows_event_max_encoded_size)
> + n_frag= 2;
> + if (n_frag > 1)
> + {
> + before_frag= fmt_before_frag;
> + after_last= (char*) my_malloc(sizeof(buf), MYF(MY_WME));
> + sprintf(after_last, fmt_last_frag2, (char*) delimiter);
> + }
> + else
> + {
> + before_frag= "\nBINLOG '\n"; // single "fragment"
> + }
> + }
> +
> + size_t total_size= my_b_tell(body), total_written= 0;
> + size_t frag_size= total_size / n_frag + 1, curr_size;
> +
> + if (reinit_io_cache(body, READ_CACHE, 0L, FALSE, FALSE))
> + {
> + body->error= -1;
> + goto err;
> + }
> +
> + for (uint i= 0; i < n_frag; i++, total_written += curr_size)
> + {
> + curr_size= i < n_frag - 1 ? frag_size : total_size - total_written;
> +
> + DBUG_ASSERT(i < n_frag - 1 || curr_size <= frag_size);
> +
> + if (before_frag)
> + {
> + sprintf(buf, before_frag, i);
> + my_fwrite(file, (uchar*) buf, strlen(buf), MYF(MY_WME | MY_NABP));
> + }
> + if (my_b_copy_to_file(body, file, FALSE, curr_size))
> + {
> + body->error= -1;
> + goto err;
> + }
> + if (after_frag)
> + {
> + sprintf(buf, after_frag, NULL);
> + my_fwrite(file, (uchar*) buf, strlen(buf), MYF(MY_WME | MY_NABP));
> + }
> + }
Hmm, dunno. I suspect you can do it three times shorter and five times
easier to read if you wouldn't try to generalize it for a arbitrary
number of fragments with arbitrary prefixes and suffixes. Just
if (my_b_tell(body) < opt_binlog_rows_event_max_encoded_size - margin)
{
my_fprintf(file, "BINLOG '");
my_b_copy_to_file(body, file);
my_fprintf(file, "'%s\n", delimiter);
}
else
{
my_fprintf(file, "SET @binlog_fragment_0='");
my_b_copy_to_file(body, file, opt_binlog_rows_event_max_encoded_size);
my_fprintf(file, "'%s\nSET @binlog_fragment_1='", delimiter);
my_b_copy_to_file(body, file, SIZE_T_MAX);
my_fprintf(file, "'%s\nBINLOG @binlog_fragment_0, @binlog_fragment_1%s\n",
delimiter, delimiter);
}
See?
> +
> + if (after_last)
> + {
> + sprintf(buf, after_last, n_frag);
> + my_fwrite(file, (uchar*) buf, strlen(buf), MYF(MY_WME | MY_NABP));
> + }
> + reinit_io_cache(body, WRITE_CACHE, 0, FALSE, TRUE);
> +
> +err:
> + my_free(after_frag);
> + my_free(after_last);
> +}
> +
> +/**
> + The function invokes base64 encoder to run on the current
> + event string and store the result into two caches.
> + When the event ends the current statement the caches are is copied into
> + the argument file.
> + Copying is also concerned how to wrap the event, specifically to produce
> + a valid SQL syntax.
> + When the encoded data size is within max(MAX_ALLOWED_PACKET)
> + a regular BINLOG query is composed. Otherwise it is build as fragmented
> +
> + SET @binlog_fragment_0='...';
> + SET @binlog_fragment_1='...';
> + BINLOG DEFRAGMENT(@binlog_fragment_0, @binlog_fragment_1);
> +
> + where fragments are represented by a sequence of "indexed" user
> + variables.
> + Two more statements are composed as well
> +
> + SET @binlog_fragment_0=NULL;
> + SET @binlog_fragment_1=NULL;
> +
> + to promptly release memory.
No, they aren't
> +
> + NOTE.
@note
> + If any changes made don't forget to duplicate them to
> + Old_rows_log_event as long as it's supported.
> +
> + @param file pointer to IO_CACHE
> + @param print_event_info pointer to print_event_info specializing
> + what out of and how to print the event
> + @param name the name of a table that the event operates on
> +
> + The function signals on any error of cache access through setting
> + that cache's @c error to -1.
> +*/
> void Rows_log_event::print_helper(FILE *file,
> PRINT_EVENT_INFO *print_event_info,
> char const *const name)
> {
> IO_CACHE *const head= &print_event_info->head_cache;
> IO_CACHE *const body= &print_event_info->body_cache;
> + bool do_print_encoded=
> + print_event_info->base64_output_mode != BASE64_OUTPUT_DECODE_ROWS &&
> + !print_event_info->short_form;
> +
> if (!print_event_info->short_form)
> {
> bool const last_stmt_event= get_flags(STMT_END_F);
> diff --git a/sql/log_event.h b/sql/log_event.h
> index 90900f63533..28277e659d2 100644
> --- a/sql/log_event.h
> +++ b/sql/log_event.h
> @@ -749,6 +749,7 @@ typedef struct st_print_event_info
> that was printed. We cache these so that we don't have to print
> them if they are unchanged.
> */
> + static const uint max_delimiter_len= 16;
why did you introduce this max_delimiter_len, if all you use
is sizeof(delimiter) anyway? (and even that is not needed)
> // TODO: have the last catalog here ??
> char db[FN_REFLEN+1]; // TODO: make this a LEX_STRING when thd->db is
> bool flags2_inited;
> @@ -798,7 +799,7 @@ typedef struct st_print_event_info
> bool printed_fd_event;
> my_off_t hexdump_from;
> uint8 common_header_len;
> - char delimiter[16];
> + char delimiter[max_delimiter_len];
>
> uint verbose;
> table_mapping m_table_map;
> diff --git a/sql/sql_binlog.cc b/sql/sql_binlog.cc
> index 91cf038907e..b4e3342d8f3 100644
> --- a/sql/sql_binlog.cc
> +++ b/sql/sql_binlog.cc
> @@ -28,6 +28,70 @@
> // START_EVENT_V3,
> // Log_event_type,
> // Log_event
> +
> +/**
> + Copy fragments into the standard placeholder thd->lex->comment.str.
> +
> + Compute the size of the (still) encoded total,
> + allocate and then copy fragments one after another.
> + The size can exceed max(max_allowed_packet) which is not a
> + problem as no String instance is created off this char array.
> +
> + @param thd THD handle
> + @return
> + 0 at success,
> + -1 otherwise.
> +*/
> +int binlog_defragment(THD *thd)
> +{
> + user_var_entry *entry[2];
> + LEX_STRING name[2]= { thd->lex->comment, thd->lex->ident };
> +
> + /* compute the total size */
> + thd->lex->comment.str= NULL;
> + thd->lex->comment.length= 0;
> + for (uint k= 0; k < 2; k++)
> + {
> + entry[k]=
> + (user_var_entry*) my_hash_search(&thd->user_vars, (uchar*) name[k].str,
> + name[k].length);
> + if (!entry[k] || entry[k]->type != STRING_RESULT)
> + {
> + my_error(ER_WRONG_TYPE_FOR_VAR, MYF(0), name[k].str);
> + return -1;
> + }
> + thd->lex->comment.length += entry[k]->length;
> + }
> +
> + thd->lex->comment.str= // to be freed by the caller
> + (char *) my_malloc(thd->lex->comment.length, MYF(MY_WME));
> + if (!thd->lex->comment.str)
> + {
> + my_error(ER_OUTOFMEMORY, MYF(ME_FATALERROR), 1);
> + return -1;
> + }
> +
> + /* fragments are merged into allocated buf while the user var:s get reset */
> + size_t gathered_length= 0;
> + for (uint k=0; k < 2; k++)
> + {
> + memcpy(thd->lex->comment.str + gathered_length, entry[k]->value, entry[k]->length);
> + gathered_length += entry[k]->length;
> + if (update_hash(entry[k], true, NULL, 0, STRING_RESULT, &my_charset_bin, 0))
> + {
> + my_printf_error(ER_WRONG_TYPE_FOR_VAR,
> + "%s: BINLOG fragment user "
> + "variable '%s' could not be unset", MYF(0),
> + ER_THD(thd, ER_WRONG_TYPE_FOR_VAR), entry[k]->value);
> + }
I don't see how update_hash(entry[k], true, ...) can ever fail, so
there's no need to pretend that it can.
> + }
> +
> + DBUG_ASSERT(gathered_length == thd->lex->comment.length);
> +
> + return 0;
> +}
> +
> +
> /**
> Execute a BINLOG statement.
>
> @@ -119,6 +175,23 @@ void mysql_client_binlog_statement(THD* thd)
> rli->sql_driver_thd= thd;
> rli->no_storage= TRUE;
>
> + if (unlikely(is_fragmented= thd->lex->comment.str && thd->lex->ident.str))
> + if (binlog_defragment(thd))
> + goto end;
> +
> + if (!(coded_len= thd->lex->comment.length))
> + {
> + my_error(ER_SYNTAX_ERROR, MYF(0));
> + goto end;
> + }
> +
> + decoded_len= base64_needed_decoded_length(coded_len);
> + if (!(buf= (char *) my_malloc(decoded_len, MYF(MY_WME))))
> + {
> + my_error(ER_OUTOFMEMORY, MYF(ME_FATALERROR), 1);
> + goto end;
> + }
> +
Technically, it should be possible to decode base64 in-place and avoid
allocating a second 3GB buffer. But let's not do it in this MDEV :)
> for (char const *strptr= thd->lex->comment.str ;
> strptr < thd->lex->comment.str + thd->lex->comment.length ; )
> {
Regards,
Sergei
Chief Architect MariaDB
and security(a)mariadb.org
2
2
[Maria-developers] Do we support compilers that support C++11 but not thread_local?
by Sergey Petrunia 27 Nov '18
by Sergey Petrunia 27 Nov '18
27 Nov '18
Hi,
I'm writing this in connection with MyRocks Storage Engine and
https://github.com/facebook/mysql-5.6/issues/904
C++ 11 includes "thread_local" specifier :
https://en.cppreference.com/w/cpp/keyword/thread_local
GNU also has __thread, which was available before C++11.
RocksDB (and so MyRocks) uses both __thread and thread_local. thread_local is
part of the standard, so they might switch to thread_local.
The question: will it cause any issues for us?
In other words, do we have compilers/environments that
- claim to support C++11 (so that we attempt to compile MyRocks)
- but do not support "thread_local"
?
For Windows, RocksDB does this in port/win/port_win.h:
#ifndef __thread
#define __thread __declspec(thread)
#endif
but this page page about Visual Studio 2015)
https://msdn.microsoft.com/en-us/library/6yh4a9k1.aspx
says that
C++11: The thread_local storage class specifier is the recommended
way to specify thread-local storage for objects and class members.
I assume this means it's ok to use thread_local on Windows?
RocksDB source code mentions one problematic case:
// However, thread_local is not supported in all compilers that accept -std=c++11
// (e.g., eg Mac with XCode < 8. XCode 8+ supports thread_local).
As far as I understand this is not relevant for us?
Anyone aware of other such cases?
BR
Sergei
--
Sergei Petrunia, Software Developer
MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog
3
2
[Maria-developers] MDEV-17706 Review server changes introduced by Galera 4 wsrep patch
by Alexander Barkov 27 Nov '18
by Alexander Barkov 27 Nov '18
27 Nov '18
Hi Jan,
I checked changes to the /sql directory so far.
Sorry, I'm completely out of this topic.
So I have only general suggestions.
1. General code structure.
There are a lot of new
#ifdef WITH_WRESP
do_something_with_wresp();
#else
do_something_without_wresp();
#endif
This make the server code harder to follow.
I'd prefer a more modular way.
Would it be possible to introduce some abstract class with virtual
methods? One instantiable class would implement behavior for wresp,
and other class would implement behavior for "without wresp".
Instead #ifdefs, we would do:
behavour->do_something();
WRESP support could be then turned into a real dynamic loadable plugin,
the server would be able to load it independently from the server
compilation flags.
Structures that need extra members, for example, "struct THD_TRANS"
could be implemented like this:
struct THD_TRANS_WRESP: public THD_TRANS
{
public:
int rw_ha_count;
void reset()
{
THD_TRANS::reset();
rw_ha_count= 0;
}
};
The virtual interface would do either "new THD_TRANS()" or "new
THD_TRANS_WRESP()",
depending on the WSREP support availability.
2. Please fix coding style in some places:
+ bool fix_length_and_dec()
+ {
+ max_length = WSREP_GTID_STR_LEN;
+ maybe_null = true;
+ return FALSE;
+ }
It should:
+ bool fix_length_and_dec()
+ {
+ max_length= WSREP_GTID_STR_LEN;
+ maybe_null= true;
+ return false;
+ }
There are more places that do not follow the coding style.
Please check.
3. Why do we need a separate directory /wresp with pure C files?
Are they shared by the server and by the clients? If not,
why not to put this code together with other wsrep related files?
4. Some code was commented. Why not just remove it?
Example:
+#ifdef TODO
DBUG_SLOW_ASSERT(global_status_var.global_memory_used == 0);
+#endif
Example:
#ifdef WITH_WSREP
- if (thd->wsrep_exec_mode == LOCAL_STATE)
+ // if (thd->wsrep_exec_mode == LOCAL_STATE)
WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
#endif
Example:
+#include "log_event.h"
+// #include "binlog.h"
+
Example:
+int wsrep_write_dummy_event_low(THD *thd, const char *msg)
+{
+ ::abort();
+ return 0;
+#if 0
+ int ret= 0;
.... a lot of dead code
+ return ret;
+#endif
+}
Example:
+int wsrep_write_dummy_event(THD *orig_thd, const char *msg)
+{
+ return 0;
+#if 0
... a lot of dead code
+#endif
+}
Example:
+//#ifdef GTID_SUPPORT
+void wsrep_init_sidno(const wsrep::id& uuid)
.....
+//#endif /* GTID_SUPPORT */
Example:
+static std::string wsrep_server_id()
+{
+#ifdef WSREP_TODO
+ std::stringstream ss;
+ ss << server_id;
+ return(ss.str());
+#endif
+ /* us
Example:
+ // enum wsrep::provider::status ret = wsrep_sync_wait_upto(thd,
NULL, -1);
+ if (thd->wsrep_cs().sync_wait(-1))
Example:
+#if UNUSED /* 323f269d4099 (Jan Lindström 2018-07-19) */
static const char* wsrep_get_query_or_msg(const THD* thd)
{
sw
... a lot of unused code
}
+#endif //UNUSED
Example:
+#if 0
+ /* todo */
+ wsrep_xid_init(&thd->wsrep_xid,
+ thd->wsrep_trx_meta.gtid.uuid,
+ thd->wsrep_trx_meta.gtid.seqno);
+ if (tc_log) tc_log->commit(thd, true);
+#endif /* 0 */
Example:
+void* start_wsrep_THD(void *arg)
+{
+ THD *thd;
+ //wsrep_thd_processor_fun processor= (wsrep_thd_processor_fun)arg;
Example:
+//wsrep_t *get_wsrep()
+//{
+// return wsrep;
+//}
Example:
+//int wsrep_show_status(THD *thd, SHOW_VAR *var, char *buff,
+// enum enum_var_type scope);
There are many other pieces of the dead code.
5. Why this change?
--- a/sql/sql_basic_types.h
+++ b/sql/sql_basic_types.h
@@ -20,6 +20,8 @@
#ifndef SQL_TYPES_INCLUDED
#define SQL_TYPES_INCLUDED
+#include <my_global.h>
+
6. sql/wsrep_mysqld.cc: something is wrong with indentation:
case 1:
case 2:
case 3:
+ case 4:
7. According to our coding style, multi-line comments like this:
+ // Note: We can't call THD destructor without crashing
+ // if plugins have not been initialized. However, in most of the
+ // cases this means that pre SE initialization SST failed and
+ // we are going to exit anyway.
should be:
/*
Note: We can't call THD destructor without crashing
if plugins have not been initialized. However, in most of the
cases this means that pre SE initialization SST failed and
*/
we are going to exit anyway.
8. This looks strange:
+const char wsrep_defaults_group_suffix[256] = {0};
Why do you need an array of 256 zero bytes in the const static area?
9. You remove a lot of code in one place and add new code in other places.
It seems you move some old code to new locations.
For example, this code was removed from sql/wsrep_thd.cc.diff:
-void wsrep_replay_transaction(THD *thd)
I can see its pieces in wsrep_high_priority_service.h,
in the class Wsrep_replayer_service.
Can refactoring changes like this be done in a separate patch?
It would make reviewing easier.
10. Please don't use redundant current_thd calls when a THD pointer
is already available in a local variable or a function parameter.
Example:
@@ -2457,7 +2460,7 @@ void Item_func_rand::seed_random(Item *arg)
THD *thd= current_thd;
if (WSREP(thd))
{
- if (thd->wsrep_exec_mode==REPL_RECV)
+ if (wsrep_thd_is_applying(current_thd))
11. When does current_thd return NULL?
+ if (current_thd)
+ {
+ /* Undocking the thread specific data. */
+ my_pthread_setspecific_ptr(THR_THD, NULL);
+ /
12. Perhaps ErrConv family classes could be used in some cases:
This:
+ std::ostringstream os;
+ os << view.state_id().id();
+ wsrep_update_cluster_state_uuid(os.str().c_str());
can probably be shortened to something like this:
wsrep_update_cluster_state_uuid(ErrConvInteger(view.state_id().id()).ptr());
13. There's something wrong with semicolons here:
+ /* DTRACE begin */
+ MYSQL_QUERY_START(rawbuf, thd->thread_id,
+ (char *)(thd->db.str ? : thd->db.str : "unknown"),
+ &thd->security_ctx->priv_user[0],
+ (char *) thd->security_ctx->host_or_ip);
Notice, "?" immediately followed by ":".
This should not compile. Is it an optionally compiled code?
Please make sure to compile in all possible build option
combinations, to check all optional code.
2
1
[Maria-developers] Fwd: Re: [Commits] 68b7e231cb3: MDEV-17255: New optimizer defaults and ANALYZE TABLE
by Sergey Petrunia 27 Nov '18
by Sergey Petrunia 27 Nov '18
27 Nov '18
----- Forwarded message from Igor Babaev <igor(a)mariadb.com> -----
Date: Sun, 25 Nov 2018 11:45:34 -0800
From: Igor Babaev <igor(a)mariadb.com>
To: Sergey Petrunia <sergey(a)mariadb.com>
Subject: Fwd: Re: [Commits] 68b7e231cb3: MDEV-17255: New optimizer defaults and ANALYZE TABLE
Hi Sergey,
PREFERABLY_FOR_READS
sounds confusing, because the stat tables still are used for UPDATE/DELETE.
Maybe
PREFERABLY_FOR_QUERIES?
Besides, don't forget that you need
COMPLEMENTARY_FOR_READS[QUERIES]
as well.
Regards,
Igor.
-------- Forwarded Message --------
Subject: Re: [Commits] 68b7e231cb3: MDEV-17255: New optimizer defaults and
ANALYZE TABLE
Date: Sat, 24 Nov 2018 21:47:57 +0300
From: Sergey Petrunia <sergey(a)mariadb.com>
Reply-To: maria-developers(a)lists.launchpad.net
To: Varun <varunraiko1803(a)gmail.com>, maria-developers(a)lists.launchpad.net
CC: commits(a)mariadb.org
Hi Varun,
On Fri, Nov 16, 2018 at 11:48:59PM +0530, Varun wrote:
> revision-id: 68b7e231cb3a1cdfbd968c2cbf72687b29b6d2da (mariadb-10.3.6-209-g68b7e231cb3)
> parent(s): b9a9055793ab8b9a50200e2f55602463627dedd3
Does the above say the patch is going into 10.3?
The MDEV has fixVersion=10.4 and the parent revision is also in 10.4.
So I assume the target version is 10.4?
> author: Varun Gupta
> committer: Varun Gupta
> timestamp: 2018-11-16 23:48:11 +0530
> message:
>
> MDEV-17255: New optimizer defaults and ANALYZE TABLE
>
> Added another value for the use_stat_tables system variable
Please mention in the commit comment:
- the name of the new value
- the fact that the new value is now the default.
> ---
> sql/opt_range.cc | 2 +-
> sql/sql_admin.cc | 3 ++-
> sql/sql_statistics.h | 1 +
> sql/sys_vars.cc | 4 ++--
> 4 files changed, 6 insertions(+), 4 deletions(-)
Please run the testsuite. I did the 'main' testsuite and:
- main.mysqld--help, main.stat_tables_disabled - these need updates
- a few other tests need to be updated as now opening any table will now cause
the server to attept to read EITS stats for it.
Please also add test coverage for this particular MDEV. This should be a
testcase that shows that
- with PREFERABLY, ANALYZE will collect EITS stats, SELECT will use them.
- with PREFERABLY_FOR_READS :
-- ANALYZE will not collect EITS stats
-- ANALYZE .. PERSISTENT FOR ... will update the EITS stats.
-- SELECT will use them.
.
>
> diff --git a/sql/opt_range.cc b/sql/opt_range.cc
> index d6db365a8a2..f501a5ae085 100644
> --- a/sql/opt_range.cc
> +++ b/sql/opt_range.cc
> @@ -3158,7 +3158,7 @@ bool calculate_cond_selectivity_for_table(THD *thd, TABLE *table, Item **cond)
>
> if (thd->variables.optimizer_use_condition_selectivity > 2 &&
> !bitmap_is_clear_all(used_fields) &&
> - thd->variables.use_stat_tables > 0)
> + get_use_stat_tables_mode(thd) > NEVER)
> {
> PARAM param;
> MEM_ROOT alloc;
> diff --git a/sql/sql_admin.cc b/sql/sql_admin.cc
> index d0d959de8f9..b5c732737b7 100644
> --- a/sql/sql_admin.cc
> +++ b/sql/sql_admin.cc
> @@ -767,7 +767,8 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables,
> }
> collect_eis=
> (table->table->s->table_category == TABLE_CATEGORY_USER &&
> - (get_use_stat_tables_mode(thd) > NEVER ||
> + ((get_use_stat_tables_mode(thd) > NEVER &&
> + get_use_stat_tables_mode(thd) < PREFERABLY_FOR_READS) ||
> lex->with_persistent_for_clause));
>
>
> diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
> index 39cddf95188..a942c05be09 100644
> --- a/sql/sql_statistics.h
> +++ b/sql/sql_statistics.h
> @@ -22,6 +22,7 @@ enum enum_use_stat_tables_mode
> NEVER,
> COMPLEMENTARY,
> PREFERABLY,
Please add a comment describing the new value.
> + PREFERABLY_FOR_READS,
> } Use_stat_tables_mode;
>
> typedef
> diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
> index 92c7d329bb9..3652d728fc1 100644
> --- a/sql/sys_vars.cc
> +++ b/sql/sys_vars.cc
> @@ -5841,12 +5841,12 @@ static Sys_var_ulong Sys_progress_report_time(
> VALID_RANGE(0, UINT_MAX), DEFAULT(5), BLOCK_SIZE(1));
>
> const char *use_stat_tables_modes[] =
> - {"NEVER", "COMPLEMENTARY", "PREFERABLY", 0};
> + {"NEVER", "COMPLEMENTARY", "PREFERABLY", "PREFERABLY_FOR_READS", 0};
> static Sys_var_enum Sys_optimizer_use_stat_tables(
> "use_stat_tables",
> "Specifies how to use system statistics tables",
> SESSION_VAR(use_stat_tables), CMD_LINE(REQUIRED_ARG),
> - use_stat_tables_modes, DEFAULT(0));
> + use_stat_tables_modes, DEFAULT(3));
>
> static Sys_var_ulong Sys_histogram_size(
> "histogram_size",
BR
Sergei
--
Sergei Petrunia, Software Developer
MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog
_______________________________________________
commits mailing list
commits(a)mariadb.org
https://lists.askmonty.org/cgi-bin/mailman/listinfo/commits
----- End forwarded message -----
--
BR
Sergei
--
Sergei Petrunia, Software Developer
MariaDB Corporation | Skype: sergefp | Blog: http://s.petrunia.net/blog
1
0
Re: [Maria-developers] [Maria-discuss] Pipeline-stype slave parallel applier
by Kristian Nielsen 26 Nov '18
by Kristian Nielsen 26 Nov '18
26 Nov '18
andrei.elkin(a)pp.inet.fi writes:
> Your comments, thoughts and critical notes are most welcome!
Thanks for the interesting idea and detailed description, Andrei! I have
written some initial comments inline, below:
> entering the binlog group (ordered) commit module. They can end up
> into binlog as A,B or B,A, and regardless of that (Gtid actually) order
> also be acknowledged to the user in either way.
>
> The latter fact infers that slave does not have to commit the group
> sub-transaction/branches in the GTID order. It could be done in any
> order including no order at all like to do so as one epoch commit.
However, MariaDB GTID requires that transactions be binlogged in the same
order on the slave as on the master. Otherwise it is not possible to move a
lower-level GTID slave from one master to the other, as the slave position
only includes a single GTID in MariaDB.
> conservative scheduler offers a decent parallelization method still suffering from
> the aforementioned uneven branch's sizes and inter-thread
> communications caused by ordering. The optimistic scheduler remedies to some level
> but not fully (an optimistically executed group may be unbalanced too)
> and even conceding some risk of performance degradation (rollback is costly).
Jean François Gagné's comprehensive benchmarking of parallel replication
actually has some very interesting data on this. He ran a real Booking.com
workload with literally thousands of worker threads and had a huge rollback
ratio (I remember it as up around 40%), and still saw speedup and good
scalabitily. Apparently rollback isn't necessarily so costly; maybe because
replication servers often have spare cores idling anyway, and because the
aborted transaction acts as a pre-fetcher, improving I/O parallelism.
But the fact that thousands of threads still improved speed suggests exactly
that there was a problem of uneven transaction size - a few large
transactions spread far apart, so lots of transaction lookahead was needed.
I do not know if those transactions were single large queries or contained
many individual statements, but in the latter case this is exactly what this
idea could help with!
> There is an idea reminiscent to pipelining to handle the epoch (the
> master binlog group) as a single transaction but in parallel. That is
> to execute its statements by multiple workers, this time scheduling
> them fairly, and "atomically" commit their works.
So this is a very interesting idea. Once the transactions are recorded in
the binlog, we have a serial stream of statements that will reproduce the
original data on the master. And they will do this regardless of if we
remove some of the COMMITs/BEGINs - or if we add some. (I wonder if there
are any exceptions to this, I can't think of any off the top of my head).
So instead of trying to parallelise transactions, just do all the individual
statements in parallel, disregarding transaction boundaries (with some
restrictions to ensure consistency, as you describe below).
I wonder how much this will benefit real-life workloads. There seems to be a
lot of potential. For some workloads it will not help - if most transactions
are single-statement, or if the multi-statement transactions have
dependencies between the statements. But that doesn't seem likely to be the
typical scenario.
But why tie this to the master binlog group? It seems to fit perfectly to
the optimistic parallel replication mode, where different transactions are
run in parallel speculatively, and any conflicts are detected and resolved.
In fact, it seems the exact same mechanism will work to detect conflicts
between the more fine-grained per-statement parallelisation.
The conservative parallle replication I consider obsolete. It causes a lot
of hassle for the user to try to get good-sized group commits on the master
to ensure parallel replication opportunities on the slave. And it doesn't
even ensure consistency; there are corner cases (in InnoDB and elsewhere)
that can cause conflicts to appear anyway on the slave, so the
rollback+retry mechanism is needed anyway for conservative, just like for
optimistic.
> To the fairness part, workers get assigned by a statement at a time
> from the epoch transaction(sub-transaction BEGIN, COMMIT removed).
> Say 'E' an epoch consists of 's' of sub-transaction
>
> E := { T_1, T_2, ... T_s }
>
> For each m from 1 to s sub-transactions T_m gets scheduled to some worker
>
> W_1 W_2 ... W_k
>
> | | |
> V V V
>
> T_m := { e_1, e_2, ... e_l }
>
> Hereinafter I use the TeX math notation, '^' - a superscript attached to
> the event to designate its parent transaction sequence number (Gtid
> seq_no), '_' - a subscript to enumerate an object within its
> compound object.
>
> e_i stands for T-ransation's statements, W_t:s are workers.
> Effectively the epoch breaks into modified branches executed by
> workers on one to one basic:
>
> E := { T'_1, T'_2, ... T'_k }
>
> here 'k' is the number of workers engaged, T' indicates the modified
> transaction.
> The branches are executed until they are ready to prepare which event
> is triggered by scheduling of the last statement of the last original
> sub-transaction.
>
> It's clear that sizes of the modified branches are even this time.
Well... each worker has exactly one statement at a time, but each statement
can be very different in execution cost. But more even than
transaction-based scheduling, certainly.
> The last statement worker coordinates 2pc.
>
> Thinking more technically workers can consume/pull from
> the current epoch presented as a queue which is pushed into
> by statement producer.
>
> Here is a possible state of the queue when m:th statement of T_n is
> about to be pushed:
>
> e^n_k ->
>
>
> [ e^n_m-1, ..., e^n_2, e^n_1; ...; e^1_l1, ... e^1_2, e^1_1 ]
>
> ...----- T_n ------------| T_n-1,...T_2 |---------- T_1 ---------|
>
> A pull by consumer at this point would return e^1_1.
How do you imagine integrating this in the current parallel replication
scheduling algorithm?
In the current algorithm, the producer (the SQL thread) assigns work to
worker threads in a round-robin way. There is a lot of care taken to
minimise contention on a single global queue-lock or similar, and I think
that is an important reason why Jean François' tests were able to scale so
well to many thousand worker threads.
I wonder if the existing scheduling could not be used directly for this idea
also - but it needs to be extended of course to schedule individual
statements rather than whole transactions, with some way to let worker
threads coordinate transaction start/end between them.
What are your thoughts on this?
> Very likely that by the epoch prepare event *all* the branches have been already
> ready for prepare().
Hm, any particular reason you think so? I would imagine the opposite, that
individual statements will be scheduled quickly, and it will be quite random
in which order they happen to become ready in?
> This pipelining parallelization method can work for the single "large"
> transaction in the binlog group and also could drag into the
> parallelizable input transactions from later groups if we additionally
> create master side dependency tracking (think of mysql logical
> timestamp method).
> Also notice that the optimisitic scheduler is orthogonal to this
> method so the two can be combined.
>
> Consistency concern
> -------------------
>
> The epoch parallel execution (scheduling) must respect
> intra-transaction statement dependencies (e.g FIFO execution order of
> operations over the same object [record, table]). There is no
> inter-transaction dependencies, at least in theory.
In fact, there do exist such inter-transaction dependencies in a number of
corner cases, at least if we want to enforce the same commit order on slave
as on master, which is required for MariaDB GTID. But that is already
handled in current code, by detecting the dependency inside the storage
engine (the thd_rpl_deadlock_check() mechanism).
I wonder if the exact same thd_rpl_deadlock_check() mechanism would not work
as well for this per-statement scheduling idea? It would be interesting to
see a proof-of-concept based on this...
> The notion of a leader, the two-phase-commits leader, remains roughly
> the same as in the existing conservative scheduler. This time more
> than initiating commits of the epoch branches it also takes care to
> advance the slave gtid execution status accordingly.
> It could be implemented as the very last statement of its own T' branch.
Hm, right, interesting, agree that this seems a good fit.
In fact, if the existing mechanism can be slightly extended, maybe it can
already do much of what is needed to ensure consistency. Eg. suppose we have
3 original transactions each with two statements:
e^1_1 e^1_2 e^2_1 e^2_2 e^3_1 e^3_2
Suppose the first 3 of these are ready to commit at some point:
e^1_1 e^1_2 e^2_1
The leader can check that it doesn't try to commit only part of an original
transaction. In this case it sees that e^2_1 is missing e_2_2, and it can
commit only the first part and leave the partial transaction for a following
group commit:
e^2_1
This way, we already ensure that a crash will not leave us with an
inconsistent storage engine state on the slave. Because we never commit part
of an original master transaction - all parts of it are always
group-committed together. Seems promising, though that code is also heavily
optimised for scalability, so will require some case.
Hm, one case needs to be handled though - when there are more statements in
one original master transaction, than there are worker threads - otherwise
we end up with a deadlock.
> It's clear that causal dependencies of an user write and a read (to
> find the effect of that write) can be tracked by WAIT_FOR_GTID() as
> currently.
>
> A plain two-phase commit does admit possibility to catch on slave combinations
> of the old (pre-epoch) and new version (post-epoch) or some objects of
> the same transaction if the user hits with non-locking SELECT in a
> "tiny" time window between commit():s of two branches
> like in the following example.
>
> Let t1,t2 some tables and the binlog group consists of just one
> transaction
>
> T_1 := { update(t1), update(t2) }
>
> Let on slave it's modified into two parallel ones:
>
> T'_1 := { update(t1) } || T'2 := { update(t2) }
>
> After both respond OK to 2pc prepare() and then the leader initiates
> commit() where could be 4 possible result sets to the contemporary SELECT
>
> SET @@session.tx_isolation='REPEATABLE-READ';
> SELECT * from t1 as t_1, t2 as t_2;
> => ...
Well, due to the enforced commit order, it will not be possible to see the
case where update(t2) is committed but update(t1) is not.
In fact, if the group commit is extended as suggested above (to only
group-commit the statements of an original transaction together), then I
think it is possible for the user to avoid this problem entirely using
START TRANSACTION WITH CONSISTENT SNAPSHOT
In MariaDB, this ensures a consistent read view against 2pc transactions
(reader will always see all of the 2pc commit or none of it). And IIRC, it
works by holding the same lock (LOCK_commit_ordered) that is used by the
group commit leader. So if the user has some special case where this minor
read-inconsistency is sensitive, the START TRANSACTION WITH CONSISTENT
SNAPSHOT mechanism can be used to avoid it completely.
> While this is perhaps more than imperfect, I believe it may be
> relevant only to remote corner cases, but more importantly, we still can
> refine that doing some *adjustments* in the Engine studied briefly
> myself of what would be a sub-transaction in Innodb,
> "ideologically" endorsed and guided by Marko Makela.
> The idea is to hold the branch's locks even after the branch commits,
> and delegate their release to the leader.
>
> Recoverability
> --------------
>
> When slave is configured to log in the binary log
> we must make sure to rollback any prepared
> epoch branches at the server recovery unless the last branch (of the
> leader) is also there, prepared.
>
> When it does not binlog the problem is that the modified branch's binlog
> images can't be just flushed one after another especially
> if the user specifies to log on the slave uniformly with the master
> that is to preserve the original sub-transaction structures and their
> boundaries.
"if the user specified" - but this is always a requirement in MariaDB,
right? Otherwise GTID order is broken?
> In such case there are several multiple option, here are some:
>
> - use relay-logged image of original sub-transactions (may be limited
> to the master and slave version combination though;)
I did not follow what this means - can you elaborate?
> - reconstruct original sub-transaction's binlog images. That is
> binlog events "belonging" to an original sub-transaction will be
> cached cooperatively by workers that took share of the
> original sub-transaction execution.
>
> Optionally at least the caching could be set to allow out
> of order while the statements are parallelizable, like in the above
> T1(t1,t2) example. Otherwise the ordering will be preserved through
> implementing the shared cache as as a queue with an interface
> to insert an item into the middle.
Isn't it possible to do the binlog caching locally, each worker caching just
the binlogging for its own statement? If we make sure they group-commit
together, they will be written in sequence into the binlog, so they can just
write each their part one after the other?
> - exotics like to create the slave side "merge" GTID which would
> embed the original "commit". (The quotes here to allude to the Git's
> way).
>
> While it's a serious subject I am leaving out mariabackup for this moment.
> I believe it will be feasible to adjust its logics to
> account as prepared only such transactions that that
> are prepared in all their branches.
>
> More about pipeline style optimization
> --------------------------------------
>
> As mentioned workers can be made self-serving to assign events for
> execution at once when become available. That must turn to think of
> producer-consumer model, potentially lock-free queues, dynamical # of
> workers to stay in balance with the receiver's pace etc.
So are you planning to implement a completely new parallel replication
scheduler, rather than build on the existing one?
> I think *eventually* we will need to assign the producer role to
> the IO (receiver) thread, and convert the current SQL (driver) thread
> into (relay-) logger (we still may have to have such logger for
> the semisync replication, the relay logger can be opted out
> otherwise).
> Here is a picture
>
> IO_thread.push()
> | +---- worker.pull()
> V V
> e^s_i -> [ e^s_i-1, ... e^s_1 }, T_s-1, ... ]
> ^
> |
> SQL.pull()
>
> where on the right hand side the two consumers handle
> a transaction and when both of them have done with it
> (a worker has read its last event and the logger written
> it to the relay-log) the query element becomes
> garbage-collectable.
So overall, this was an interesting idea to see! I never thought before of
doing parallelisation beyond the original transaction boundaries from the
master. But now that you have described it, it actually seems quite a
natural extension of the optimistic parallel replication, and it fits very
well with the fundamental principle in MariaDB replication of preserving
strict GTID ordering, which might work to handle much of the consistency
requirements.
It would be quite interesting to see this in practice on real workloads and
with benchmarks - I could imagine very large improvements for some
workloads. Though I suppose you shouldn't underestimate the work required to
complete something like this in production quality (the devil is in the
detail, once you get to nasty things like temporary tables and
non-transactional engines and other horrors that are part of the
MySQL/MariaDB ecosystem).
Hope this helps,
- Kristian.
2
7
[Maria-developers] Documenting is needed: MDEV-16991 Rounding vs truncation for TIME, DATETIME, TIMESTAMP
by Alexander Barkov 26 Nov '18
by Alexander Barkov 26 Nov '18
26 Nov '18
Hi Kenneth, all,
I've pushed MDEV-16991.
Can you please document this new feature.
Thanks!
2
3
Hi Alexey,
Thanks for your review.
On 11/26/2018 12:29 AM, Alexey Botchkov wrote:
> Ok, fine with leaving sql_base_types.h and the MYSQL_TIME_STATUS as it is.
>
> The last comment then:
> Item_func_trt_id::val_int()
> Item_interval_DDhhmmssff_typecast::val_str()
> there you get the 'current_thd' which is considered to be slow.
> Code looks like
> THD *thd= current_thd;
> Datetime::Options opt(TIME_CONV_NONE, thd);
> if (args[0]->get_date(thd, &commit_ts, opt))
> I think it makes sence to add the Item_func_trt_id::m_opt and fill it in
> ::fix_fields().
Unfortunately this does not help much in these two places exactly.
We need to pass thd to get_date() anyway.
Methods like val_int(), val_str(), etc, should be fixed eventually to
get an extra THD* parameter.
But I checked the entiner patch again and fixed a few other places
where some current_thd calls were redundant.
Thanks!
>
> Ok with me to push if you fix it like that.
>
> One extra idea though. We can get rid of current_thd
> in set_date_length() too -
> add the THD * parameter there and call in in ::fix_fields() instead of
> ::fix_length_and_dec().
> Since it doesn't seem critical, i'm fine if you leave it as it is :)
Alas, fix_length_and_dec() does not get THD* as an argument.
Only fix_fields() does.
fix_length_and_dec() should also be fixed to get THD*.
Btw, perhaps 10.4 is a good timing for this.
>
> Best regards.
> HF
>
>
>
> On Sun, Nov 25, 2018 at 9:42 AM Alexander Barkov <bar(a)mariadb.com
> <mailto:bar@mariadb.com>> wrote:
>
> Hi Alexey,
>
>
> On 11/25/2018 09:05 AM, Alexander Barkov wrote:
> >>
> >> What do you think if we do this:
> >> - add THD* memver to the MYSQL_TIME_STATUS structure,
> >> so we can get rid of the THD * parameter to many functions?
> >> - send one MYSQL_TIME_STATUS* to the add_nanoseconds()
> >> instead of three {thd, &status->warnings, and
> status->nanoseconds} ?
> >
> > Changing all functions that accept both &warn and nsec to
> > accept a pointer to MYSQL_TIME_STATUS instead, like this:
> >
> > < xxxx(thd, &status->warnings, and status->nanoseconds);
> >> xxxx(thd, &status);
> >
> > is a very good idea.
> >
> >
> > I'd avoid mix "thd" to MYSQL_TIME_STATUS though.
> > It's a pure C structure.
> >
> > Does it sound OK?
> >
>
> I tried this, but this does not save anything.
> See the attached patch.
>
> git diff --stat
> sql/sql_time.cc | 6 +++---
> sql/sql_type.cc | 2 +-
> sql/sql_type.h | 8 ++++++++
> 3 files changed, 12 insertions(+), 4 deletions(-)
>
>
> In all other places "*warn" and "nsec" come from different
> places (not from the same MYSQL_TIME_STATUS).
>
>
> Btw, MYSQL_TIME_STATUS is actually an interface to low level
> pure C conversion functions like str_to_xxx() and number_to_xxx().
> Let's not propagate it all around the code.
>
> Can we leave the relevant code as is?
>
> Thanks.
>
1
0
Re: [Maria-developers] [Commits] 8bfb140d5dc: Move deletion of old GTID rows to slave background thread
by Kristian Nielsen 25 Nov '18
by Kristian Nielsen 25 Nov '18
25 Nov '18
Hi Andrei!
Here is the patch to move the cleanup of old GTID rows into the slave
background thread. It is mostly identical to the patch I wrote a month
ago, but now I got test cases added/updated and I ran it through buildbot.
Want to review it? The patch should be complete now. The patch is also
available in git here:
https://github.com/knielsen/server/tree/gtid_delete_in_background_thread
This patch could go into 10.4.
With the patch, a request is sent to the background thread to delete old
rows every @@gtid_cleanup_batch_size replicated transactions. Some
considerations on the default of this value:
- It should be reasonably large so that the background thread only needs to
wake up occasionally, and a number of row deletions can share the cost of
the part that is not inherently per-row.
- A large enough value can group a number of row deletions in a single
transaction which is probably beneficial. There is also a fixed overhead
per existing gtid_domain_id to scan the GTID state, so the value should
not be much smaller than the number of domains in use.
- It should not be too large either, since that could cause the
mysql.gtid_slave_pos table to grow unnecessarily. Ideally the table
should fit in one InnoDB data page.
I settled on a default of 64. Probably a wide range will work fine in >99%
of cases, eg. 10 or 100 should be mostly the same. 64 should be large enough
that the per-batch overhead becomes small relative to the per-row, even if
hundreds of domains are used. And small enough that the table is likely to
be able to fit in a single InnoDB 16k page, even considering that deletion
is asynchroneous and more rows can accumulate temporarily due to parallel
replication etc.
I think this is a nice improvement. Moving the deletion logic away from the
main replication SQL/worker threads simplifies the logic, eg. for when a
transaction fails after a deletion has already been done. And it also has
the potential to be more efficient; the contention on global mutex
LOCK_slave_state is reduced, processing is moved out of the main replication
threads, and batching row deletions could potentially be cheaper than doing
them one-by-one.
- Kristian.
Kristian Nielsen <knielsen(a)knielsen-hq.org> writes:
> revision-id: 8bfb140d5dc247c183787b8a0a1799cf375845bd (mariadb-10.3.10-25-g8bfb140d5dc)
> parent(s): 74387028a06c557f36a0fd1bbde347f1551c8fb7
> author: Kristian Nielsen
> committer: Kristian Nielsen
> timestamp: 2018-11-25 19:38:33 +0100
> message:
>
> Move deletion of old GTID rows to slave background thread
>
> This patch changes how old rows in mysql.gtid_slave_pos* tables are deleted.
> Instead of doing it as part of every replicated transaction in
> record_gtid(), it is done periodically (every @@gtid_cleanup_batch_size
> transaction) in the slave background thread.
>
> This removes the deletion step from the replication process in SQL or worker
> threads, which could speed up replication with many small transactions. It
> also decreases contention on the global mutex LOCK_slave_state. And it
> simplifies the logic, eg. when a replicated transaction fails after having
> deleted old rows.
>
> With this patch, the deletion of old GTID rows happens asynchroneously and
> slightly non-deterministic. Thus the number of old rows in
> mysql.gtid_slave_pos can temporarily exceed @@gtid_cleanup_batch_size. But
> all old rows will be deleted eventually after sufficiently many new GTIDs
> have been replicated.
>
> ---
> mysql-test/main/mysqld--help.result | 10 +
> mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result | 40 +-
> mysql-test/suite/rpl/r/rpl_gtid_stop_start.result | 8 +-
> .../suite/rpl/r/rpl_parallel_optimistic.result | 14 +-
> mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test | 68 +++-
> .../suite/rpl/t/rpl_parallel_optimistic.test | 42 ++-
> .../sys_vars/r/sysvars_server_notembedded.result | 14 +
> sql/log_event.cc | 6 +-
> sql/mysqld.cc | 1 +
> sql/mysqld.h | 1 +
> sql/rpl_gtid.cc | 413 +++++++++++++--------
> sql/rpl_gtid.h | 12 +-
> sql/rpl_rli.cc | 87 +----
> sql/rpl_rli.h | 11 -
> sql/slave.cc | 35 +-
> sql/slave.h | 1 +
> sql/sys_vars.cc | 13 +
> .../mysql-test/rocksdb_rpl/r/mdev12179.result | 18 +
> .../mysql-test/rocksdb_rpl/t/mdev12179.test | 85 +++++
> .../mysql-test/tokudb_rpl/r/mdev12179.result | 18 +
> .../tokudb/mysql-test/tokudb_rpl/t/mdev12179.test | 85 +++++
> 21 files changed, 675 insertions(+), 307 deletions(-)
>
> diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result
> index 5a7153f32d3..4f801ec5275 100644
> --- a/mysql-test/main/mysqld--help.result
> +++ b/mysql-test/main/mysqld--help.result
> @@ -294,6 +294,15 @@ The following specify which files/extra groups are read (specified before remain
> --group-concat-max-len=#
> The maximum length of the result of function
> GROUP_CONCAT()
> + --gtid-cleanup-batch-size=#
> + Normally does not need tuning. How many old rows must
> + accumulate in the mysql.gtid_slave_pos table before a
> + background job will be run to delete them. Can be
> + increased to reduce number of commits if using many
> + different engines with --gtid_pos_auto_engines, or to
> + reduce CPU overhead if using a huge number of different
> + gtid_domain_ids. Can be decreased to reduce number of old
> + rows in the table.
> --gtid-domain-id=# Used with global transaction ID to identify logically
> independent replication streams. When events can
> propagate through multiple parallel paths (for example
> @@ -1425,6 +1434,7 @@ gdb FALSE
> general-log FALSE
> getopt-prefix-matching FALSE
> group-concat-max-len 1048576
> +gtid-cleanup-batch-size 64
> gtid-domain-id 0
> gtid-ignore-duplicates FALSE
> gtid-pos-auto-engines
> diff --git a/mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result b/mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result
> index aaeb0c8f119..55d2831dcf4 100644
> --- a/mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result
> +++ b/mysql-test/suite/rpl/r/rpl_gtid_mdev4484.result
> @@ -16,36 +16,32 @@ INSERT INTO t1 VALUES (1);
> connection slave;
> connection slave;
> include/stop_slave.inc
> +SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
> +SET GLOBAL gtid_cleanup_batch_size= 2;
> SET @old_dbug= @@GLOBAL.debug_dbug;
> SET GLOBAL debug_dbug="+d,gtid_slave_pos_simulate_failed_delete";
> SET sql_log_bin= 0;
> -CALL mtr.add_suppression("Can't find file");
> +CALL mtr.add_suppression("<DEBUG> Error deleting old GTID row");
> SET sql_log_bin= 1;
> include/start_slave.inc
> connection master;
> -INSERT INTO t1 VALUES (2);
> -connection slave;
> -include/wait_for_slave_sql_error.inc [errno=1942]
> -STOP SLAVE IO_THREAD;
> -SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
> -ORDER BY domain_id, sub_id DESC LIMIT 1;
> -domain_id server_id seq_no
> -0 1 3
> +connection slave;
> +SELECT COUNT(*), MAX(seq_no) INTO @pre_count, @pre_max_seq_no
> +FROM mysql.gtid_slave_pos;
> +SELECT IF(@pre_count >= 20, "OK", CONCAT("Error: too few rows seen while errors injected: ", @pre_count));
> +IF(@pre_count >= 20, "OK", CONCAT("Error: too few rows seen while errors injected: ", @pre_count))
> +OK
> SET GLOBAL debug_dbug= @old_dbug;
> -include/start_slave.inc
> connection master;
> -INSERT INTO t1 VALUES (3);
> -connection slave;
> -connection slave;
> -SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
> -ORDER BY domain_id, sub_id DESC LIMIT 1;
> -domain_id server_id seq_no
> -0 1 4
> -SELECT * FROM t1 ORDER BY i;
> -i
> -1
> -2
> -3
> +connection slave;
> +connection slave;
> +SELECT IF(COUNT(*) >= 1, "OK", CONCAT("Error: too few rows seen after errors no longer injected: ", COUNT(*)))
> +FROM mysql.gtid_slave_pos
> +WHERE seq_no <= @pre_max_seq_no;
> +IF(COUNT(*) >= 1, "OK", CONCAT("Error: too few rows seen after errors no longer injected: ", COUNT(*)))
> +OK
> connection master;
> DROP TABLE t1;
> +connection slave;
> +SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;
> include/rpl_end.inc
> diff --git a/mysql-test/suite/rpl/r/rpl_gtid_stop_start.result b/mysql-test/suite/rpl/r/rpl_gtid_stop_start.result
> index ff845794c22..b27ffed9f94 100644
> --- a/mysql-test/suite/rpl/r/rpl_gtid_stop_start.result
> +++ b/mysql-test/suite/rpl/r/rpl_gtid_stop_start.result
> @@ -171,7 +171,7 @@ include/start_slave.inc
> *** MDEV-4692: mysql.gtid_slave_pos accumulates values for a domain ***
> SELECT domain_id, COUNT(*) FROM mysql.gtid_slave_pos GROUP BY domain_id;
> domain_id COUNT(*)
> -0 2
> +0 3
> 1 2
> connection server_1;
> INSERT INTO t1 VALUES (11);
> @@ -179,7 +179,7 @@ connection server_2;
> FLUSH NO_WRITE_TO_BINLOG TABLES;
> SELECT domain_id, COUNT(*) FROM mysql.gtid_slave_pos GROUP BY domain_id;
> domain_id COUNT(*)
> -0 2
> +0 4
> 1 2
> include/start_slave.inc
> connection server_1;
> @@ -189,8 +189,8 @@ connection server_2;
> FLUSH NO_WRITE_TO_BINLOG TABLES;
> SELECT domain_id, COUNT(*) FROM mysql.gtid_slave_pos GROUP BY domain_id;
> domain_id COUNT(*)
> -0 2
> -1 2
> +0 3
> +1 1
> *** MDEV-4650: show variables; ERROR 1946 (HY000): Failed to load replication slave GTID position ***
> connection server_2;
> SET sql_log_bin=0;
> diff --git a/mysql-test/suite/rpl/r/rpl_parallel_optimistic.result b/mysql-test/suite/rpl/r/rpl_parallel_optimistic.result
> index ca202a66b0e..83343e52cab 100644
> --- a/mysql-test/suite/rpl/r/rpl_parallel_optimistic.result
> +++ b/mysql-test/suite/rpl/r/rpl_parallel_optimistic.result
> @@ -12,6 +12,8 @@ SET GLOBAL slave_parallel_threads=10;
> CHANGE MASTER TO master_use_gtid=slave_pos;
> SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
> SET GLOBAL slave_parallel_mode='optimistic';
> +SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
> +SET GLOBAL gtid_cleanup_batch_size= 1000000;
> connection server_1;
> INSERT INTO t1 VALUES(1,1);
> BEGIN;
> @@ -131,6 +133,11 @@ c
> 204
> 205
> 206
> +SELECT IF(COUNT(*) >= 30, "OK", CONCAT("Error: too few old rows found: ", COUNT(*)))
> +FROM mysql.gtid_slave_pos;
> +IF(COUNT(*) >= 30, "OK", CONCAT("Error: too few old rows found: ", COUNT(*)))
> +OK
> +SET GLOBAL gtid_cleanup_batch_size=1;
> *** Test @@skip_parallel_replication. ***
> connection server_2;
> include/stop_slave.inc
> @@ -651,9 +658,10 @@ DROP TABLE t1, t2, t3;
> include/save_master_gtid.inc
> connection server_2;
> include/sync_with_master_gtid.inc
> -Check that no more than the expected last four GTIDs are in mysql.gtid_slave_pos
> -select count(4) <= 4 from mysql.gtid_slave_pos order by domain_id, sub_id;
> -count(4) <= 4
> +SELECT COUNT(*) <= 5*@@GLOBAL.gtid_cleanup_batch_size
> +FROM mysql.gtid_slave_pos;
> +COUNT(*) <= 5*@@GLOBAL.gtid_cleanup_batch_size
> 1
> +SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;
> connection server_1;
> include/rpl_end.inc
> diff --git a/mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test b/mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test
> index e1f5696f5a1..a28bff3d27a 100644
> --- a/mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test
> +++ b/mysql-test/suite/rpl/t/rpl_gtid_mdev4484.test
> @@ -28,37 +28,79 @@ INSERT INTO t1 VALUES (1);
> # Inject an artificial error deleting entries, and check that the error handling code works.
> --connection slave
> --source include/stop_slave.inc
> +SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
> +SET GLOBAL gtid_cleanup_batch_size= 2;
> SET @old_dbug= @@GLOBAL.debug_dbug;
> SET GLOBAL debug_dbug="+d,gtid_slave_pos_simulate_failed_delete";
> SET sql_log_bin= 0;
> -CALL mtr.add_suppression("Can't find file");
> +CALL mtr.add_suppression("<DEBUG> Error deleting old GTID row");
> SET sql_log_bin= 1;
> --source include/start_slave.inc
>
> --connection master
> -INSERT INTO t1 VALUES (2);
> +--disable_query_log
> +let $i = 20;
> +while ($i) {
> + eval INSERT INTO t1 VALUES ($i+10);
> + dec $i;
> +}
> +--enable_query_log
> +--save_master_pos
>
> --connection slave
> ---let $slave_sql_errno= 1942
> ---source include/wait_for_slave_sql_error.inc
> -STOP SLAVE IO_THREAD;
> -SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
> - ORDER BY domain_id, sub_id DESC LIMIT 1;
> +--sync_with_master
> +
> +# Now wait for the slave background thread to try to delete old rows and
> +# hit the error injection.
> +--let _TEST_MYSQLD_ERROR_LOG=$MYSQLTEST_VARDIR/log/mysqld.2.err
> +--perl
> + open F, '<', $ENV{'_TEST_MYSQLD_ERROR_LOG'} or die;
> + outer: while (1) {
> + inner: while (<F>) {
> + last outer if /<DEBUG> Error deleting old GTID row/;
> + }
> + # Easy way to do sub-second sleep without extra modules.
> + select(undef, undef, undef, 0.1);
> + }
> +EOF
> +
> +# Since we injected error in the cleanup code, the rows should remain in
> +# mysql.gtid_slave_pos. Check that we have at least 20 (more robust against
> +# non-deterministic cleanup and future changes than checking for exact number).
> +SELECT COUNT(*), MAX(seq_no) INTO @pre_count, @pre_max_seq_no
> + FROM mysql.gtid_slave_pos;
> +SELECT IF(@pre_count >= 20, "OK", CONCAT("Error: too few rows seen while errors injected: ", @pre_count));
> SET GLOBAL debug_dbug= @old_dbug;
> ---source include/start_slave.inc
>
> --connection master
> -INSERT INTO t1 VALUES (3);
> +--disable_query_log
> +let $i = 20;
> +while ($i) {
> + eval INSERT INTO t1 VALUES ($i+40);
> + dec $i;
> +}
> +--enable_query_log
> --sync_slave_with_master
>
> --connection slave
> -SELECT domain_id, server_id, seq_no FROM mysql.gtid_slave_pos
> - ORDER BY domain_id, sub_id DESC LIMIT 1;
> -SELECT * FROM t1 ORDER BY i;
> -
> +# Now check that 1) rows are being deleted again after removing error
> +# injection, and 2) old rows are left that failed their delete while errors
> +# where injected (again compensating for non-deterministic deletion).
> +# Deletion is async and slightly non-deterministic, so we wait for at
> +# least 10 of the 20 new rows to be deleted.
> +let $wait_condition=
> + SELECT COUNT(*) <= 20-10
> + FROM mysql.gtid_slave_pos
> + WHERE seq_no > @pre_max_seq_no;
> +--source include/wait_condition.inc
> +SELECT IF(COUNT(*) >= 1, "OK", CONCAT("Error: too few rows seen after errors no longer injected: ", COUNT(*)))
> + FROM mysql.gtid_slave_pos
> + WHERE seq_no <= @pre_max_seq_no;
>
> # Clean up
> --connection master
> DROP TABLE t1;
> +--connection slave
> +SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;
>
> --source include/rpl_end.inc
> diff --git a/mysql-test/suite/rpl/t/rpl_parallel_optimistic.test b/mysql-test/suite/rpl/t/rpl_parallel_optimistic.test
> index e08472d5f51..0060cf4416c 100644
> --- a/mysql-test/suite/rpl/t/rpl_parallel_optimistic.test
> +++ b/mysql-test/suite/rpl/t/rpl_parallel_optimistic.test
> @@ -21,6 +21,10 @@ SET GLOBAL slave_parallel_threads=10;
> CHANGE MASTER TO master_use_gtid=slave_pos;
> SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
> SET GLOBAL slave_parallel_mode='optimistic';
> +# Run the first part of the test with high batch size and see that
> +# old rows remain in the table.
> +SET @old_gtid_cleanup_batch_size= @@GLOBAL.gtid_cleanup_batch_size;
> +SET GLOBAL gtid_cleanup_batch_size= 1000000;
>
>
> --connection server_1
> @@ -108,7 +112,12 @@ SELECT * FROM t3 ORDER BY c;
> SELECT * FROM t1 ORDER BY a;
> SELECT * FROM t2 ORDER BY a;
> SELECT * FROM t3 ORDER BY c;
> -#SHOW STATUS LIKE 'Slave_retried_transactions';
> +# Check that we have a bunch of old rows left-over - they were not deleted
> +# due to high @@gtid_cleanup_batch_size. Then set a low
> +# @@gtid_cleanup_batch_size so we can test that rows start being deleted.
> +SELECT IF(COUNT(*) >= 30, "OK", CONCAT("Error: too few old rows found: ", COUNT(*)))
> + FROM mysql.gtid_slave_pos;
> +SET GLOBAL gtid_cleanup_batch_size=1;
>
>
> --echo *** Test @@skip_parallel_replication. ***
> @@ -557,25 +566,18 @@ DROP TABLE t1, t2, t3;
>
> --connection server_2
> --source include/sync_with_master_gtid.inc
> -# Check for left-over rows in table mysql.gtid_slave_pos (MDEV-12147).
> -#
> -# There was a bug when a transaction got a conflict and was rolled back. It
> -# might have also handled deletion of some old rows, and these deletions would
> -# then also be rolled back. And since the deletes were never re-tried, old no
> -# longer needed rows would accumulate in the table without limit.
> -#
> -# The earlier part of this test file have plenty of transactions being rolled
> -# back. But the last DROP TABLE statement runs on its own and should never
> -# conflict, thus at this point the mysql.gtid_slave_pos table should be clean.
> -#
> -# To support @@gtid_pos_auto_engines, when a row is inserted in the table, it
> -# is associated with the engine of the table at insertion time, and it will
> -# only be deleted during record_gtid from a table of the same engine. Since we
> -# alter the table from MyISAM to InnoDB at the start of this test, we should
> -# end up with 4 rows: two left-over from when the table was MyISAM, and two
> -# left-over from the InnoDB part.
> ---echo Check that no more than the expected last four GTIDs are in mysql.gtid_slave_pos
> -select count(4) <= 4 from mysql.gtid_slave_pos order by domain_id, sub_id;
> +# Check that old rows are deleted from mysql.gtid_slave_pos.
> +# Deletion is asynchronous, so use wait_condition.inc.
> +# Also, there is a small amount of non-determinism in the deletion of old
> +# rows, so it is not guaranteed that there can never be more than
> +# @@gtid_cleanup_batch_size rows in the table; so allow a bit of slack
> +# here.
> +let $wait_condition=
> + SELECT COUNT(*) <= 5*@@GLOBAL.gtid_cleanup_batch_size
> + FROM mysql.gtid_slave_pos;
> +--source include/wait_condition.inc
> +eval $wait_condition;
> +SET GLOBAL gtid_cleanup_batch_size= @old_gtid_cleanup_batch_size;
>
> --connection server_1
> --source include/rpl_end.inc
> diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
> index e8e4d671eb9..5c5ca8b66b2 100644
> --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
> +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
> @@ -1202,6 +1202,20 @@ NUMERIC_BLOCK_SIZE NULL
> ENUM_VALUE_LIST NULL
> READ_ONLY NO
> COMMAND_LINE_ARGUMENT NULL
> +VARIABLE_NAME GTID_CLEANUP_BATCH_SIZE
> +SESSION_VALUE NULL
> +GLOBAL_VALUE 64
> +GLOBAL_VALUE_ORIGIN COMPILE-TIME
> +DEFAULT_VALUE 64
> +VARIABLE_SCOPE GLOBAL
> +VARIABLE_TYPE INT UNSIGNED
> +VARIABLE_COMMENT Normally does not need tuning. How many old rows must accumulate in the mysql.gtid_slave_pos table before a background job will be run to delete them. Can be increased to reduce number of commits if using many different engines with --gtid_pos_auto_engines, or to reduce CPU overhead if using a huge number of different gtid_domain_ids. Can be decreased to reduce number of old rows in the table.
> +NUMERIC_MIN_VALUE 0
> +NUMERIC_MAX_VALUE 2147483647
> +NUMERIC_BLOCK_SIZE 1
> +ENUM_VALUE_LIST NULL
> +READ_ONLY NO
> +COMMAND_LINE_ARGUMENT REQUIRED
> VARIABLE_NAME GTID_CURRENT_POS
> SESSION_VALUE NULL
> GLOBAL_VALUE
> diff --git a/sql/log_event.cc b/sql/log_event.cc
> index 8813d20578e..e10480fb015 100644
> --- a/sql/log_event.cc
> +++ b/sql/log_event.cc
> @@ -5565,7 +5565,7 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
> gtid= rgi->current_gtid;
> if (unlikely(rpl_global_gtid_slave_state->record_gtid(thd, >id,
> sub_id,
> - rgi, false,
> + true, false,
> &hton)))
> {
> int errcode= thd->get_stmt_da()->sql_errno();
> @@ -8362,7 +8362,7 @@ Gtid_list_log_event::do_apply_event(rpl_group_info *rgi)
> {
> if ((ret= rpl_global_gtid_slave_state->record_gtid(thd, &list[i],
> sub_id_list[i],
> - NULL, false, &hton)))
> + false, false, &hton)))
> return ret;
> rpl_global_gtid_slave_state->update_state_hash(sub_id_list[i], &list[i],
> hton, NULL);
> @@ -8899,7 +8899,7 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi)
> rgi->gtid_pending= false;
>
> gtid= rgi->current_gtid;
> - err= rpl_global_gtid_slave_state->record_gtid(thd, >id, sub_id, rgi,
> + err= rpl_global_gtid_slave_state->record_gtid(thd, >id, sub_id, true,
> false, &hton);
> if (unlikely(err))
> {
> diff --git a/sql/mysqld.cc b/sql/mysqld.cc
> index afef4a5f52c..07bdd66f74c 100644
> --- a/sql/mysqld.cc
> +++ b/sql/mysqld.cc
> @@ -580,6 +580,7 @@ ulong opt_binlog_commit_wait_count= 0;
> ulong opt_binlog_commit_wait_usec= 0;
> ulong opt_slave_parallel_max_queued= 131072;
> my_bool opt_gtid_ignore_duplicates= FALSE;
> +uint opt_gtid_cleanup_batch_size= 64;
>
> const double log_10[] = {
> 1e000, 1e001, 1e002, 1e003, 1e004, 1e005, 1e006, 1e007, 1e008, 1e009,
> diff --git a/sql/mysqld.h b/sql/mysqld.h
> index d5cabd790b2..261748372f9 100644
> --- a/sql/mysqld.h
> +++ b/sql/mysqld.h
> @@ -258,6 +258,7 @@ extern ulong opt_slave_parallel_mode;
> extern ulong opt_binlog_commit_wait_count;
> extern ulong opt_binlog_commit_wait_usec;
> extern my_bool opt_gtid_ignore_duplicates;
> +extern uint opt_gtid_cleanup_batch_size;
> extern ulong back_log;
> extern ulong executed_events;
> extern char language[FN_REFLEN];
> diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc
> index fabd09adaa7..196c2fe3d16 100644
> --- a/sql/rpl_gtid.cc
> +++ b/sql/rpl_gtid.cc
> @@ -79,7 +79,7 @@ rpl_slave_state::record_and_update_gtid(THD *thd, rpl_group_info *rgi)
> rgi->gtid_pending= false;
> if (rgi->gtid_ignore_duplicate_state!=rpl_group_info::GTID_DUPLICATE_IGNORE)
> {
> - if (record_gtid(thd, &rgi->current_gtid, sub_id, NULL, false, &hton))
> + if (record_gtid(thd, &rgi->current_gtid, sub_id, false, false, &hton))
> DBUG_RETURN(1);
> update_state_hash(sub_id, &rgi->current_gtid, hton, rgi);
> }
> @@ -244,7 +244,7 @@ rpl_slave_state_free_element(void *arg)
>
>
> rpl_slave_state::rpl_slave_state()
> - : last_sub_id(0), gtid_pos_tables(0), loaded(false)
> + : pending_gtid_count(0), last_sub_id(0), gtid_pos_tables(0), loaded(false)
> {
> mysql_mutex_init(key_LOCK_slave_state, &LOCK_slave_state,
> MY_MUTEX_INIT_SLOW);
> @@ -331,14 +331,11 @@ rpl_slave_state::update(uint32 domain_id, uint32 server_id, uint64 sub_id,
> }
> }
> rgi->gtid_ignore_duplicate_state= rpl_group_info::GTID_DUPLICATE_NULL;
> -
> -#ifdef HAVE_REPLICATION
> - rgi->pending_gtid_deletes_clear();
> -#endif
> }
>
> if (!(list_elem= (list_element *)my_malloc(sizeof(*list_elem), MYF(MY_WME))))
> return 1;
> + list_elem->domain_id= domain_id;
> list_elem->server_id= server_id;
> list_elem->sub_id= sub_id;
> list_elem->seq_no= seq_no;
> @@ -348,6 +345,15 @@ rpl_slave_state::update(uint32 domain_id, uint32 server_id, uint64 sub_id,
> if (last_sub_id < sub_id)
> last_sub_id= sub_id;
>
> +#ifdef HAVE_REPLICATION
> + ++pending_gtid_count;
> + if (pending_gtid_count >= opt_gtid_cleanup_batch_size)
> + {
> + pending_gtid_count = 0;
> + slave_background_gtid_pending_delete_request();
> + }
> +#endif
> +
> return 0;
> }
>
> @@ -382,20 +388,22 @@ rpl_slave_state::get_element(uint32 domain_id)
>
>
> int
> -rpl_slave_state::put_back_list(uint32 domain_id, list_element *list)
> +rpl_slave_state::put_back_list(list_element *list)
> {
> - element *e;
> + element *e= NULL;
> int err= 0;
>
> mysql_mutex_lock(&LOCK_slave_state);
> - if (!(e= (element *)my_hash_search(&hash, (const uchar *)&domain_id, 0)))
> - {
> - err= 1;
> - goto end;
> - }
> while (list)
> {
> list_element *next= list->next;
> +
> + if ((!e || e->domain_id != list->domain_id) &&
> + !(e= (element *)my_hash_search(&hash, (const uchar *)&list->domain_id, 0)))
> + {
> + err= 1;
> + goto end;
> + }
> e->add(list);
> list= next;
> }
> @@ -572,12 +580,12 @@ rpl_slave_state::select_gtid_pos_table(THD *thd, LEX_CSTRING *out_tablename)
> /*
> Write a gtid to the replication slave state table.
>
> + Do it as part of the transaction, to get slave crash safety, or as a separate
> + transaction if !in_transaction (eg. MyISAM or DDL).
> +
> gtid The global transaction id for this event group.
> sub_id Value allocated within the sub_id when the event group was
> read (sub_id must be consistent with commit order in master binlog).
> - rgi rpl_group_info context, if we are recording the gtid transactionally
> - as part of replicating a transactional event. NULL if called from
> - outside of a replicated transaction.
>
> Note that caller must later ensure that the new gtid and sub_id is inserted
> into the appropriate HASH element with rpl_slave_state.add(), so that it can
> @@ -585,16 +593,13 @@ rpl_slave_state::select_gtid_pos_table(THD *thd, LEX_CSTRING *out_tablename)
> */
> int
> rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
> - rpl_group_info *rgi, bool in_statement,
> + bool in_transaction, bool in_statement,
> void **out_hton)
> {
> TABLE_LIST tlist;
> int err= 0, not_sql_thread;
> bool table_opened= false;
> TABLE *table;
> - list_element *delete_list= 0, *next, *cur, **next_ptr_ptr, **best_ptr_ptr;
> - uint64 best_sub_id;
> - element *elem;
> ulonglong thd_saved_option= thd->variables.option_bits;
> Query_tables_list lex_backup;
> wait_for_commit* suspended_wfc;
> @@ -684,7 +689,7 @@ rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
> thd->wsrep_ignore_table= true;
> #endif
>
> - if (!rgi)
> + if (!in_transaction)
> {
> DBUG_PRINT("info", ("resetting OPTION_BEGIN"));
> thd->variables.option_bits&=
> @@ -716,168 +721,280 @@ rpl_slave_state::record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
> my_error(ER_OUT_OF_RESOURCES, MYF(0));
> goto end;
> }
> +end:
>
> - mysql_mutex_lock(&LOCK_slave_state);
> - if ((elem= get_element(gtid->domain_id)) == NULL)
> +#ifdef WITH_WSREP
> + thd->wsrep_ignore_table= false;
> +#endif
> +
> + if (table_opened)
> {
> - mysql_mutex_unlock(&LOCK_slave_state);
> - my_error(ER_OUT_OF_RESOURCES, MYF(0));
> - err= 1;
> - goto end;
> + if (err || (err= ha_commit_trans(thd, FALSE)))
> + ha_rollback_trans(thd, FALSE);
> +
> + close_thread_tables(thd);
> + if (in_transaction)
> + thd->mdl_context.release_statement_locks();
> + else
> + thd->mdl_context.release_transactional_locks();
> }
> + thd->lex->restore_backup_query_tables_list(&lex_backup);
> + thd->variables.option_bits= thd_saved_option;
> + thd->resume_subsequent_commits(suspended_wfc);
> + DBUG_EXECUTE_IF("inject_record_gtid_serverid_100_sleep",
> + {
> + if (gtid->server_id == 100)
> + my_sleep(500000);
> + });
> + DBUG_RETURN(err);
> +}
>
> - /* Now pull out all GTIDs that were recorded in this engine. */
> - delete_list = NULL;
> - next_ptr_ptr= &elem->list;
> - cur= elem->list;
> - best_sub_id= 0;
> - best_ptr_ptr= NULL;
> - while (cur)
> +
> +/*
> + Return a list of all old GTIDs in any mysql.gtid_slave_pos* table that are
> + no longer needed and can be deleted from the table.
> +
> + Within each domain, we need to keep around the latest GTID (the one with the
> + highest sub_id), but any others in that domain can be deleted.
> +*/
> +rpl_slave_state::list_element *
> +rpl_slave_state::gtid_grab_pending_delete_list()
> +{
> + uint32 i;
> + list_element *full_list;
> +
> + mysql_mutex_lock(&LOCK_slave_state);
> + full_list= NULL;
> + for (i= 0; i < hash.records; ++i)
> {
> - list_element *next= cur->next;
> - if (cur->hton == hton)
> - {
> - /* Belongs to same engine, so move it to the delete list. */
> - cur->next= delete_list;
> - delete_list= cur;
> - if (cur->sub_id > best_sub_id)
> + element *elem= (element *)my_hash_element(&hash, i);
> + list_element *elist= elem->list;
> + list_element *last_elem, **best_ptr_ptr, *cur, *next;
> + uint64 best_sub_id;
> +
> + if (!elist)
> + continue; /* Nothing here */
> +
> + /* Delete any old stuff, but keep around the most recent one. */
> + cur= elist;
> + best_sub_id= cur->sub_id;
> + best_ptr_ptr= &elist;
> + last_elem= cur;
> + while ((next= cur->next)) {
> + last_elem= next;
> + if (next->sub_id > best_sub_id)
> {
> - best_sub_id= cur->sub_id;
> - best_ptr_ptr= &delete_list;
> - }
> - else if (best_ptr_ptr == &delete_list)
> + best_sub_id= next->sub_id;
> best_ptr_ptr= &cur->next;
> - }
> - else
> - {
> - /* Another engine, leave it in the list. */
> - if (cur->sub_id > best_sub_id)
> - {
> - best_sub_id= cur->sub_id;
> - /* Current best is not on the delete list. */
> - best_ptr_ptr= NULL;
> }
> - *next_ptr_ptr= cur;
> - next_ptr_ptr= &cur->next;
> + cur= next;
> }
> - cur= next;
> - }
> - *next_ptr_ptr= NULL;
> - /*
> - If the highest sub_id element is on the delete list, put it back on the
> - original list, to preserve the highest sub_id element in the table for
> - GTID position recovery.
> - */
> - if (best_ptr_ptr)
> - {
> + /*
> + Append the new elements to the full list. Note the order is important;
> + we do it here so that we do not break the list if best_sub_id is the
> + last of the new elements.
> + */
> + last_elem->next= full_list;
> + /*
> + Delete the highest sub_id element from the old list, and put it back as
> + the single-element new list.
> + */
> cur= *best_ptr_ptr;
> *best_ptr_ptr= cur->next;
> - cur->next= elem->list;
> + cur->next= NULL;
> elem->list= cur;
> +
> + /*
> + Collect the full list so far here. Note that elist may have moved if we
> + deleted the first element, so order is again important.
> + */
> + full_list= elist;
> }
> mysql_mutex_unlock(&LOCK_slave_state);
>
> - if (!delete_list)
> - goto end;
> + return full_list;
> +}
> +
>
> - /* Now delete any already committed GTIDs. */
> - bitmap_set_bit(table->read_set, table->field[0]->field_index);
> - bitmap_set_bit(table->read_set, table->field[1]->field_index);
> +/* Find the mysql.gtid_slave_posXXX table associated with a given hton. */
> +LEX_CSTRING *
> +rpl_slave_state::select_gtid_pos_table(void *hton)
> +{
> + struct gtid_pos_table *table_entry;
>
> - if ((err= table->file->ha_index_init(0, 0)))
> + /*
> + See comments on rpl_slave_state::gtid_pos_tables for rules around proper
> + access to the list.
> + */
> + table_entry= (struct gtid_pos_table *)
> + my_atomic_loadptr_explicit(>id_pos_tables, MY_MEMORY_ORDER_ACQUIRE);
> +
> + while (table_entry)
> {
> - table->file->print_error(err, MYF(0));
> - goto end;
> + if (table_entry->table_hton == hton)
> + {
> + if (likely(table_entry->state == GTID_POS_AVAILABLE))
> + return &table_entry->table_name;
> + }
> + table_entry= table_entry->next;
> }
> - cur = delete_list;
> - while (cur)
> - {
> - uchar key_buffer[4+8];
>
> - DBUG_EXECUTE_IF("gtid_slave_pos_simulate_failed_delete",
> - { err= ENOENT;
> - table->file->print_error(err, MYF(0));
> - /* `break' does not work inside DBUG_EXECUTE_IF */
> - goto dbug_break; });
> + table_entry= (struct gtid_pos_table *)
> + my_atomic_loadptr_explicit(&default_gtid_pos_table, MY_MEMORY_ORDER_ACQUIRE);
> + return &table_entry->table_name;
> +}
>
> - next= cur->next;
>
> - table->field[1]->store(cur->sub_id, true);
> - /* domain_id is already set in table->record[0] from write_row() above. */
> - key_copy(key_buffer, table->record[0], &table->key_info[0], 0, false);
> - if (table->file->ha_index_read_map(table->record[1], key_buffer,
> - HA_WHOLE_KEY, HA_READ_KEY_EXACT))
> - /* We cannot find the row, assume it is already deleted. */
> - ;
> - else if ((err= table->file->ha_delete_row(table->record[1])))
> - table->file->print_error(err, MYF(0));
> - /*
> - In case of error, we still discard the element from the list. We do
> - not want to endlessly error on the same element in case of table
> - corruption or such.
> - */
> - cur= next;
> - if (err)
> - break;
> - }
> -IF_DBUG(dbug_break:, )
> - table->file->ha_index_end();
> +void
> +rpl_slave_state::gtid_delete_pending(THD *thd,
> + rpl_slave_state::list_element **list_ptr)
> +{
> + int err= 0;
> + ulonglong thd_saved_option;
>
> -end:
> + if (unlikely(!loaded))
> + return;
>
> #ifdef WITH_WSREP
> - thd->wsrep_ignore_table= false;
> + /*
> + Updates in slave state table should not be appended to galera transaction
> + writeset.
> + */
> + thd->wsrep_ignore_table= true;
> #endif
>
> - if (table_opened)
> + thd_saved_option= thd->variables.option_bits;
> + thd->variables.option_bits&=
> + ~(ulonglong)(OPTION_NOT_AUTOCOMMIT |OPTION_BEGIN |OPTION_BIN_LOG |
> + OPTION_GTID_BEGIN);
> +
> + while (*list_ptr)
> {
> - if (err || (err= ha_commit_trans(thd, FALSE)))
> - {
> - /*
> - If error, we need to put any remaining delete_list back into the HASH
> - so we can do another delete attempt later.
> - */
> - if (delete_list)
> - {
> - put_back_list(gtid->domain_id, delete_list);
> - delete_list = 0;
> - }
> + LEX_CSTRING *gtid_pos_table_name, *tmp_table_name;
> + Query_tables_list lex_backup;
> + TABLE_LIST tlist;
> + TABLE *table;
> + handler::Table_flags direct_pos;
> + list_element *cur, **cur_ptr_ptr;
> + bool table_opened= false;
> + void *hton= (*list_ptr)->hton;
>
> - ha_rollback_trans(thd, FALSE);
> + thd->reset_for_next_command();
> +
> + /*
> + Only the SQL thread can call select_gtid_pos_table without a mutex
> + Other threads needs to use a mutex and take into account that the
> + result may change during execution, so we have to make a copy.
> + */
> + mysql_mutex_lock(&LOCK_slave_state);
> + tmp_table_name= select_gtid_pos_table(hton);
> + gtid_pos_table_name= thd->make_clex_string(tmp_table_name->str,
> + tmp_table_name->length);
> + mysql_mutex_unlock(&LOCK_slave_state);
> + if (!gtid_pos_table_name)
> + {
> + /* Out of memory - we can try again later. */
> + break;
> }
> - close_thread_tables(thd);
> - if (rgi)
> +
> + thd->lex->reset_n_backup_query_tables_list(&lex_backup);
> + tlist.init_one_table(&MYSQL_SCHEMA_NAME, gtid_pos_table_name, NULL, TL_WRITE);
> + if ((err= open_and_lock_tables(thd, &tlist, FALSE, 0)))
> + goto end;
> + table_opened= true;
> + table= tlist.table;
> +
> + if ((err= gtid_check_rpl_slave_state_table(table)))
> + goto end;
> +
> + direct_pos= table->file->ha_table_flags() & HA_PRIMARY_KEY_REQUIRED_FOR_POSITION;
> + bitmap_set_all(table->write_set);
> + table->rpl_write_set= table->write_set;
> +
> + /* Now delete any already committed GTIDs. */
> + bitmap_set_bit(table->read_set, table->field[0]->field_index);
> + bitmap_set_bit(table->read_set, table->field[1]->field_index);
> +
> + if (!direct_pos && (err= table->file->ha_index_init(0, 0)))
> {
> - thd->mdl_context.release_statement_locks();
> - /*
> - Save the list of old gtid entries we deleted. If this transaction
> - fails later for some reason and is rolled back, the deletion of those
> - entries will be rolled back as well, and we will need to put them back
> - on the to-be-deleted list so we can re-do the deletion. Otherwise
> - redundant rows in mysql.gtid_slave_pos may accumulate if transactions
> - are rolled back and retried after record_gtid().
> - */
> -#ifdef HAVE_REPLICATION
> - rgi->pending_gtid_deletes_save(gtid->domain_id, delete_list);
> -#endif
> + table->file->print_error(err, MYF(0));
> + goto end;
> }
> - else
> +
> + cur = *list_ptr;
> + cur_ptr_ptr = list_ptr;
> + do
> {
> - thd->mdl_context.release_transactional_locks();
> -#ifdef HAVE_REPLICATION
> - rpl_group_info::pending_gtid_deletes_free(delete_list);
> -#endif
> + uchar key_buffer[4+8];
> + list_element *next= cur->next;
> +
> + if (cur->hton == hton)
> + {
> + int res;
> +
> + table->field[0]->store((ulonglong)cur->domain_id, true);
> + table->field[1]->store(cur->sub_id, true);
> + if (direct_pos)
> + {
> + res= table->file->ha_rnd_pos_by_record(table->record[0]);
> + }
> + else
> + {
> + key_copy(key_buffer, table->record[0], &table->key_info[0], 0, false);
> + res= table->file->ha_index_read_map(table->record[0], key_buffer,
> + HA_WHOLE_KEY, HA_READ_KEY_EXACT);
> + }
> + DBUG_EXECUTE_IF("gtid_slave_pos_simulate_failed_delete",
> + { res= 1;
> + err= ENOENT;
> + sql_print_error("<DEBUG> Error deleting old GTID row");
> + });
> + if (res)
> + /* We cannot find the row, assume it is already deleted. */
> + ;
> + else if ((err= table->file->ha_delete_row(table->record[0])))
> + {
> + sql_print_error("Error deleting old GTID row: %s",
> + thd->get_stmt_da()->message());
> + /*
> + In case of error, we still discard the element from the list. We do
> + not want to endlessly error on the same element in case of table
> + corruption or such.
> + */
> + }
> + *cur_ptr_ptr= next;
> + my_free(cur);
> + }
> + else
> + {
> + /* Leave this one in the list until we get to the table for its hton. */
> + cur_ptr_ptr= &cur->next;
> + }
> + cur= next;
> + if (err)
> + break;
> + } while (cur);
> +end:
> + if (table_opened)
> + {
> + if (!direct_pos)
> + table->file->ha_index_end();
> +
> + if (err || (err= ha_commit_trans(thd, FALSE)))
> + ha_rollback_trans(thd, FALSE);
> }
> + close_thread_tables(thd);
> + thd->mdl_context.release_transactional_locks();
> + thd->lex->restore_backup_query_tables_list(&lex_backup);
> +
> + if (err)
> + break;
> }
> - thd->lex->restore_backup_query_tables_list(&lex_backup);
> thd->variables.option_bits= thd_saved_option;
> - thd->resume_subsequent_commits(suspended_wfc);
> - DBUG_EXECUTE_IF("inject_record_gtid_serverid_100_sleep",
> - {
> - if (gtid->server_id == 100)
> - my_sleep(500000);
> - });
> - DBUG_RETURN(err);
> +
> +#ifdef WITH_WSREP
> + thd->wsrep_ignore_table= false;
> +#endif
> }
>
>
> @@ -1251,7 +1368,7 @@ rpl_slave_state::load(THD *thd, const char *state_from_master, size_t len,
>
> if (gtid_parser_helper(&state_from_master, end, >id) ||
> !(sub_id= next_sub_id(gtid.domain_id)) ||
> - record_gtid(thd, >id, sub_id, NULL, in_statement, &hton) ||
> + record_gtid(thd, >id, sub_id, false, in_statement, &hton) ||
> update(gtid.domain_id, gtid.server_id, sub_id, gtid.seq_no, hton, NULL))
> return 1;
> if (state_from_master == end)
> diff --git a/sql/rpl_gtid.h b/sql/rpl_gtid.h
> index 0fc92d5e33c..60d822f7b0d 100644
> --- a/sql/rpl_gtid.h
> +++ b/sql/rpl_gtid.h
> @@ -118,8 +118,9 @@ struct rpl_slave_state
> {
> struct list_element *next;
> uint64 sub_id;
> - uint64 seq_no;
> + uint32 domain_id;
> uint32 server_id;
> + uint64 seq_no;
> /*
> hton of mysql.gtid_slave_pos* table used to record this GTID.
> Can be NULL if the gtid table failed to load (eg. missing
> @@ -191,6 +192,8 @@ struct rpl_slave_state
>
> /* Mapping from domain_id to its element. */
> HASH hash;
> + /* GTIDs added since last purge of old mysql.gtid_slave_pos rows. */
> + uint32 pending_gtid_count;
> /* Mutex protecting access to the state. */
> mysql_mutex_t LOCK_slave_state;
> /* Auxiliary buffer to sort gtid list. */
> @@ -233,7 +236,10 @@ struct rpl_slave_state
> int truncate_state_table(THD *thd);
> void select_gtid_pos_table(THD *thd, LEX_CSTRING *out_tablename);
> int record_gtid(THD *thd, const rpl_gtid *gtid, uint64 sub_id,
> - rpl_group_info *rgi, bool in_statement, void **out_hton);
> + bool in_transaction, bool in_statement, void **out_hton);
> + list_element *gtid_grab_pending_delete_list();
> + LEX_CSTRING *select_gtid_pos_table(void *hton);
> + void gtid_delete_pending(THD *thd, rpl_slave_state::list_element **list_ptr);
> uint64 next_sub_id(uint32 domain_id);
> int iterate(int (*cb)(rpl_gtid *, void *), void *data,
> rpl_gtid *extra_gtids, uint32 num_extra,
> @@ -245,7 +251,7 @@ struct rpl_slave_state
> bool is_empty();
>
> element *get_element(uint32 domain_id);
> - int put_back_list(uint32 domain_id, list_element *list);
> + int put_back_list(list_element *list);
>
> void update_state_hash(uint64 sub_id, rpl_gtid *gtid, void *hton,
> rpl_group_info *rgi);
> diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc
> index b275ad884bd..2d91620c898 100644
> --- a/sql/rpl_rli.cc
> +++ b/sql/rpl_rli.cc
> @@ -1820,6 +1820,7 @@ rpl_load_gtid_slave_state(THD *thd)
> int err= 0;
> uint32 i;
> load_gtid_state_cb_data cb_data;
> + rpl_slave_state::list_element *old_gtids_list;
> DBUG_ENTER("rpl_load_gtid_slave_state");
>
> mysql_mutex_lock(&rpl_global_gtid_slave_state->LOCK_slave_state);
> @@ -1905,6 +1906,13 @@ rpl_load_gtid_slave_state(THD *thd)
> rpl_global_gtid_slave_state->loaded= true;
> mysql_mutex_unlock(&rpl_global_gtid_slave_state->LOCK_slave_state);
>
> + /* Clear out no longer needed elements now. */
> + old_gtids_list=
> + rpl_global_gtid_slave_state->gtid_grab_pending_delete_list();
> + rpl_global_gtid_slave_state->gtid_delete_pending(thd, &old_gtids_list);
> + if (old_gtids_list)
> + rpl_global_gtid_slave_state->put_back_list(old_gtids_list);
> +
> end:
> if (array_inited)
> delete_dynamic(&array);
> @@ -2086,7 +2094,6 @@ rpl_group_info::reinit(Relay_log_info *rli)
> long_find_row_note_printed= false;
> did_mark_start_commit= false;
> gtid_ev_flags2= 0;
> - pending_gtid_delete_list= NULL;
> last_master_timestamp = 0;
> gtid_ignore_duplicate_state= GTID_DUPLICATE_NULL;
> speculation= SPECULATE_NO;
> @@ -2217,12 +2224,6 @@ void rpl_group_info::cleanup_context(THD *thd, bool error)
> erroneously update the GTID position.
> */
> gtid_pending= false;
> -
> - /*
> - Rollback will have undone any deletions of old rows we might have made
> - in mysql.gtid_slave_pos. Put those rows back on the list to be deleted.
> - */
> - pending_gtid_deletes_put_back();
> }
> m_table_map.clear_tables();
> slave_close_thread_tables(thd);
> @@ -2448,78 +2449,6 @@ rpl_group_info::unmark_start_commit()
> }
>
>
> -/*
> - When record_gtid() has deleted any old rows from the table
> - mysql.gtid_slave_pos as part of a replicated transaction, save the list of
> - rows deleted here.
> -
> - If later the transaction fails (eg. optimistic parallel replication), the
> - deletes will be undone when the transaction is rolled back. Then we can
> - put back the list of rows into the rpl_global_gtid_slave_state, so that
> - we can re-do the deletes and avoid accumulating old rows in the table.
> -*/
> -void
> -rpl_group_info::pending_gtid_deletes_save(uint32 domain_id,
> - rpl_slave_state::list_element *list)
> -{
> - /*
> - We should never get to a state where we try to save a new pending list of
> - gtid deletes while we still have an old one. But make sure we handle it
> - anyway just in case, so we avoid leaving stray entries in the
> - mysql.gtid_slave_pos table.
> - */
> - DBUG_ASSERT(!pending_gtid_delete_list);
> - if (unlikely(pending_gtid_delete_list))
> - pending_gtid_deletes_put_back();
> -
> - pending_gtid_delete_list= list;
> - pending_gtid_delete_list_domain= domain_id;
> -}
> -
> -
> -/*
> - Take the list recorded by pending_gtid_deletes_save() and put it back into
> - rpl_global_gtid_slave_state. This is needed if deletion of the rows was
> - rolled back due to transaction failure.
> -*/
> -void
> -rpl_group_info::pending_gtid_deletes_put_back()
> -{
> - if (pending_gtid_delete_list)
> - {
> - rpl_global_gtid_slave_state->put_back_list(pending_gtid_delete_list_domain,
> - pending_gtid_delete_list);
> - pending_gtid_delete_list= NULL;
> - }
> -}
> -
> -
> -/*
> - Free the list recorded by pending_gtid_deletes_save(). Done when the deletes
> - in the list have been permanently committed.
> -*/
> -void
> -rpl_group_info::pending_gtid_deletes_clear()
> -{
> - pending_gtid_deletes_free(pending_gtid_delete_list);
> - pending_gtid_delete_list= NULL;
> -}
> -
> -
> -void
> -rpl_group_info::pending_gtid_deletes_free(rpl_slave_state::list_element *list)
> -{
> - rpl_slave_state::list_element *next;
> -
> - while (list)
> - {
> - next= list->next;
> - my_free(list);
> - list= next;
> - }
> -}
> -
> -
> rpl_sql_thread_info::rpl_sql_thread_info(Rpl_filter *filter)
> : rpl_filter(filter)
> {
> diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
> index d9f0e0e5d3b..b8b153c34be 100644
> --- a/sql/rpl_rli.h
> +++ b/sql/rpl_rli.h
> @@ -757,11 +757,6 @@ struct rpl_group_info
> /* Needs room for "Gtid D-S-N\x00". */
> char gtid_info_buf[5+10+1+10+1+20+1];
>
> - /* List of not yet committed deletions in mysql.gtid_slave_pos. */
> - rpl_slave_state::list_element *pending_gtid_delete_list;
> - /* Domain associated with pending_gtid_delete_list. */
> - uint32 pending_gtid_delete_list_domain;
> -
> /*
> The timestamp, from the master, of the commit event.
> Used to do delayed update of rli->last_master_timestamp, for getting
> @@ -903,12 +898,6 @@ struct rpl_group_info
> char *gtid_info();
> void unmark_start_commit();
>
> - static void pending_gtid_deletes_free(rpl_slave_state::list_element *list);
> - void pending_gtid_deletes_save(uint32 domain_id,
> - rpl_slave_state::list_element *list);
> - void pending_gtid_deletes_put_back();
> - void pending_gtid_deletes_clear();
> -
> longlong get_row_stmt_start_timestamp()
> {
> return row_stmt_start_timestamp;
> diff --git a/sql/slave.cc b/sql/slave.cc
> index bb1300d36e6..f8499513dd6 100644
> --- a/sql/slave.cc
> +++ b/sql/slave.cc
> @@ -465,6 +465,8 @@ static struct slave_background_gtid_pos_create_t {
> void *hton;
> } *slave_background_gtid_pos_create_list;
>
> +static volatile bool slave_background_gtid_pending_delete_flag;
> +
>
> pthread_handler_t
> handle_slave_background(void *arg __attribute__((unused)))
> @@ -499,6 +501,7 @@ handle_slave_background(void *arg __attribute__((unused)))
> {
> slave_background_kill_t *kill_list;
> slave_background_gtid_pos_create_t *create_list;
> + bool pending_deletes;
>
> thd->ENTER_COND(&COND_slave_background, &LOCK_slave_background,
> &stage_slave_background_wait_request,
> @@ -508,13 +511,15 @@ handle_slave_background(void *arg __attribute__((unused)))
> stop= abort_loop || thd->killed || slave_background_thread_stop;
> kill_list= slave_background_kill_list;
> create_list= slave_background_gtid_pos_create_list;
> - if (stop || kill_list || create_list)
> + pending_deletes= slave_background_gtid_pending_delete_flag;
> + if (stop || kill_list || create_list || pending_deletes)
> break;
> mysql_cond_wait(&COND_slave_background, &LOCK_slave_background);
> }
>
> slave_background_kill_list= NULL;
> slave_background_gtid_pos_create_list= NULL;
> + slave_background_gtid_pending_delete_flag= false;
> thd->EXIT_COND(&old_stage);
>
> while (kill_list)
> @@ -541,6 +546,17 @@ handle_slave_background(void *arg __attribute__((unused)))
> create_list= next;
> }
>
> + if (pending_deletes)
> + {
> + rpl_slave_state::list_element *list;
> +
> + slave_background_gtid_pending_delete_flag= false;
> + list= rpl_global_gtid_slave_state->gtid_grab_pending_delete_list();
> + rpl_global_gtid_slave_state->gtid_delete_pending(thd, &list);
> + if (list)
> + rpl_global_gtid_slave_state->put_back_list(list);
> + }
> +
> mysql_mutex_lock(&LOCK_slave_background);
> } while (!stop);
>
> @@ -615,6 +631,23 @@ slave_background_gtid_pos_create_request(
>
>
> /*
> + Request the slave background thread to delete no longer used rows from the
> + mysql.gtid_slave_pos* tables.
> +
> + This is called from time-critical rpl_slave_state::update(), so we avoid
> + taking any locks here. This means we may race with the background thread
> + to occasionally lose a signal. This is not a problem; any pending rows to
> + be deleted will just be deleted a bit later as part of the next batch.
> +*/
> +void
> +slave_background_gtid_pending_delete_request(void)
> +{
> + slave_background_gtid_pending_delete_flag= true;
> + mysql_cond_signal(&COND_slave_background);
> +}
> +
> +
> +/*
> Start the slave background thread.
>
> This thread is currently used for two purposes:
> diff --git a/sql/slave.h b/sql/slave.h
> index 649d55b45b9..12d569b0333 100644
> --- a/sql/slave.h
> +++ b/sql/slave.h
> @@ -276,6 +276,7 @@ bool net_request_file(NET* net, const char* fname);
> void slave_background_kill_request(THD *to_kill);
> void slave_background_gtid_pos_create_request
> (rpl_slave_state::gtid_pos_table *table_entry);
> +void slave_background_gtid_pending_delete_request(void);
>
> extern bool volatile abort_loop;
> extern Master_info *active_mi; /* active_mi for multi-master */
> diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
> index 6d4c135683a..9348f4e5c98 100644
> --- a/sql/sys_vars.cc
> +++ b/sql/sys_vars.cc
> @@ -1942,6 +1942,19 @@ Sys_var_last_gtid::session_value_ptr(THD *thd, const LEX_CSTRING *base)
> }
>
>
> +static Sys_var_uint Sys_gtid_cleanup_batch_size(
> + "gtid_cleanup_batch_size",
> + "Normally does not need tuning. How many old rows must accumulate in "
> + "the mysql.gtid_slave_pos table before a background job will be run to "
> + "delete them. Can be increased to reduce number of commits if "
> + "using many different engines with --gtid_pos_auto_engines, or to "
> + "reduce CPU overhead if using a huge number of different "
> + "gtid_domain_ids. Can be decreased to reduce number of old rows in the "
> + "table.",
> + GLOBAL_VAR(opt_gtid_cleanup_batch_size), CMD_LINE(REQUIRED_ARG),
> + VALID_RANGE(0,2147483647), DEFAULT(64), BLOCK_SIZE(1));
> +
> +
> static bool
> check_slave_parallel_threads(sys_var *self, THD *thd, set_var *var)
> {
> diff --git a/storage/rocksdb/mysql-test/rocksdb_rpl/r/mdev12179.result b/storage/rocksdb/mysql-test/rocksdb_rpl/r/mdev12179.result
> index 9c20fea97ae..a1e501f78f4 100644
> --- a/storage/rocksdb/mysql-test/rocksdb_rpl/r/mdev12179.result
> +++ b/storage/rocksdb/mysql-test/rocksdb_rpl/r/mdev12179.result
> @@ -2,6 +2,7 @@ include/master-slave.inc
> [connection master]
> connection server_2;
> include/stop_slave.inc
> +SET GLOBAL gtid_cleanup_batch_size = 999999999;
> CHANGE MASTER TO master_use_gtid=slave_pos;
> SET sql_log_bin=0;
> CREATE TABLE mysql.gtid_slave_pos_innodb LIKE mysql.gtid_slave_pos;
> @@ -41,6 +42,8 @@ a
> 1
> SELECT * FROM mysql.gtid_slave_pos ORDER BY sub_id;
> domain_id sub_id server_id seq_no
> +0 1 1 1
> +0 2 1 2
> 0 3 1 3
> 0 4 1 4
> SELECT * FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> @@ -121,6 +124,21 @@ Transactions_multi_engine 6
> DELETE FROM t1 WHERE a >= 100;
> DELETE FROM t2 WHERE a >= 100;
> DELETE FROM t3 WHERE a >= 100;
> +connection server_1;
> +include/save_master_gtid.inc
> +connection server_2;
> +include/sync_with_master_gtid.inc
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos;
> +COUNT(*)>=10
> +1
> +SELECT COUNT(*)>=10 FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> +UNION ALL SELECT * FROM mysql.gtid_slave_pos_innodb_redundant) inner_select;
> +COUNT(*)>=10
> +1
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos_rocksdb;
> +COUNT(*)>=10
> +1
> +SET GLOBAL gtid_cleanup_batch_size = 3;
> connection server_2;
> include/stop_slave.inc
> SET sql_log_bin=0;
> diff --git a/storage/rocksdb/mysql-test/rocksdb_rpl/t/mdev12179.test b/storage/rocksdb/mysql-test/rocksdb_rpl/t/mdev12179.test
> index e0d16e7f242..631d9ca533f 100644
> --- a/storage/rocksdb/mysql-test/rocksdb_rpl/t/mdev12179.test
> +++ b/storage/rocksdb/mysql-test/rocksdb_rpl/t/mdev12179.test
> @@ -4,6 +4,12 @@
>
> --connection server_2
> --source include/stop_slave.inc
> +
> +# Set GTID cleanup limit high enough that cleanup will not run and we
> +# can rely on consistent table output in .result.
> +--let $old_gtid_cleanup_batch_size=`SELECT @@GLOBAL.gtid_cleanup_batch_size`
> +SET GLOBAL gtid_cleanup_batch_size = 999999999;
> +
> CHANGE MASTER TO master_use_gtid=slave_pos;
> SET sql_log_bin=0;
> CREATE TABLE mysql.gtid_slave_pos_innodb LIKE mysql.gtid_slave_pos;
> @@ -89,6 +95,82 @@ DELETE FROM t2 WHERE a >= 100;
> DELETE FROM t3 WHERE a >= 100;
>
>
> +# Create a bunch more GTIDs in mysql.gtid_slave_pos* tables to test with.
> +--connection server_1
> +--disable_query_log
> +let $i=10;
> +while ($i) {
> + eval INSERT INTO t1 VALUES (300+$i);
> + eval INSERT INTO t2 VALUES (300+$i);
> + eval INSERT INTO t3 VALUES (300+$i);
> + dec $i;
> +}
> +--enable_query_log
> +--source include/save_master_gtid.inc
> +
> +--connection server_2
> +--source include/sync_with_master_gtid.inc
> +
> +# Check that we have many rows in mysql.gtid_slave_pos now (since
> +# @@gtid_cleanup_batch_size was set to a huge value). No need to check
> +# for an exact number, since that will require changing .result if
> +# anything changes prior to this point, and we just need to know that
> +# we have still have some data in the tables to make the following
> +# test effective.
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos;
> +SELECT COUNT(*)>=10 FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> + UNION ALL SELECT * FROM mysql.gtid_slave_pos_innodb_redundant) inner_select;
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos_rocksdb;
> +
> +# Check that old GTID rows will be deleted when batch delete size is
> +# set reasonably. Old row deletion is not 100% deterministic (by design), so
> +# we must wait for it to occur, but it should occur eventually.
> +SET GLOBAL gtid_cleanup_batch_size = 3;
> +let $i=40;
> +--disable_query_log
> +--let $keep_include_silent=1
> +while ($i) {
> + let N=`SELECT 1+($i MOD 3)`;
> + --connection server_1
> + eval UPDATE t$N SET a=a+1 WHERE a=(SELECT MAX(a) FROM t$N);
> + --source include/save_master_gtid.inc
> + --connection server_2
> + --source include/sync_with_master_gtid.inc
> + let $j=50;
> + while ($j) {
> + let $is_done=`SELECT SUM(a)=1 FROM (
> + SELECT COUNT(*) AS a FROM mysql.gtid_slave_pos
> + UNION ALL
> + SELECT COUNT(*) AS a FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> + UNION ALL SELECT * FROM mysql.gtid_slave_pos_innodb_redundant) inner_select
> + UNION ALL
> + SELECT COUNT(*) AS a FROM mysql.gtid_slave_pos_rocksdb) outer_select`;
> + if ($is_done) {
> + let $j=0;
> + }
> + if (!$is_done) {
> + real_sleep 0.1;
> + dec $j;
> + }
> + }
> + dec $i;
> + if ($is_done) {
> + let $i=0;
> + }
> +}
> +--enable_query_log
> +--let $keep_include_silent=0
> +if (!$is_done) {
> + --echo Timed out waiting for mysql.gtid_slave_pos* tables to be cleaned up
> +}
> +
> +--disable_query_log
> +DELETE FROM t1 WHERE a >= 100;
> +DELETE FROM t2 WHERE a >= 100;
> +DELETE FROM t3 WHERE a >= 100;
> +--enable_query_log
> +
> +
> # Test status variables Rpl_transactions_multi_engine and Transactions_gtid_foreign_engine.
> # Have mysql.gtid_slave_pos* for myisam and innodb but not rocksdb.
> --connection server_2
> @@ -223,6 +305,9 @@ SHOW STATUS LIKE "%transactions%engine";
> SET sql_log_bin=0;
> DROP TABLE mysql.gtid_slave_pos_innodb;
> SET sql_log_bin=1;
> +--disable_query_log
> +eval SET GLOBAL gtid_cleanup_batch_size = $old_gtid_cleanup_batch_size;
> +--enable_query_log
>
> --connection server_1
> DROP TABLE t1;
> diff --git a/storage/tokudb/mysql-test/tokudb_rpl/r/mdev12179.result b/storage/tokudb/mysql-test/tokudb_rpl/r/mdev12179.result
> index d4532eec4e2..d79e7e59aa4 100644
> --- a/storage/tokudb/mysql-test/tokudb_rpl/r/mdev12179.result
> +++ b/storage/tokudb/mysql-test/tokudb_rpl/r/mdev12179.result
> @@ -2,6 +2,7 @@ include/master-slave.inc
> [connection master]
> connection server_2;
> include/stop_slave.inc
> +SET GLOBAL gtid_cleanup_batch_size = 999999999;
> CHANGE MASTER TO master_use_gtid=slave_pos;
> SET sql_log_bin=0;
> CREATE TABLE mysql.gtid_slave_pos_innodb LIKE mysql.gtid_slave_pos;
> @@ -41,6 +42,8 @@ a
> 1
> SELECT * FROM mysql.gtid_slave_pos ORDER BY sub_id;
> domain_id sub_id server_id seq_no
> +0 1 1 1
> +0 2 1 2
> 0 3 1 3
> 0 4 1 4
> SELECT * FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> @@ -121,6 +124,21 @@ Transactions_multi_engine 6
> DELETE FROM t1 WHERE a >= 100;
> DELETE FROM t2 WHERE a >= 100;
> DELETE FROM t3 WHERE a >= 100;
> +connection server_1;
> +include/save_master_gtid.inc
> +connection server_2;
> +include/sync_with_master_gtid.inc
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos;
> +COUNT(*)>=10
> +1
> +SELECT COUNT(*)>=10 FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> +UNION ALL SELECT * FROM mysql.gtid_slave_pos_innodb_redundant) inner_select;
> +COUNT(*)>=10
> +1
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos_tokudb;
> +COUNT(*)>=10
> +1
> +SET GLOBAL gtid_cleanup_batch_size = 3;
> connection server_2;
> include/stop_slave.inc
> SET sql_log_bin=0;
> diff --git a/storage/tokudb/mysql-test/tokudb_rpl/t/mdev12179.test b/storage/tokudb/mysql-test/tokudb_rpl/t/mdev12179.test
> index ceb119cd0dc..1d19a25889e 100644
> --- a/storage/tokudb/mysql-test/tokudb_rpl/t/mdev12179.test
> +++ b/storage/tokudb/mysql-test/tokudb_rpl/t/mdev12179.test
> @@ -4,6 +4,12 @@
>
> --connection server_2
> --source include/stop_slave.inc
> +
> +# Set GTID cleanup limit high enough that cleanup will not run and we
> +# can rely on consistent table output in .result.
> +--let $old_gtid_cleanup_batch_size=`SELECT @@GLOBAL.gtid_cleanup_batch_size`
> +SET GLOBAL gtid_cleanup_batch_size = 999999999;
> +
> CHANGE MASTER TO master_use_gtid=slave_pos;
> SET sql_log_bin=0;
> CREATE TABLE mysql.gtid_slave_pos_innodb LIKE mysql.gtid_slave_pos;
> @@ -89,6 +95,82 @@ DELETE FROM t2 WHERE a >= 100;
> DELETE FROM t3 WHERE a >= 100;
>
>
> +# Create a bunch more GTIDs in mysql.gtid_slave_pos* tables to test with.
> +--connection server_1
> +--disable_query_log
> +let $i=10;
> +while ($i) {
> + eval INSERT INTO t1 VALUES (300+$i);
> + eval INSERT INTO t2 VALUES (300+$i);
> + eval INSERT INTO t3 VALUES (300+$i);
> + dec $i;
> +}
> +--enable_query_log
> +--source include/save_master_gtid.inc
> +
> +--connection server_2
> +--source include/sync_with_master_gtid.inc
> +
> +# Check that we have many rows in mysql.gtid_slave_pos now (since
> +# @@gtid_cleanup_batch_size was set to a huge value). No need to check
> +# for an exact number, since that will require changing .result if
> +# anything changes prior to this point, and we just need to know that
> +# we have still have some data in the tables to make the following
> +# test effective.
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos;
> +SELECT COUNT(*)>=10 FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> + UNION ALL SELECT * FROM mysql.gtid_slave_pos_innodb_redundant) inner_select;
> +SELECT COUNT(*)>=10 FROM mysql.gtid_slave_pos_tokudb;
> +
> +# Check that old GTID rows will be deleted when batch delete size is
> +# set reasonably. Old row deletion is not 100% deterministic (by design), so
> +# we must wait for it to occur, but it should occur eventually.
> +SET GLOBAL gtid_cleanup_batch_size = 3;
> +let $i=40;
> +--disable_query_log
> +--let $keep_include_silent=1
> +while ($i) {
> + let N=`SELECT 1+($i MOD 3)`;
> + --connection server_1
> + eval UPDATE t$N SET a=a+1 WHERE a=(SELECT MAX(a) FROM t$N);
> + --source include/save_master_gtid.inc
> + --connection server_2
> + --source include/sync_with_master_gtid.inc
> + let $j=50;
> + while ($j) {
> + let $is_done=`SELECT SUM(a)=1 FROM (
> + SELECT COUNT(*) AS a FROM mysql.gtid_slave_pos
> + UNION ALL
> + SELECT COUNT(*) AS a FROM ( SELECT * FROM mysql.gtid_slave_pos_innodb
> + UNION ALL SELECT * FROM mysql.gtid_slave_pos_innodb_redundant) inner_select
> + UNION ALL
> + SELECT COUNT(*) AS a FROM mysql.gtid_slave_pos_tokudb) outer_select`;
> + if ($is_done) {
> + let $j=0;
> + }
> + if (!$is_done) {
> + real_sleep 0.1;
> + dec $j;
> + }
> + }
> + dec $i;
> + if ($is_done) {
> + let $i=0;
> + }
> +}
> +--enable_query_log
> +--let $keep_include_silent=0
> +if (!$is_done) {
> + --echo Timed out waiting for mysql.gtid_slave_pos* tables to be cleaned up
> +}
> +
> +--disable_query_log
> +DELETE FROM t1 WHERE a >= 100;
> +DELETE FROM t2 WHERE a >= 100;
> +DELETE FROM t3 WHERE a >= 100;
> +--enable_query_log
> +
> +
> # Test status variables Rpl_transactions_multi_engine and Transactions_gtid_foreign_engine.
> # Have mysql.gtid_slave_pos* for myisam and innodb but not tokudb.
> --connection server_2
> @@ -223,6 +305,9 @@ SHOW STATUS LIKE "%transactions%engine";
> SET sql_log_bin=0;
> DROP TABLE mysql.gtid_slave_pos_innodb;
> SET sql_log_bin=1;
> +--disable_query_log
> +eval SET GLOBAL gtid_cleanup_batch_size = $old_gtid_cleanup_batch_size;
> +--enable_query_log
>
> --connection server_1
> DROP TABLE t1;
1
0