Hi, Aleksey,
On Nov 07, Aleksey Midenkov wrote:
> revision-id: 2342aaf46eb (mariadb-11.0.1-256-g2342aaf46eb)
> parent(s): 80bc5df1c80
> author: Aleksey Midenkov
> committer: Aleksey Midenkov
> timestamp: 2023-10-10 13:11:19 +0300
> message:
>
> MDEV-30973 Implement debug logging for slave_transaction_retries
>
> Configuration options:
>
> log_slave_retries
> log_slave_retries_max
1.
a commit comment of >100 lines that's a bit too much, I think.
2.
comments in MDEV-30973 say that one should be able to have everything in
one file, namely in the error log. I don't see how it's possible in your
commit.
3.
one of the reasons for not simply writing into erorr log was
"you get what are relatively large entries in there which can make it
hard/annoying to also look for the normal, one-line warning and error
output". But as far as I can see your slave retry entries are not large,
but normal one-line messages. I'd say they can fit into the error log
just fine.
I've still commented below in the assumption that the separate log
implementation will stay. But I honestly don't see a reason for it
anymore.
> diff --git a/mysql-test/suite/rpl/t/slave_deadlock.test b/mysql-test/suite/rpl/t/slave_deadlock.test
> new file mode 100644
> index 00000000000..9fcda7c9b71
> --- /dev/null
> +++ b/mysql-test/suite/rpl/t/slave_deadlock.test
> @@ -0,0 +1,194 @@
> +--source include/have_debug.inc
> +--source include/have_innodb.inc
> +--source include/have_binlog_format_statement.inc
> +
> +# On --repeat test infinitely increments thread_id and finally fails --sync_with_master at log_pos 1754:
> +#
> +# 231009 20:58:15 server id 1 end_log_pos 1754 CRC32 0x7e5aecd1 Query thread_id=561 exec_time=0 error_code=0 xid=0
I'm sorry, I don't understand that. Could you elaborate please?
> +#
> +# force_restart seems to fix this
> +
> +--source include/force_restart.inc
> +
> +# Note that test requires big enough --debug-sync-timeout unfair scheduling at high load.
Sorry, cannot parse that. Could you elaborate please?
> +# slave_deadlock.opt sets this to 1 hour
> +
> +--let $rpl_topology=1->2
> +--source include/rpl_init.inc
> +
> +--echo #
> +--echo # MDEV-30973 Enhance output of innodb deadlocks (SAMU-124, SAMU-131)
> +--echo #
> +--connection server_2
> +SET @old_dbug= @@global.debug_dbug;
> +set @old_parallel_threads= @@global.slave_parallel_threads;
> +set @old_domain_parallel_threads= @@global.slave_domain_parallel_threads;
> +set @old_parallel_mode= @@global.slave_parallel_mode;
> +set @old_slave_transaction_retries= @@global.slave_transaction_retries;
> +set @old_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
> +set @old_log_slave_retries= @@global.log_slave_retries;
> +set @old_log_slave_retries_file= @@global.log_slave_retries_file;
> +set @old_log_slave_retries_max= @@global.log_slave_retries_max;
All this makes little sense, it's simpler to put all settings into the .opt
file
generally it's better to do your way, to have less
server restarts, but you have an .opt file already, you can as well
use it.
except if you decide to remove the .opt file - then nevermind, the
above it good.
> +--source include/stop_slave.inc
> +change master to master_use_gtid= slave_pos;
> +set global slave_parallel_threads= 10;
> +set global slave_domain_parallel_threads= 1;
> +set global slave_parallel_mode= optimistic;
> +set global slave_transaction_retries= 4;
> +set global innodb_lock_wait_timeout= 1;
...
> diff --git a/mysql-test/suite/versioning/r/rpl_row.result b/mysql-test/suite/versioning/r/rpl_row.result
> index c161f68d0bd..6343fbecdec 100644
> --- a/mysql-test/suite/versioning/r/rpl_row.result
> +++ b/mysql-test/suite/versioning/r/rpl_row.result
> @@ -37,12 +37,7 @@ select *, unix_timestamp(row_start) as row_start, unix_timestamp(row_end) as row
> x y row_start row_end
> 1 1 12345.000000 2147483647.999999
> set timestamp= default;
> -### INSERT INTO `test`.`t1`
> -### SET
> -### @1=1
> -### @2=1
> -### @3=12345.000000
> -### @4=2147483647.999999
why?
> +/*!50530 SET @@SESSION.PSEUDO_SLAVE_MODE=1*/;
> connection slave;
> select * from t1;
> x y
> diff --git a/sql/log.cc b/sql/log.cc
> index 877315c60de..78e874fda5b 100644
> --- a/sql/log.cc
> +++ b/sql/log.cc
> @@ -1188,9 +1188,17 @@ bool Log_to_file_event_handler::
> log_error(enum loglevel level, const char *format,
> va_list args)
> {
> - return vprint_msg_to_log(level, format, args);
> + return vprint_msg_to_log(stderr, level, format, args);
> }
>
> +#ifdef HAVE_REPLICATION
> +bool Log_to_file_event_handler::log_slave_retry(const char *format, va_list args)
> +{
> + DBUG_ASSERT(slave_retries_file.get());
> + return vprint_msg_to_log(slave_retries_file.get(), NO_LEVEL, format, args);
I don't think you should use vprint_msg_to_log and add a NO_LEVEL hack.
vprint_msg_to_log also prints into the windows event log, into
wsrep json log, and it locks LOCK_error_log. You don't need any of that,
just create your own function, it's just one fprintf() anyway.
> +}
> +#endif
> +
> void Log_to_file_event_handler::init_pthread_objects()
> {
> mysql_log.init_pthread_objects();
> @@ -9752,13 +9787,15 @@ static void print_buffer_to_file(enum loglevel level, const char *buffer,
> }
> }
>
> +#ifndef EMBEDDED_LIBRARY
> mysql_mutex_lock(&LOCK_error_log);
> +#endif
>
> skr= my_time(0);
> localtime_r(&skr, &tm_tmp);
> start=&tm_tmp;
>
> - fprintf(stderr, "%d-%02d-%02d %2d:%02d:%02d %lu [%s] %.*s%.*s\n",
> + fprintf(file, "%d-%02d-%02d %2d:%02d:%02d [T%lu] %s%.*s%.*s\n",
eh... no. don't change the format of existing log files, please.
> start->tm_year + 1900,
> start->tm_mon+1,
> start->tm_mday,
> @@ -9834,6 +9872,22 @@ void sql_print_error(const char *format, ...)
> }
>
>
> +#ifdef HAVE_REPLICATION
> +void sql_print_error2(const char *format, ...)
s/sql_print_error2/print_slave_error/ ?
> +{
> + va_list args;
> + DBUG_ENTER("sql_print_error");
> +
> + va_start(args, format);
> + error_log_print(ERROR_LEVEL, format, args);
> + slave_retries_print(format, args);
> + va_end(args);
> +
> + DBUG_VOID_RETURN;
> +}
> +#endif
> +
> +
> void sql_print_warning(const char *format, ...)
> {
> va_list args;
> diff --git a/sql/mysqld.cc b/sql/mysqld.cc
> index 8cc3ad51ee9..4f6b2273597 100644
> --- a/sql/mysqld.cc
> +++ b/sql/mysqld.cc
> @@ -765,6 +765,90 @@ char *opt_relay_logname = 0, *opt_relaylog_index_name=0;
> char *opt_logname, *opt_slow_logname, *opt_bin_logname;
> char *opt_binlog_index_name=0;
>
> +#ifdef HAVE_REPLICATION
> +char default_slave_retries_path[FN_REFLEN];
> +bool opt_slave_retries= false;
> +char *opt_slave_retries_path= NULL;
> +uint opt_slave_retries_max_log= 0;
> +#endif
> +Slave_retries_file slave_retries_file;
> +
> +bool Slave_retries_file::open()
> +{
> +#ifdef HAVE_REPLICATION
> + DBUG_ASSERT(opt_slave_retries);
> + DBUG_ASSERT(opt_slave_retries_path);
> + file= fopen(opt_slave_retries_path, "a");
my_fopen or my_open
and anyway, why a new class, why not to use MYSQL_LOG like general and slow
logs do?
> + if (!file)
> + {
> + sql_print_error("Open of log_slave_retries '%s' failed: %s (%d)",
> + opt_slave_retries_path, strerror(errno), errno);
This is inconsistent, the server doesn't print this for other log files.
I've just tried with general log and got
2023-11-06 17:58:19 0 [ERROR] mysqld: File '/tmp' not found (Errcode: 21 "Is a directory")
2023-11-06 17:58:19 0 [ERROR] Could not use /tmp for logging (error 21). Turning logging off for the whole duration of the MariaDB server process. To turn it on again: fix the cause, shutdown the MariaDB server and restart it.
Please, do the same
> + return true;
> + }
> + else
> + {
> + slave_retries_print("%s (mariadbd %s) starting as process %lu ...",
> + my_progname, server_version, (ulong) getpid());
Any reason, why it shouldn't be the same as in the error log?
sql_print_information("Starting MariaDB %s source revision %s as process %lu",
server_version, SOURCE_REVISION, (ulong) getpid());
> + }
> +#endif
> + return false;
> +}
> +
> +void Slave_retries_file::close()
> +{
> +#ifdef HAVE_REPLICATION
> + if (file)
> + {
> + acquire();
> + fclose(file);
> + release();
> + }
> + file= NULL;
> +#endif
> +}
> +
> +void Slave_retries_file::init()
> +{
> +#ifdef HAVE_REPLICATION
> + mysql_mutex_init(key_LOCK_global_system_variables, &mutex, MY_MUTEX_INIT_FAST);
> +#endif
> +}
> +
> +void Slave_retries_file::destroy()
> +{
> +#ifdef HAVE_REPLICATION
> + mysql_mutex_destroy(&mutex);
> +#endif
> +}
> +
> +bool Slave_retries_file::acquire()
> +{
> +#ifdef HAVE_REPLICATION
> + if (!file)
> + return false;
> + mysql_mutex_lock(&mutex);
> + return true;
> +#else
> + return false;
> +#endif
> +}
> +
> +void Slave_retries_file::release()
> +{
> +#ifdef HAVE_REPLICATION
> + mysql_mutex_unlock(&mutex);
> +#endif
> +}
> +
> +void Slave_retries_file::flush()
> +{
> + if (acquire())
> + {
> + fflush(file);
> + release();
> + }
> +}
> +
> /* Static variables */
>
> my_bool opt_stack_trace;
> @@ -5055,6 +5145,35 @@ static int init_server_components()
> "this server. However this will be ignored as the "
> "--log-bin option is not defined.");
> }
> +
> + slave_retries_file.init();
> + if (!log_error_file_ptr[0])
> + fn_format(default_slave_retries_path, pidfile_name, mysql_data_home,
> + "-retries.err", MY_REPLACE_EXT);
> + else
> + fn_format(default_slave_retries_path, log_error_file_ptr, mysql_data_home,
> + "-retries.err", MY_REPLACE_EXT);
that's not how all other log names work,
grep mysqld.cc for opt_slow_logname
> + if (opt_slave_retries_path && !opt_abort)
> + {
> + if (!opt_slave_retries_path[0])
> + {
> + if (!default_slave_retries_path[0])
> + {
> + /* fn_format failed */
> + opt_slave_retries_path= NULL;
> + opt_slave_retries= false;
> + }
> + else
> + {
> + opt_slave_retries_path= my_strdup(key_memory_Sys_var_charptr_value,
> + default_slave_retries_path, MYF(0));
> + }
> + }
> +
> + if (opt_slave_retries &&
> + slave_retries_file.open())
> + unireg_abort(1);
> + }
> #endif
>
> if (opt_bin_log)
> diff --git a/sql/mysqld.h b/sql/mysqld.h
> index 69cfc0cad56..5f1561ca10d 100644
> --- a/sql/mysqld.h
> +++ b/sql/mysqld.h
> @@ -822,6 +848,8 @@ enum options_mysqld
> OPT_KEY_CACHE_CHANGED_BLOCKS_HASH_SIZE,
> OPT_LOG_BASENAME,
> OPT_LOG_ERROR,
> + OPT_SLAVE_RETRIES,
> + OPT_SLAVE_RETRIES_LOG,
why do you need them?
> OPT_LOWER_CASE_TABLE_NAMES,
> OPT_PLUGIN_LOAD,
> OPT_PLUGIN_LOAD_ADD,
> diff --git a/sql/sql_show.cc b/sql/sql_show.cc
> index d742bc6c1c3..c53ca0f0097 100644
> --- a/sql/sql_show.cc
> +++ b/sql/sql_show.cc
> @@ -10662,6 +10663,15 @@ char *thd_get_error_context_description(THD *thd, char *buffer,
> /* Don't wait if LOCK_thd_data is used as this could cause a deadlock */
> if (!mysql_mutex_trylock(&thd->LOCK_thd_data))
> {
> + if (thd->rgi_slave)
is rgi_slave and its content protected by thd->LOCK_thd_data ?
> + {
> + rpl_group_info *rgi= thd->rgi_slave;
> + len= my_snprintf(header, sizeof(header),
> + ", GTID %u-%u-%llu", rgi->current_gtid.domain_id,
> + rgi->current_gtid.server_id, rgi->current_gtid.seq_no);
> + str.append(header, len);
> + }
> +
> if (const char *info= thread_state_info(thd))
> {
> str.append(' ');
> diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
> index ebb64b340b0..11c321f06dc 100644
> --- a/sql/sys_vars.cc
> +++ b/sql/sys_vars.cc
> @@ -5862,6 +5862,61 @@ Sys_slave_trans_retry_interval(
> "slave_transaction_retry_errors",
> GLOBAL_VAR(slave_trans_retry_interval), CMD_LINE(REQUIRED_ARG),
> VALID_RANGE(0, 3600), DEFAULT(0), BLOCK_SIZE(1));
> +
> +static bool update_slave_retries_log(sys_var *self, THD* thd, enum_var_type type)
> +{
> + if (opt_slave_retries)
> + {
> + if (!slave_retries_file.get() && slave_retries_file.open())
> + {
> + my_error(ER_CANT_OPEN_FILE, MYF(0), opt_slave_retries_path, errno);
> + opt_slave_retries= false;
> + return true;
> + }
> + }
> + else
> + slave_retries_file.close();
> + return false;
> +}
> +
> +static Sys_var_mybool Sys_slave_retries_log(
> + "log_slave_retries", "Log transaction retries",
> + GLOBAL_VAR(opt_slave_retries),
> + CMD_LINE(OPT_ARG, OPT_SLAVE_RETRIES), DEFAULT(false),
> + NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_slave_retries_log));
> +
> +
> +static bool check_slave_retries_file(sys_var *self, THD *thd, set_var *var)
> +{
> + if (opt_slave_retries)
> + {
> + my_error(ER_NOT_SUPPORTED_YET, MYF(0), "modifying while log_slave_retries is ON");
as always, please don't hard-code error messages.
you can use
my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "log_slave_retries=ON");
> + return true;
> + }
> + return false;
> +}
> +
> +static bool update_slave_retries_file(sys_var *self, THD* thd, enum_var_type type)
> +{
> + if (!opt_slave_retries_path)
> + opt_slave_retries_path= my_strdup(key_memory_Sys_var_charptr_value,
> + default_slave_retries_path, MYF(0));
sys_var.cc can handle that for you, see PREALLOCATED
in e.g. opt_slow_logname
also, you don't reopen the log when its name changes, how comes?
> + return false;
> +}
> +
> +static Sys_var_charptr Sys_slave_retries_log_path(
> + "log_slave_retries_file", "Log file path for transaction retries",
> + GLOBAL_VAR(opt_slave_retries_path),
> + CMD_LINE(REQUIRED_ARG, OPT_SLAVE_RETRIES_LOG), DEFAULT(NULL),
> + NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_slave_retries_file),
> + ON_UPDATE(update_slave_retries_file));
> +
> +
> +static Sys_var_uint Sys_slave_retries_log_max(
> + "log_slave_retries_max", "Log max N transaction retries "
> + "(default 0 means infinity)",
just "(0 means infinity)". Default is printed separately
> + GLOBAL_VAR(opt_slave_retries_max_log), CMD_LINE(REQUIRED_ARG),
> + VALID_RANGE(0, UINT_MAX), DEFAULT(0), BLOCK_SIZE(1));
> #endif
>
> static bool check_locale(sys_var *self, THD *thd, set_var *var)
> diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
> index 0e9cef669d4..8a8eeaf39f6 100644
> --- a/storage/innobase/lock/lock0lock.cc
> +++ b/storage/innobase/lock/lock0lock.cc
> @@ -49,6 +49,7 @@ Created 5/7/1996 Heikki Tuuri
> #include <debug_sync.h>
>
> #include <set>
> +#include <stdio.h>
I presume Marko has reviewed (or will review) InnoDB changes
>
> #ifdef WITH_WSREP
> #include <mysql/service_wsrep.h>
> diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result
> index 77d6d10b3e6..274c034d455 100644
> --- a/mysql-test/main/mysqld--help.result
> +++ b/mysql-test/main/mysqld--help.result
> @@ -1673,6 +1678,9 @@ log-isam myisam.log
> log-output FILE
> log-queries-not-using-indexes FALSE
> log-short-format FALSE
> +log-slave-retries FALSE
> +log-slave-retries-file (No default value)
this is another sign that log-slave-retries-file doesn't behave like
other log filename variables.
> +log-slave-retries-max 0
> log-slave-updates FALSE
> log-slow-admin-statements TRUE
> log-slow-disabled-statements sp
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org