From: poempeng <poempeng@tencent.com> During the commit stage of a large transaction, it may cost too much time to write binlog cache and block all subsequent transactions for a long time. One possible solution of this issue is to write the binlog of the large transaction to a temporary file and then rename it to the next new binlog file. We need to reserve some space for format description event, gtid list event, binlog checkpoint event and gtid event. After writing the binlog cache, we lock LOCK_log and write the aforementioned events to the reserved space. The default size of reserved space is 10MB, any unusable space is filled with a skip event(Ignorable_log_event). If any error occurs before acquiring LOCK_log, we fallback to the original commit path. However, if an error occurs after acquiring LOCK_log, we abort the server. New variable: non_blocking_binlog_threshold, if the size of binlog cache exceeds this value, try using non-blocking binlog. Limitations: Binlog encryption, wsrep transaction, stmt cache, incident event and slave thread are not supported. --- mysql-test/main/mysqld--help.result | 5 + .../binlog/r/non_blocking_binlog_basic.result | 42 +++ .../binlog/r/non_blocking_binlog_error.result | 36 ++ .../binlog/t/non_blocking_binlog_basic.test | 31 ++ .../binlog/t/non_blocking_binlog_error.test | 49 +++ .../perfschema/r/dml_setup_instruments.result | 2 +- .../rpl/r/rpl_non_blocking_binlog.result | 22 ++ .../suite/rpl/t/rpl_non_blocking_binlog.test | 31 ++ .../r/sysvars_server_notembedded.result | 10 + sql/log.cc | 354 +++++++++++++++++- sql/log.h | 30 +- sql/mysqld.cc | 9 +- sql/mysqld.h | 4 +- sql/sys_vars.cc | 8 + 14 files changed, 615 insertions(+), 18 deletions(-) create mode 100644 mysql-test/suite/binlog/r/non_blocking_binlog_basic.result create mode 100644 mysql-test/suite/binlog/r/non_blocking_binlog_error.result create mode 100644 mysql-test/suite/binlog/t/non_blocking_binlog_basic.test create mode 100644 mysql-test/suite/binlog/t/non_blocking_binlog_error.test create mode 100644 mysql-test/suite/rpl/r/rpl_non_blocking_binlog.result create mode 100644 mysql-test/suite/rpl/t/rpl_non_blocking_binlog.test diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result index 431c37c25ed..d652eacadb0 100644 --- a/mysql-test/main/mysqld--help.result +++ b/mysql-test/main/mysqld--help.result @@ -735,6 +735,10 @@ The following specify which files/extra groups are read (specified before remain --net-write-timeout=# Number of seconds to wait for a block to be written to a connection before aborting the write + --non-blocking-binlog-threshold[=#] + If the binlog size of a transaction exceeds this value, + we write it to a new temporary file and rename it to the + next binlog file. --note-verbosity=name Verbosity level for note-warnings given to the user. See also @@sql_notes.. Any combination of: basic, @@ -1797,6 +1801,7 @@ net-buffer-length 16384 net-read-timeout 30 net-retry-count 10 net-write-timeout 60 +non-blocking-binlog-threshold 18446744073709551615 note-verbosity basic,explain old FALSE old-mode UTF8_IS_UTF8MB3 diff --git a/mysql-test/suite/binlog/r/non_blocking_binlog_basic.result b/mysql-test/suite/binlog/r/non_blocking_binlog_basic.result new file mode 100644 index 00000000000..dfbf6721f89 --- /dev/null +++ b/mysql-test/suite/binlog/r/non_blocking_binlog_basic.result @@ -0,0 +1,42 @@ +set global non_blocking_binlog_threshold=1; +Warnings: +Warning 1292 Truncated incorrect non_blocking_binlog_threshold value: '1' +show variables like 'non_blocking_binlog_threshold'; +Variable_name Value +non_blocking_binlog_threshold 268435456 +set global non_blocking_binlog_threshold=256*1024*1024; +show variables like 'non_blocking_binlog_threshold'; +Variable_name Value +non_blocking_binlog_threshold 268435456 +set global non_blocking_binlog_threshold=default; +show variables like 'non_blocking_binlog_threshold'; +Variable_name Value +non_blocking_binlog_threshold 18446744073709551615 +reset master; +create table t1(id int, c1 int) engine=innodb; +insert into t1 values(1,1); +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +update t1 set c1=2; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +insert into t1 values(3,3); +show master status; +File Position Binlog_Do_DB Binlog_Ignore_DB +master-bin.000002 # +show binlog events in 'master-bin.000002'; +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000002 # Format_desc # # # +master-bin.000002 # Gtid_list # # # +master-bin.000002 # Binlog_checkpoint # # # +master-bin.000002 # Gtid # # # +master-bin.000002 # Ignorable log event # # # +master-bin.000002 # Annotate_rows # # # +master-bin.000002 # Table_map # # # +master-bin.000002 # Update_rows_v1 # # # +master-bin.000002 # Xid # # # +master-bin.000002 # Gtid # # # +master-bin.000002 # Annotate_rows # # # +master-bin.000002 # Table_map # # # +master-bin.000002 # Write_rows_v1 # # # +master-bin.000002 # Xid # # # +drop table t1; +include/assert_grep.inc [write non-blocking binlog succeed should be logged] diff --git a/mysql-test/suite/binlog/r/non_blocking_binlog_error.result b/mysql-test/suite/binlog/r/non_blocking_binlog_error.result new file mode 100644 index 00000000000..536c80e04b4 --- /dev/null +++ b/mysql-test/suite/binlog/r/non_blocking_binlog_error.result @@ -0,0 +1,36 @@ +call mtr.add_suppression(".*"); +reset master; +create table t1(id int, c1 int) engine=innodb; +insert into t1 values(1,1); +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='+d,non_blocking_binlog_open_file_failed'; +update t1 set c1=2; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='-d,non_blocking_binlog_open_file_failed'; +insert into t1 values(3,3); +show master status; +File Position Binlog_Do_DB Binlog_Ignore_DB +master-bin.000001 # +include/assert_grep.inc [Open file failed should be logged] +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='+d,non_blocking_binlog_write_cache_failed'; +update t1 set c1=3; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='-d,non_blocking_binlog_write_cache_failed'; +insert into t1 values(4,4); +show master status; +File Position Binlog_Do_DB Binlog_Ignore_DB +master-bin.000001 # +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='+d,non_blocking_binlog_write_gtid_failed'; +update t1 set c1=5; +ERROR HY000: Lost connection to server during query +show master status; +File Position Binlog_Do_DB Binlog_Ignore_DB +master-bin.000002 # +select * from t1; +id c1 +1 3 +3 3 +4 4 +drop table t1; diff --git a/mysql-test/suite/binlog/t/non_blocking_binlog_basic.test b/mysql-test/suite/binlog/t/non_blocking_binlog_basic.test new file mode 100644 index 00000000000..2964f921698 --- /dev/null +++ b/mysql-test/suite/binlog/t/non_blocking_binlog_basic.test @@ -0,0 +1,31 @@ +--source include/have_debug.inc +--source include/have_innodb.inc +--source include/have_binlog_format_row.inc +--source include/not_embedded.inc + +set global non_blocking_binlog_threshold=1; +show variables like 'non_blocking_binlog_threshold'; +set global non_blocking_binlog_threshold=256*1024*1024; +show variables like 'non_blocking_binlog_threshold'; +set global non_blocking_binlog_threshold=default; +show variables like 'non_blocking_binlog_threshold'; + +reset master; +create table t1(id int, c1 int) engine=innodb; +insert into t1 values(1,1); +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +update t1 set c1=2; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +insert into t1 values(3,3); +--replace_column 2 # +show master status; +--replace_column 2 # 4 # 5 # 6 # +show binlog events in 'master-bin.000002'; +drop table t1; + +--let $assert_count = 1 +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.1.err +--let $assert_text = write non-blocking binlog succeed should be logged +--let $assert_select = write non-blocking binlog succeed +--let $assert_only_after = CURRENT_TEST +--source include/assert_grep.inc diff --git a/mysql-test/suite/binlog/t/non_blocking_binlog_error.test b/mysql-test/suite/binlog/t/non_blocking_binlog_error.test new file mode 100644 index 00000000000..6699006bffb --- /dev/null +++ b/mysql-test/suite/binlog/t/non_blocking_binlog_error.test @@ -0,0 +1,49 @@ +--source include/have_debug.inc +--source include/have_innodb.inc +--source include/have_binlog_format_row.inc +--source include/not_valgrind.inc +--source include/not_embedded.inc + +call mtr.add_suppression(".*"); + +reset master; +create table t1(id int, c1 int) engine=innodb; +insert into t1 values(1,1); +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='+d,non_blocking_binlog_open_file_failed'; +update t1 set c1=2; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='-d,non_blocking_binlog_open_file_failed'; +insert into t1 values(3,3); +--replace_column 2 # +show master status; + +--let $assert_count = 1 +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.1.err +--let $assert_text = Open file failed should be logged +--let $assert_select = Open file .* failed +--let $assert_only_after = CURRENT_TEST +--source include/assert_grep.inc + +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='+d,non_blocking_binlog_write_cache_failed'; +update t1 set c1=3; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='-d,non_blocking_binlog_write_cache_failed'; +insert into t1 values(4,4); +--replace_column 2 # +show master status; + +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +set debug_dbug='+d,non_blocking_binlog_write_gtid_failed'; +--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect +--error 2013 +update t1 set c1=5; +--enable_reconnect +--source include/wait_until_connected_again.inc +--replace_column 2 # +show master status; +# c1 should not be 5 +select * from t1; + +drop table t1; diff --git a/mysql-test/suite/perfschema/r/dml_setup_instruments.result b/mysql-test/suite/perfschema/r/dml_setup_instruments.result index ff000a09312..95d8bfb882a 100644 --- a/mysql-test/suite/perfschema/r/dml_setup_instruments.result +++ b/mysql-test/suite/perfschema/r/dml_setup_instruments.result @@ -21,6 +21,7 @@ where name like 'Wait/Synch/Rwlock/sql/%' 'wait/synch/rwlock/sql/LOCK_named_pipe_full_access_group') order by name limit 10; NAME ENABLED TIMED +wait/synch/rwlock/sql/binlog_checksum_rwlock YES YES wait/synch/rwlock/sql/LOCK_all_status_vars YES YES wait/synch/rwlock/sql/LOCK_dbnames YES YES wait/synch/rwlock/sql/LOCK_dboptions YES YES @@ -30,7 +31,6 @@ wait/synch/rwlock/sql/LOCK_SEQUENCE YES YES wait/synch/rwlock/sql/LOCK_ssl_refresh YES YES wait/synch/rwlock/sql/LOCK_system_variables_hash YES YES wait/synch/rwlock/sql/LOCK_sys_init_connect YES YES -wait/synch/rwlock/sql/LOCK_sys_init_slave YES YES select * from performance_schema.setup_instruments where name like 'Wait/Synch/Cond/sql/%' and name not in ( diff --git a/mysql-test/suite/rpl/r/rpl_non_blocking_binlog.result b/mysql-test/suite/rpl/r/rpl_non_blocking_binlog.result new file mode 100644 index 00000000000..c130848bfe9 --- /dev/null +++ b/mysql-test/suite/rpl/r/rpl_non_blocking_binlog.result @@ -0,0 +1,22 @@ +include/master-slave.inc +[connection master] +connection master; +create table t1(id int, c1 int) engine=innodb; +insert into t1 values(1,1); +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +update t1 set c1=2; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; +include/assert_grep.inc [write non-blocking binlog succeed should be logged] +insert into t1 values(3,3); +checksum table t1; +Table Checksum +test.t1 4217795292 +include/sync_slave_sql_with_master.inc +connection slave; +checksum table t1; +Table Checksum +test.t1 4217795292 +connection master; +drop table t1; +include/sync_slave_sql_with_master.inc +include/rpl_end.inc diff --git a/mysql-test/suite/rpl/t/rpl_non_blocking_binlog.test b/mysql-test/suite/rpl/t/rpl_non_blocking_binlog.test new file mode 100644 index 00000000000..558ca63c5ec --- /dev/null +++ b/mysql-test/suite/rpl/t/rpl_non_blocking_binlog.test @@ -0,0 +1,31 @@ +--source include/have_debug.inc +--source include/have_innodb.inc +--source include/master-slave.inc +--source include/not_embedded.inc + +connection master; +create table t1(id int, c1 int) engine=innodb; +insert into t1 values(1,1); +set debug_dbug='+d,non_blocking_binlog_ignore_cache_size'; +update t1 set c1=2; +set debug_dbug='-d,non_blocking_binlog_ignore_cache_size'; + +--let $assert_count = 1 +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.1.err +--let $assert_text = write non-blocking binlog succeed should be logged +--let $assert_select = write non-blocking binlog succeed +--let $assert_only_after = CURRENT_TEST +--source include/assert_grep.inc + +insert into t1 values(3,3); +checksum table t1; + +--source include/sync_slave_sql_with_master.inc +connection slave; +checksum table t1; + +connection master; +drop table t1; +--source include/sync_slave_sql_with_master.inc + +--source include/rpl_end.inc diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result index 2740319dd05..0ad02a33089 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result +++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result @@ -2512,6 +2512,16 @@ NUMERIC_BLOCK_SIZE 1 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME NON_BLOCKING_BINLOG_THRESHOLD +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT If the binlog size of a transaction exceeds this value, we write it to a new temporary file and rename it to the next binlog file. +NUMERIC_MIN_VALUE 268435456 +NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_BLOCK_SIZE 1 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME NOTE_VERBOSITY VARIABLE_SCOPE SESSION VARIABLE_TYPE SET diff --git a/sql/log.cc b/sql/log.cc index 460cefea47b..9de1c6118cd 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -74,6 +74,9 @@ #include <utility> // pair #endif +#include <atomic> +#include <chrono> + /* max size of the log message */ #define MAX_LOG_BUFFER_SIZE 1024 #define MAX_TIME_SIZE 32 @@ -3727,7 +3730,10 @@ bool MYSQL_BIN_LOG::open(const char *log_name, enum cache_type io_cache_type_arg, ulong max_size_arg, bool null_created_arg, - bool need_mutex) + bool need_mutex, + const char *file_to_rename, + my_off_t file_size, + group_commit_entry *entry) { xid_count_per_binlog *new_xid_list_entry= NULL, *b; DBUG_ENTER("MYSQL_BIN_LOG::open"); @@ -3792,6 +3798,18 @@ bool MYSQL_BIN_LOG::open(const char *log_name, write_error= 0; + if (file_to_rename) + { + DBUG_ASSERT(!is_relay_log && entry); + if (mysql_file_rename(PSI_NOT_INSTRUMENTED, file_to_rename, log_file_name, + MY_WME | MY_NABP | MY_WAIT_IF_FULL) != 0) + { + sql_print_error("rename from '%s' to '%s' failed!", file_to_rename, + log_file_name); + DBUG_RETURN(1); + } + } + /* open the main log file */ if (MYSQL_LOG::open( #ifdef HAVE_PSI_INTERFACE @@ -3816,7 +3834,7 @@ bool MYSQL_BIN_LOG::open(const char *log_name, { bool write_file_name_to_index_file=0; - if (!my_b_filelength(&log_file)) + if (!my_b_filelength(&log_file) || file_to_rename) { /* The binary log file was empty (probably newly created) @@ -3993,6 +4011,12 @@ bool MYSQL_BIN_LOG::open(const char *log_name, goto err; bytes_written+= description_event_for_queue->data_written; } + if (file_to_rename) + { + if (write_gtid_and_skip_event(file_size, entry)) + goto err; + } + if (flush_io_cache(&log_file) || mysql_file_sync(log_file.file, MYF(MY_WME))) goto err; @@ -6826,7 +6850,6 @@ MYSQL_BIN_LOG::write_gtid_event(THD *thd, bool standalone, thd->variables.server_id= global_system_variables.server_id; } #endif - if (write_event(>id_event)) DBUG_RETURN(true); status_var_add(thd->status_var.binlog_bytes_written, gtid_event.data_written); @@ -7849,11 +7872,18 @@ int Event_log::write_cache_raw(THD *thd, IO_CACHE *cache) events prior to fill in the binlog cache. */ -int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data) +int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data, + IO_CACHE *f) { DBUG_ENTER("Event_log::write_cache"); IO_CACHE *cache= &cache_data->cache_log; - mysql_mutex_assert_owner(&LOCK_log); + + IO_CACHE *out_file= f; + if (likely(f == nullptr)) + { + mysql_mutex_assert_owner(&LOCK_log); + out_file= get_log_file(); + } /* If possible, just copy the cache over byte-by-byte with pre-computed @@ -7863,7 +7893,7 @@ int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data) likely(!crypto.scheme) && likely(!opt_binlog_legacy_event_pos)) { - int res= my_b_copy_all_to_cache(cache, &log_file); + int res= my_b_copy_all_to_cache(cache, out_file); status_var_add(thd->status_var.binlog_bytes_written, my_b_tell(cache)); DBUG_RETURN(res ? ER_ERROR_ON_WRITE : 0); } @@ -7873,7 +7903,7 @@ int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data) /* Amount of remaining bytes in the IO_CACHE read buffer. */ size_t log_file_pos; uchar header_buf[LOG_EVENT_HEADER_LEN]; - Log_event_writer writer(get_log_file(), 0, + Log_event_writer writer(out_file, 0, (enum_binlog_checksum_alg)binlog_checksum_options, &crypto); uint checksum_len= writer.checksum_len; @@ -7904,7 +7934,7 @@ int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data) split. */ - log_file_pos= (size_t)my_b_tell(get_log_file()); + log_file_pos= (size_t)my_b_tell(out_file); for (;;) { /* @@ -8194,6 +8224,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, ha_info= all ? thd->transaction->all.ha_list : thd->transaction->stmt.ha_list; entry.ro_1pc= is_ro_1pc; entry.end_event= end_ev; + entry.non_blocking_log= false; auto has_xid= entry.end_event->get_type_code() == XID_EVENT; for (; has_xid && !entry.need_unlog && ha_info; ha_info= ha_info->next()) @@ -8581,7 +8612,23 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry) bool MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry) { - int is_leader= queue_for_group_commit(entry); + int is_leader; + if (unlikely(entry->cache_mngr->trx_cache.get_byte_position() >= + non_blocking_binlog_threshold) || + DBUG_IF("non_blocking_binlog_ignore_cache_size")) + { + if (!can_use_non_blocking_binlog(entry) || + write_non_blocking_binlog(entry)) + goto original_commit_path; + /* thread using non-blocking binlog is treated as a single group */ + is_leader= 1; + entry->non_blocking_log= true; + entry->next= nullptr; + goto group_commit_leader; + } +original_commit_path: + + is_leader= queue_for_group_commit(entry); #ifdef WITH_WSREP /* commit order was released in queue_for_group_commit() call, here we check if wsrep_commit_ordered() failed or if we are leader */ @@ -8616,7 +8663,10 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry) if (is_leader < 0) return true; /* Error */ else if (is_leader) + { +group_commit_leader: trx_group_commit_leader(entry); + } else if (!entry->queued_by_other) { DEBUG_SYNC(entry->thd, "after_semisync_queue"); @@ -8748,6 +8798,12 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) ("commit_before_get_LOCK_log SIGNAL waiting WAIT_FOR cont TIMEOUT 1"))); ); #endif + if (unlikely(leader->non_blocking_log)) + { + mysql_mutex_assert_owner(&LOCK_log); + current= leader; + goto after_fetching_group; + } /* Lock the LOCK_log(), and once we get it, collect any additional writes that queued up while we were waiting. @@ -8766,6 +8822,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) current= group_commit_queue; group_commit_queue= NULL; mysql_mutex_unlock(&LOCK_prepare_ordered); +after_fetching_group: binlog_id= current_binlog_id; /* As the queue is in reverse order of entering, reverse it. */ @@ -8824,9 +8881,12 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader) !cache_mngr->trx_cache.empty() || current->thd->transaction->xid_state.is_explicit_XA()); - if (unlikely((current->error= write_transaction_or_stmt(current, - commit_id)))) - current->commit_errno= errno; + if (likely(leader->non_blocking_log == false)) + { + if (unlikely((current->error= + write_transaction_or_stmt(current, commit_id)))) + current->commit_errno= errno; + } strmake_buf(cache_mngr->last_commit_pos_file, log_file_name); commit_offset= my_b_write_tell(&log_file); @@ -12459,6 +12519,7 @@ mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file) } #endif /* INNODB_COMPATIBILITY_HOOKS */ +mysql_rwlock_t binlog_checksum_rwlock; static void binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var, @@ -12468,6 +12529,7 @@ binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var, bool check_purge= false; ulong UNINIT_VAR(prev_binlog_id); + mysql_rwlock_wrlock(&binlog_checksum_rwlock); mysql_mutex_lock(mysql_bin_log.get_log_lock()); if(mysql_bin_log.is_open()) { @@ -12484,6 +12546,7 @@ binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var, DBUG_ASSERT(binlog_checksum_options == value); mysql_bin_log.checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF; mysql_mutex_unlock(mysql_bin_log.get_log_lock()); + mysql_rwlock_unlock(&binlog_checksum_rwlock); if (check_purge) mysql_bin_log.checkpoint_and_purge(prev_binlog_id); } @@ -12781,3 +12844,270 @@ void wsrep_register_binlog_handler(THD *thd, bool trx) } #endif /* WITH_WSREP */ + +bool MYSQL_BIN_LOG::can_use_non_blocking_binlog(group_commit_entry *entry) +{ + DBUG_ASSERT(entry->cache_mngr->trx_cache.get_byte_position() >= + non_blocking_binlog_threshold || + DBUG_IF("non_blocking_binlog_ignore_cache_size")); + THD *thd= entry->thd; + if (unlikely(!is_open()) || encrypt_binlog || + !entry->cache_mngr->stmt_cache.empty() || + entry->cache_mngr->trx_cache.has_incident() || thd->slave_thread || + thd->wait_for_commit_ptr) + return false; +#ifdef WITH_WSREP + if (wsrep_is_active(thd)) + return false; +#endif /* WITH_WSREP */ + return true; +} + +std::string MYSQL_BIN_LOG::generate_random_file_name() +{ + static std::atomic<uint64_t> temp_bin_counter{0}; + static std::atomic<bool> binlog_dir_inited{false}; + static std::string binlog_dir; + std::string temp_file_name; + if (unlikely(!binlog_dir_inited.load(std::memory_order_acquire))) + { + char dev[FN_REFLEN]; + size_t dev_length; + mysql_mutex_lock(&LOCK_log); + if (!binlog_dir_inited.load(std::memory_order_relaxed) && name != nullptr) + { + /* + see MYSQL_BIN_LOG::generate_new_name and fn_format for more + information + */ + size_t length= dirname_part(dev, name, &dev_length); + if (length == 0) + { + // log_bin is not set, use mysql_data_home + convert_dirname(dev, mysql_data_home, NullS); + } + unpack_dirname(dev, dev); // Replace ~/.. with dir + binlog_dir.assign(dev); + binlog_dir_inited.store(true); + } + mysql_mutex_unlock(&LOCK_log); + } + + temp_file_name.reserve(FN_REFLEN); + auto now_in_sys= std::chrono::system_clock::now().time_since_epoch(); + auto now_in_ms= + std::chrono::duration_cast<std::chrono::milliseconds>(now_in_sys) + .count(); + auto count= temp_bin_counter.fetch_add(1); + temp_file_name.append(binlog_dir); + temp_file_name.append("_temp_bin_"); + temp_file_name.append(std::to_string(now_in_ms)); + temp_file_name.push_back('_'); + temp_file_name.append(std::to_string(count)); + + return temp_file_name; +} + +void MYSQL_BIN_LOG::generate_skip_event(uchar *buf, size_t buf_len, + my_off_t pos, THD *thd) +{ + uchar *header= buf; + my_time_t now= thd->start_time; + int4store(header, now); + header[EVENT_TYPE_OFFSET]= IGNORABLE_LOG_EVENT; + int4store(header + SERVER_ID_OFFSET, server_id); + int4store(header + EVENT_LEN_OFFSET, buf_len); + int4store(header + LOG_POS_OFFSET, pos + buf_len); + int2store(header + FLAGS_OFFSET, LOG_EVENT_IGNORABLE_F); + if (binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF && + binlog_checksum_options != BINLOG_CHECKSUM_ALG_UNDEF) + { + ha_checksum crc= my_checksum(0, header, buf_len - BINLOG_CHECKSUM_LEN); + int4store(header + buf_len - BINLOG_CHECKSUM_LEN, crc); + } +} + +template <typename F= std::function<void()>> class Scoped_guard +{ +public: + Scoped_guard(F f); + ~Scoped_guard(); + +private: + F func; +}; + +template <typename F> Scoped_guard<F>::Scoped_guard(F f) : func(f) {} + +template <typename F> Scoped_guard<F>::~Scoped_guard() +{ + if (func) + func(); +} + +bool MYSQL_BIN_LOG::write_gtid_and_skip_event(my_off_t file_size, + group_commit_entry *entry) +{ + mysql_mutex_assert_owner(&LOCK_log); + DBUG_EXECUTE_IF("non_blocking_binlog_write_gtid_failed", { return true; }); + uchar *skip_event_buf= nullptr; + Scoped_guard<> scoped_guard{[&]() { + if (skip_event_buf) + my_free(skip_event_buf); + }}; + bool has_xid= entry->end_event->get_type_code() == XID_EVENT; + if (write_gtid_event(entry->thd, is_prepared_xa(entry->thd), + entry->using_trx_cache, 0, has_xid, entry->ro_1pc)) + return true; + my_off_t skip_event_start= my_b_safe_tell(&log_file); + size_t skip_event_len= non_blocking_binlog_reserved_size - skip_event_start; + skip_event_buf= + (uchar *) my_malloc(PSI_INSTRUMENT_ME, skip_event_len, MYF(MY_ZEROFILL)); + generate_skip_event(skip_event_buf, skip_event_len, skip_event_start, + entry->thd); + if (my_b_write(&log_file, skip_event_buf, skip_event_len) != 0) + return true; + DBUG_ASSERT(non_blocking_binlog_reserved_size == my_b_safe_tell(&log_file)); + if (flush_io_cache(&log_file) != 0) + return true; + if (reinit_io_cache(&log_file, io_cache_type, file_size, FALSE, TRUE)) + return true; + return false; +} + +bool MYSQL_BIN_LOG::write_non_blocking_binlog(group_commit_entry *entry) +{ + /* generate random file name */ + std::string temp_file_name(generate_random_file_name()); + if (temp_file_name.empty()) + { + sql_print_warning("Generate random file name failed!"); + return true; + } + if (temp_file_name.size() >= FN_REFLEN) + { + sql_print_warning("The name of temp file '%s' is too long!", + temp_file_name.c_str()); + return true; + } + THD *thd= entry->thd; + IO_CACHE temp_log_file; + File file= -1; + bool io_cache_inited= false; + bool delete_file= true; + bool binlog_checksum_locked= false; + // ignore all errors in this function + Dummy_error_handler dummy_error_handler; + thd->push_internal_handler(&dummy_error_handler); + Scoped_guard<> scoped_guard{[&]() { + if (io_cache_inited) + end_io_cache(&temp_log_file); + if (file > 0) + { + mysql_file_close(file, MYF(MY_WME)); + if (delete_file) + { + mysql_file_delete(PSI_NOT_INSTRUMENTED, temp_file_name.c_str(), + MYF(MY_WME)); + } + } + if (binlog_checksum_locked) + mysql_rwlock_unlock(&binlog_checksum_rwlock); + thd->pop_internal_handler(); + }}; + + // open temporary file + int flags= O_WRONLY | O_CREAT; + DBUG_EXECUTE_IF("non_blocking_binlog_open_file_failed", + { flags= O_WRONLY; }); + if ((file= mysql_file_open(PSI_NOT_INSTRUMENTED, temp_file_name.c_str(), + flags, MYF(MY_WME))) < 0) + { + sql_print_error("Open file '%s' failed!", temp_file_name.c_str()); + return true; + } + if (init_io_cache(&temp_log_file, file, LOG_BIN_IO_SIZE, WRITE_CACHE, + non_blocking_binlog_reserved_size, 0, + MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL))) + { + sql_print_error("Init io cache for file '%s' failed!", + temp_file_name.c_str()); + return true; + } + io_cache_inited= true; + // prevent any changes to binlog_checksum_options while writing cache + mysql_rwlock_rdlock(&binlog_checksum_rwlock); + binlog_checksum_locked= true; + enum_binlog_checksum_alg alg= + (enum_binlog_checksum_alg) binlog_checksum_options; + // write cache + { + binlog_cache_mngr *mngr= entry->cache_mngr; + binlog_cache_data *cache_data= mngr->get_binlog_cache_data(TRUE); + IO_CACHE *cache= &cache_data->cache_log; + if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0)) + return true; + if (write_cache(entry->thd, cache_data, &temp_log_file) || + DBUG_IF("non_blocking_binlog_write_cache_failed")) + return true; + if (write_event(entry->end_event, alg, nullptr, &temp_log_file)) + return true; + status_var_add(entry->thd->status_var.binlog_bytes_written, + entry->end_event->data_written); + if (unlikely(cache->error)) + return true; + } + if (flush_io_cache(&temp_log_file) != 0) + return true; + mysql_mutex_lock(&LOCK_log); + mysql_mutex_lock(&LOCK_index); + // rename temporary file to next binlog file, abort if any error occurs + { + char new_name[FN_REFLEN], *new_name_ptr, *old_name; + File UNINIT_VAR(old_file); + // Reuse old name if not binlog and not update log + new_name_ptr= name; + if (unlikely(generate_new_name(new_name, name, 0))) + abort(); + new_name_ptr= new_name; + /* + We log the whole file name for log file as the user may decide + to change base names at some point. + */ + Rotate_log_event r(new_name + dirname_length(new_name), 0, + LOG_EVENT_OFFSET, 0); + if (unlikely(write_event(&r, alg))) + abort(); + if (unlikely(flush_io_cache(&log_file) != 0)) + abort(); + update_binlog_end_pos(); + old_name= name; + name= 0; // Don't free name + old_file= log_file.file; + uint close_flag= + LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX | LOG_CLOSE_DELAYED_CLOSE; + /* + We need to keep the old binlog file open (and marked as in-use) until + the new one is fully created and synced to disk and index. Otherwise we + leave a window where if we crash, there is no binlog file marked as + crashed for server restart to detect the need for recovery. + */ + close(close_flag); + if (unlikely(open_index_file(index_file_name, 0, FALSE))) + abort(); + if (unlikely(open(old_name, new_name_ptr, 0, io_cache_type, max_size, 1, + FALSE, temp_file_name.c_str(), + my_b_safe_tell(&temp_log_file), entry))) + abort(); + my_free(old_name); + delete_file= false; + clear_inuse_flag_when_closing(old_file); + mysql_file_close(old_file, MYF(MY_WME)); + } + mysql_mutex_unlock(&LOCK_index); + // unlock lock_log in trx_group_commit_leader +#ifndef NDEBUG + sql_print_information("write non-blocking binlog succeed!"); +#endif /* ! NDEBUG */ + return false; +} diff --git a/sql/log.h b/sql/log.h index fc5209d1922..af2b56da802 100644 --- a/sql/log.h +++ b/sql/log.h @@ -19,6 +19,7 @@ #include "handler.h" /* my_xid */ #include "rpl_constants.h" +#include <string> class Relay_log_info; class Gtid_index_writer; @@ -406,7 +407,8 @@ class Event_log: public MYSQL_LOG void set_write_error(THD *thd, bool is_transactional); static bool check_write_error(THD *thd); static bool check_cache_error(THD *thd, binlog_cache_data *cache_data); - int write_cache(THD *thd, binlog_cache_data *cache_data); + int write_cache(THD *thd, binlog_cache_data *cache_data, + IO_CACHE *f= nullptr); int write_cache_raw(THD *thd, IO_CACHE *cache); char* get_name() { return name; } void cleanup() @@ -650,6 +652,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log bool queued_by_other; ulong binlog_id; bool ro_1pc; // passes the binlog_cache_mngr::ro_1pc value to Gtid ctor + bool non_blocking_log; }; /* @@ -986,7 +989,10 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log enum cache_type io_cache_type_arg, ulong max_size, bool null_created, - bool need_mutex); + bool need_mutex, + const char *file_to_rename= nullptr, + my_off_t file_size= 0, + group_commit_entry *entry= nullptr); bool open_index_file(const char *index_file_name_arg, const char *log_name, bool need_mutex); /* Use this to start writing a new log file */ @@ -1177,6 +1183,24 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log */ my_off_t binlog_end_pos; char binlog_end_pos_file[FN_REFLEN]; + +private: + // reserved size for format_description_log_event, gtid_list_log_event ... + static const my_off_t non_blocking_binlog_reserved_size= 10 * 1024 * 1024; + bool can_use_non_blocking_binlog(group_commit_entry *entry); + std::string generate_random_file_name(); + void generate_skip_event(uchar *buf, size_t buf_len, my_off_t pos, THD *thd); + /* + write gtid_log_event and ignorable_log_event(skip event) to binlog + @returns if a problem occurs, false otherwise + */ + bool write_gtid_and_skip_event(my_off_t file_size, + group_commit_entry *entry); + /* + write non-blocking binlog for big transaction + @returns true if a problem occurs, false otherwise + */ + bool write_non_blocking_binlog(group_commit_entry *entry); }; class Log_event_handler @@ -1476,4 +1500,6 @@ int binlog_commit_by_xid(handlerton *hton, XID *xid); int binlog_rollback_by_xid(handlerton *hton, XID *xid); bool write_bin_log_start_alter(THD *thd, bool& partial_alter, uint64 start_alter_id, bool log_if_exists); + +extern mysql_rwlock_t binlog_checksum_rwlock; #endif /* LOG_H */ diff --git a/sql/mysqld.cc b/sql/mysqld.cc index dc4a955aa79..8f9bdb02599 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -557,6 +557,7 @@ ulong opt_binlog_commit_wait_usec= 0; ulong opt_slave_parallel_max_queued= 131072; my_bool opt_gtid_ignore_duplicates= FALSE; uint opt_gtid_cleanup_batch_size= 64; +ulonglong non_blocking_binlog_threshold= ULONGLONG_MAX; const double log_10[] = { 1e000, 1e001, 1e002, 1e003, 1e004, 1e005, 1e006, 1e007, 1e008, 1e009, @@ -1059,7 +1060,8 @@ PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger, key_rwlock_LOCK_vers_stats, key_rwlock_LOCK_stat_serial, key_rwlock_LOCK_ssl_refresh, key_rwlock_THD_list, - key_rwlock_LOCK_all_status_vars; + key_rwlock_LOCK_all_status_vars, + key_rwlock_binlog_checksum; static PSI_rwlock_info all_server_rwlocks[]= { @@ -1077,7 +1079,8 @@ static PSI_rwlock_info all_server_rwlocks[]= { &key_rwlock_LOCK_stat_serial, "TABLE_SHARE::LOCK_stat_serial", 0}, { &key_rwlock_LOCK_ssl_refresh, "LOCK_ssl_refresh", PSI_FLAG_GLOBAL }, { &key_rwlock_THD_list, "THD_list::lock", PSI_FLAG_GLOBAL }, - { &key_rwlock_LOCK_all_status_vars, "LOCK_all_status_vars", PSI_FLAG_GLOBAL } + { &key_rwlock_LOCK_all_status_vars, "LOCK_all_status_vars", PSI_FLAG_GLOBAL }, + { &key_rwlock_binlog_checksum, "binlog_checksum_rwlock", PSI_FLAG_GLOBAL } }; #ifdef HAVE_MMAP @@ -2105,6 +2108,7 @@ static void clean_up_mutexes() mysql_mutex_destroy(&LOCK_start_thread); mysql_mutex_destroy(&LOCK_status); mysql_rwlock_destroy(&LOCK_all_status_vars); + mysql_rwlock_destroy(&binlog_checksum_rwlock); mysql_mutex_destroy(&LOCK_delayed_insert); mysql_mutex_destroy(&LOCK_delayed_status); mysql_mutex_destroy(&LOCK_delayed_create); @@ -4428,6 +4432,7 @@ static int init_thread_environment() mysql_rwlock_init(key_rwlock_LOCK_ssl_refresh, &LOCK_ssl_refresh); mysql_rwlock_init(key_rwlock_LOCK_grant, &LOCK_grant); mysql_rwlock_init(key_rwlock_LOCK_all_status_vars, &LOCK_all_status_vars); + mysql_rwlock_init(key_rwlock_binlog_checksum, &binlog_checksum_rwlock); mysql_cond_init(key_COND_start_thread, &COND_start_thread, NULL); #ifdef HAVE_REPLICATION mysql_mutex_init(key_LOCK_rpl_status, &LOCK_rpl_status, MY_MUTEX_INIT_FAST); diff --git a/sql/mysqld.h b/sql/mysqld.h index 4548a247ac1..4f9dfe72b4c 100644 --- a/sql/mysqld.h +++ b/sql/mysqld.h @@ -317,6 +317,7 @@ extern const char *encryption_algorithm_names[]; extern long opt_secure_timestamp; extern uint default_password_lifetime; extern my_bool disconnect_on_expired_password; +extern ulonglong non_blocking_binlog_threshold; enum secure_timestamp { SECTIME_NO, SECTIME_SUPER, SECTIME_REPL, SECTIME_YES }; bool is_set_timestamp_forbidden(THD *thd); @@ -371,7 +372,8 @@ extern PSI_rwlock_key key_rwlock_LOCK_grant, key_rwlock_LOCK_logger, key_rwlock_LOCK_system_variables_hash, key_rwlock_query_cache_query_lock, key_LOCK_SEQUENCE, key_rwlock_LOCK_vers_stats, key_rwlock_LOCK_stat_serial, - key_rwlock_THD_list; + key_rwlock_THD_list, + key_rwlock_binlog_checksum; #ifdef HAVE_MMAP extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool; diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 47e31291e6b..74e6fe9bb23 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -7327,3 +7327,11 @@ static Sys_var_enum Sys_block_encryption_mode( "AES_ENCRYPT() and AES_DECRYPT() functions", SESSION_VAR(block_encryption_mode), CMD_LINE(REQUIRED_ARG), block_encryption_mode_values, DEFAULT(0)); + +static Sys_var_ulonglong Sys_var_non_blocking_binlog_threshold( + "non_blocking_binlog_threshold", + "If the binlog size of a transaction exceeds this value, we write it to a " + "new temporary file and rename it to the next binlog file.", + GLOBAL_VAR(non_blocking_binlog_threshold), CMD_LINE(OPT_ARG), + VALID_RANGE(256 * 1024 * 1024, ULONGLONG_MAX), DEFAULT(ULONGLONG_MAX), + BLOCK_SIZE(1)); -- 2.30.2