03 Sep '24
If a slave replicating an event has waited for more than
@@slave_abort_blocking_timeout for a conflicting metadata lock held by a
non-replication thread, the blocking query is killed to allow replication to
proceed and not be blocked indefinitely by a user query.
Signed-off-by: Kristian Nielsen <knielsen(a)knielsen-hq.org>
---
.../rpl/r/slave_abort_blocking_timeout.result | 74 ++++++++++++++++
.../rpl/t/slave_abort_blocking_timeout.test | 85 +++++++++++++++++++
sql/mdl.cc | 43 ++++++++--
sql/mdl.h | 3 +-
sql/mysqld.cc | 1 +
sql/mysqld.h | 1 +
sql/privilege.h | 2 +
sql/sql_base.cc | 2 +-
sql/sql_class.cc | 17 ++--
sql/sql_class.h | 3 +-
sql/sys_vars.cc | 11 +++
11 files changed, 225 insertions(+), 17 deletions(-)
create mode 100644 mysql-test/suite/rpl/r/slave_abort_blocking_timeout.result
create mode 100644 mysql-test/suite/rpl/t/slave_abort_blocking_timeout.test
diff --git a/mysql-test/suite/rpl/r/slave_abort_blocking_timeout.result b/mysql-test/suite/rpl/r/slave_abort_blocking_timeout.result
new file mode 100644
index 00000000000..911ea4b070e
--- /dev/null
+++ b/mysql-test/suite/rpl/r/slave_abort_blocking_timeout.result
@@ -0,0 +1,74 @@
+include/master-slave.inc
+[connection master]
+*** Testcase to show how a long-running SELECT can block replication from proceeding
+*** past a DDL. Intention to implement a timeout after which such SELECT can be
+*** killed.
+connection master;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 SELECT seq, 100+seq FROM seq_1_to_20;
+connection slave;
+include/stop_slave.inc
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+@@GLOBAL.slave_abort_blocking_timeout
+31536000.000000
+SET @old_abort_timeout= @@slave_abort_blocking_timeout;
+SET GLOBAL slave_abort_blocking_timeout= -1;
+Warnings:
+Warning 1292 Truncated incorrect slave_abort_blocking_timeout value: '-1'
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+@@GLOBAL.slave_abort_blocking_timeout
+0.000000
+SET GLOBAL slave_abort_blocking_timeout= 1.0;
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+@@GLOBAL.slave_abort_blocking_timeout
+1.000000
+connection server_2;
+SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
+connection slave;
+connection master;
+UPDATE t1 SET b=b+1000 WHERE a=1;
+ALTER TABLE t1 ADD INDEX b_idx(b);
+UPDATE t1 SET b=b+1000 WHERE a=20;
+connection slave;
+include/start_slave.inc
+connection server_2;
+ERROR 70100: Query execution was interrupted
+connection slave;
+SHOW CREATE TABLE t1;
+Table t1
+Create Table CREATE TABLE `t1` (
+ `a` int(11) NOT NULL,
+ `b` int(11) DEFAULT NULL,
+ PRIMARY KEY (`a`),
+ KEY `b_idx` (`b`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci
+include/stop_slave.inc
+SET GLOBAL slave_abort_blocking_timeout= 0;
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+@@GLOBAL.slave_abort_blocking_timeout
+0.000000
+connection server_2;
+SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
+connection slave;
+connection master;
+UPDATE t1 SET b=b+1000 WHERE a=1;
+ALTER TABLE t1 DROP INDEX b_idx;
+UPDATE t1 SET b=b+1000 WHERE a=20;
+connection slave;
+include/start_slave.inc
+connection server_2;
+ERROR 70100: Query execution was interrupted
+connection slave;
+SHOW CREATE TABLE t1;
+Table t1
+Create Table CREATE TABLE `t1` (
+ `a` int(11) NOT NULL,
+ `b` int(11) DEFAULT NULL,
+ PRIMARY KEY (`a`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci
+include/stop_slave.inc
+SET GLOBAL slave_abort_blocking_timeout= @old_abort_timeout;
+include/start_slave.inc
+connection master;
+DROP TABLE t1;
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/slave_abort_blocking_timeout.test b/mysql-test/suite/rpl/t/slave_abort_blocking_timeout.test
new file mode 100644
index 00000000000..04f24ff5df8
--- /dev/null
+++ b/mysql-test/suite/rpl/t/slave_abort_blocking_timeout.test
@@ -0,0 +1,85 @@
+--source include/have_innodb.inc
+--source include/have_sequence.inc
+--source include/have_binlog_format_mixed.inc
+--source include/master-slave.inc
+
+--echo *** Testcase to show how a long-running SELECT can block replication from proceeding
+--echo *** past a DDL. Intention to implement a timeout after which such SELECT can be
+--echo *** killed.
+
+--connection master
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 SELECT seq, 100+seq FROM seq_1_to_20;
+
+--sync_slave_with_master
+
+--source include/stop_slave.inc
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+SET @old_abort_timeout= @@slave_abort_blocking_timeout;
+SET GLOBAL slave_abort_blocking_timeout= -1;
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+SET GLOBAL slave_abort_blocking_timeout= 1.0;
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+--connection server_2
+# Start a SELECT that will run for long.
+send SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
+
+--connection slave
+# Wait for the SELECT to have started so it will block the coming DDL
+# from replicating.
+--let $wait_condition= SELECT COUNT(*)=1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state = 'User sleep'
+--source include/wait_condition.inc
+
+--connection master
+UPDATE t1 SET b=b+1000 WHERE a=1;
+ALTER TABLE t1 ADD INDEX b_idx(b);
+UPDATE t1 SET b=b+1000 WHERE a=20;
+
+--save_master_pos
+--connection slave
+--source include/start_slave.inc
+--sync_with_master
+
+--connection server_2
+--error ER_QUERY_INTERRUPTED
+reap;
+
+--connection slave
+query_vertical SHOW CREATE TABLE t1;
+
+# Do it again to test that a timeout of 0 also works to abort user queries.
+--source include/stop_slave.inc
+SET GLOBAL slave_abort_blocking_timeout= 0;
+SELECT @@GLOBAL.slave_abort_blocking_timeout;
+--connection server_2
+send SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
+
+--connection slave
+--let $wait_condition= SELECT COUNT(*)=1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state = 'User sleep'
+--source include/wait_condition.inc
+
+--connection master
+UPDATE t1 SET b=b+1000 WHERE a=1;
+ALTER TABLE t1 DROP INDEX b_idx;
+UPDATE t1 SET b=b+1000 WHERE a=20;
+
+--save_master_pos
+--connection slave
+--source include/start_slave.inc
+--sync_with_master
+
+--connection server_2
+--error ER_QUERY_INTERRUPTED
+reap;
+
+--connection slave
+query_vertical SHOW CREATE TABLE t1;
+
+
+--source include/stop_slave.inc
+SET GLOBAL slave_abort_blocking_timeout= @old_abort_timeout;
+--source include/start_slave.inc
+
+--connection master
+DROP TABLE t1;
+--source include/rpl_end.inc
diff --git a/sql/mdl.cc b/sql/mdl.cc
index faccd1c9476..9845718e165 100644
--- a/sql/mdl.cc
+++ b/sql/mdl.cc
@@ -613,7 +613,7 @@ class MDL_lock
bool needs_notification(const MDL_ticket *ticket) const
{ return m_strategy->needs_notification(ticket); }
- void notify_conflicting_locks(MDL_context *ctx)
+ void notify_conflicting_locks(MDL_context *ctx, bool abort_blocking)
{
for (const auto &conflicting_ticket : m_granted)
{
@@ -624,7 +624,8 @@ class MDL_lock
ctx->get_owner()->
notify_shared_lock(conflicting_ctx->get_owner(),
- conflicting_ctx->get_needs_thr_lock_abort());
+ conflicting_ctx->get_needs_thr_lock_abort(),
+ abort_blocking);
}
}
}
@@ -2361,10 +2362,10 @@ MDL_context::acquire_lock(MDL_request *mdl_request, double lock_wait_timeout)
/*
Don't break conflicting locks if timeout is 0 as 0 is used
- To check if there is any conflicting locks...
+ to check if there is any conflicting locks...
*/
if (lock->needs_notification(ticket) && lock_wait_timeout)
- lock->notify_conflicting_locks(this);
+ lock->notify_conflicting_locks(this, false);
/*
Ensure that if we are trying to get an exclusive lock for a slave
@@ -2397,14 +2398,39 @@ MDL_context::acquire_lock(MDL_request *mdl_request, double lock_wait_timeout)
find_deadlock();
- struct timespec abs_timeout, abs_shortwait;
+ struct timespec abs_timeout, abs_shortwait, abs_abort_blocking_timeout;
+ bool abort_blocking_enabled= false;
+ double abort_blocking_timeout= slave_abort_blocking_timeout;
+ if (abort_blocking_timeout < lock_wait_timeout &&
+ m_owner->get_thd()->rgi_slave)
+ {
+ set_timespec_nsec(abs_abort_blocking_timeout,
+ (ulonglong)(abort_blocking_timeout * 1000000000ULL));
+ abort_blocking_enabled= true;
+ }
set_timespec_nsec(abs_timeout,
(ulonglong)(lock_wait_timeout * 1000000000ULL));
- set_timespec(abs_shortwait, 1);
wait_status= MDL_wait::EMPTY;
- while (cmp_timespec(abs_shortwait, abs_timeout) <= 0)
+ for (;;)
{
+ bool abort_blocking= false;
+ set_timespec(abs_shortwait, 1);
+ if (abort_blocking_enabled &&
+ cmp_timespec(abs_shortwait, abs_abort_blocking_timeout) >= 0)
+ {
+ /*
+ If a slave DDL has waited for --slave-abort-blocking-timeout, then notify
+ any blocking SELECT once before continuing to wait until the full
+ timeout.
+ */
+ abs_shortwait= abs_abort_blocking_timeout;
+ abort_blocking= true;
+ abort_blocking_enabled= false;
+ }
+ if (cmp_timespec(abs_shortwait, abs_timeout) > 0)
+ break;
+
/* abs_timeout is far away. Wait a short while and notify locks. */
wait_status= m_wait.timed_wait(m_owner, &abs_shortwait, FALSE,
mdl_request->key.get_wait_state_name());
@@ -2425,9 +2451,8 @@ MDL_context::acquire_lock(MDL_request *mdl_request, double lock_wait_timeout)
mysql_prlock_wrlock(&lock->m_rwlock);
if (lock->needs_notification(ticket))
- lock->notify_conflicting_locks(this);
+ lock->notify_conflicting_locks(this, abort_blocking);
mysql_prlock_unlock(&lock->m_rwlock);
- set_timespec(abs_shortwait, 1);
}
if (wait_status == MDL_wait::EMPTY)
wait_status= m_wait.timed_wait(m_owner, &abs_timeout, TRUE,
diff --git a/sql/mdl.h b/sql/mdl.h
index 68cf5d2e811..aa271cb6d09 100644
--- a/sql/mdl.h
+++ b/sql/mdl.h
@@ -110,7 +110,8 @@ class MDL_context_owner
@see THD::notify_shared_lock()
*/
virtual bool notify_shared_lock(MDL_context_owner *in_use,
- bool needs_thr_lock_abort) = 0;
+ bool needs_thr_lock_abort,
+ bool needs_non_slave_abort) = 0;
};
/**
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index e938e8f6cfa..7e0a7f339c0 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -494,6 +494,7 @@ uint internal_slave_connections_needed_for_purge;
ulong slave_max_allowed_packet= 0;
double slave_max_statement_time_double;
ulonglong slave_max_statement_time;
+double slave_abort_blocking_timeout;
ulonglong binlog_stmt_cache_size=0;
ulonglong max_binlog_stmt_cache_size=0;
ulonglong test_flags;
diff --git a/sql/mysqld.h b/sql/mysqld.h
index 7cc88be0ad1..20644f0c404 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -242,6 +242,7 @@ extern ulong max_binlog_size;
extern ulong slave_max_allowed_packet;
extern ulonglong slave_max_statement_time;
extern double slave_max_statement_time_double;
+extern double slave_abort_blocking_timeout;
extern ulong opt_binlog_rows_event_max_size;
extern ulong binlog_row_metadata;
extern my_bool opt_binlog_gtid_index;
diff --git a/sql/privilege.h b/sql/privilege.h
index 84efc010d1e..eec0eb49df1 100644
--- a/sql/privilege.h
+++ b/sql/privilege.h
@@ -598,6 +598,8 @@ constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_MAX_ALLOWED_PACKET=
REPL_SLAVE_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_MAX_STATEMENT_TIME=
REPL_SLAVE_ADMIN_ACL;
+constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_ABORT_BLOCKING_TIMEOUT=
+ REPL_SLAVE_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_NET_TIMEOUT=
REPL_SLAVE_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_PARALLEL_MAX_QUEUED=
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index b045bf14cd9..a96082e5903 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -1025,7 +1025,7 @@ void close_thread_table(THD *thd, TABLE **table_ptr)
thd->handler_stats.add(file->handler_stats);
}
/*
- This look is needed to allow THD::notify_shared_lock() to
+ This lock is needed to allow THD::notify_shared_lock() to
traverse the thd->open_tables list without having to worry that
some of the tables are removed from under it
*/
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 13818813574..595ffd681a5 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -2158,21 +2158,28 @@ void THD::disconnect()
bool THD::notify_shared_lock(MDL_context_owner *ctx_in_use,
- bool needs_thr_lock_abort)
+ bool needs_thr_lock_abort,
+ bool needs_non_slave_abort)
{
THD *in_use= ctx_in_use->get_thd();
bool signalled= FALSE;
DBUG_ENTER("THD::notify_shared_lock");
DBUG_PRINT("enter",("needs_thr_lock_abort: %d", needs_thr_lock_abort));
- if ((in_use->system_thread & SYSTEM_THREAD_DELAYED_INSERT) &&
- !in_use->killed)
+ enum killed_state kill_signal;
+ if (in_use->system_thread & SYSTEM_THREAD_DELAYED_INSERT)
+ kill_signal= KILL_CONNECTION;
+ else if (needs_non_slave_abort && !in_use->slave_thread)
+ kill_signal= KILL_QUERY;
+ else
+ kill_signal= NOT_KILLED;
+ if (kill_signal != NOT_KILLED && !in_use->killed)
{
/* This code is similar to kill_delayed_threads() */
DBUG_PRINT("info", ("kill delayed thread"));
mysql_mutex_lock(&in_use->LOCK_thd_kill);
- if (in_use->killed < KILL_CONNECTION)
- in_use->set_killed_no_mutex(KILL_CONNECTION);
+ if (in_use->killed < kill_signal)
+ in_use->set_killed_no_mutex(kill_signal);
in_use->abort_current_cond_wait(true);
mysql_mutex_unlock(&in_use->LOCK_thd_kill);
signalled= TRUE;
diff --git a/sql/sql_class.h b/sql/sql_class.h
index f79d99c902e..410dddd9d3a 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -4288,7 +4288,8 @@ class THD: public THD_count, /* this must be first */
@retval FALSE otherwise.
*/
bool notify_shared_lock(MDL_context_owner *ctx_in_use,
- bool needs_thr_lock_abort) override;
+ bool needs_thr_lock_abort,
+ bool needs_non_slave_abort) override;
// End implementation of MDL_context_owner interface.
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index d4997793428..25be6ffeda6 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -2591,6 +2591,17 @@ static Sys_var_on_access_global<
GLOBAL_VAR(slave_max_statement_time_double), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, LONG_TIMEOUT), DEFAULT(0), NO_MUTEX_GUARD,
NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_slave_max_statement_time));
+
+static Sys_var_on_access_global<
+ Sys_var_double, PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_ABORT_BLOCKING_TIMEOUT>
+ Sys_slave_abort_blocking_timeout(
+ "slave_abort_blocking_timeout",
+ "Maximum time a slave DDL will wait for a blocking SELECT or other "
+ "user query until that query will be aborted. The argument will be "
+ "treated as a decimal value with microsecond precision",
+ GLOBAL_VAR(slave_abort_blocking_timeout), CMD_LINE(REQUIRED_ARG),
+ VALID_RANGE(0, LONG_TIMEOUT), DEFAULT(LONG_TIMEOUT), NO_MUTEX_GUARD,
+ NOT_IN_BINLOG);
#endif
--
2.39.2
2
1
[PATCH] MDEV-32014 Rename binlog cache temporary file to binlog file for large transaction
by Kristian Nielsen 02 Sep '24
by Kristian Nielsen 02 Sep '24
02 Sep '24
From: Libing Song <anders.slb(a)alibaba-inc.com>
Description
===========
When a transaction commits, it copies the binlog events from
binlog cache to binlog file. Very large transactions
(eg. gigabytes) can stall other transactions for a long time
because the data is copied while holding LOCK_log, which blocks
other commits from binlogging.
The solution in this patch is to rename the binlog cache file to
a binlog file instead of copying it, if the committing transaction
has a large binlog cache. Rename is a very fast operation, so it
doesn't block other transactions for a long time.
The feature is called `Binlog Free Flush`. The term will be used
in the design and code.
Design
======
* binlog_free_flush_threshold
type: ulonglong
scope: global
dynamic: yes
default: 128MB
Only binlog cache temporary files larger than this threshold
(128MB by default) are renamed to a binlog file.
* #binlog_cache_files directory
To support rename, all binlog cache temporary files are now managed
as normal files. The `#binlog_cache_files` directory is located in the
same directory as the binlog files. It is created at server startup if it
doesn't exist. Otherwise, the binlog cache files left in the directory are
deleted at startup.
The temporary files are named with the ML_ prefix followed by the memory
address of the binlog_cache_data object, which guarantees that the name is
unique.
* Reserve space
To support the rename feature, enough space must be reserved at the
beginning of the binlog cache file. The space is required for the
Format description, Gtid list, checkpoint and Gtid events when
renaming it to a binlog file.
binlog_cache_data's cache_log is directly accessed by binlog log,
online alter and wsrep, so it is not easy to update all the code. Thus
the binlog cache will not reserve space if it is not a session binlog
cache or if a wsrep session is enabled.
- m_file_reserved_bytes
Stores the number of bytes reserved at the beginning of the cache file.
It is initialized in write_prepare() and cleared by reset().
The reserved file header is hidden from callers, so nothing changes
for them. E.g.
- get_byte_position() still returns the length of the binlog data
written to the cache, not the file length.
- truncate(0) will truncate the file to m_file_reserved_bytes, not to 0.
- write_prepare()
write_prepare() is called every time anything is about to be written
into the cache. It will call init_file_reserved_bytes() to create
the cache file (if it doesn't exist) and reserve suitable space if
the data written exceeds the buffer's size.
* Binlog_free_flush
It is used to encapsulate the code for renaming a binlog cache
temporary file to a binlog file.
- should_free_flush()
It is called by write_transaction_to_binlog_events() to check if
a binlog cache should be renamed to a binlog file.
- commit()
This is the entry point for renaming a binlog cache and committing
the transaction. Both rename and commit are protected by LOCK_log,
thus no other transaction can write anything into the renamed
binlog before it.
Rename happens during a rotation. After the new binlog file is generated,
replace_binlog_file() is called to:
- copy data from the binlog file to the binlog cache file.
- write the gtid event.
- rename the binlog cache file to the binlog file.
After that the rotation continues until it succeeds. Then the transaction
is committed. The transaction will be committed in a separate
group by itself. Its cache file will be detached and its cache log will be
reset before calling trx_group_commit_leader(). Thus only the Xid event
will be written.
---
libmysqld/CMakeLists.txt | 2 +-
mysql-test/main/mysqld--help.result | 5 +
mysql-test/main/tmp_space_usage.result | 9 +-
mysql-test/main/tmp_space_usage.test | 10 +-
.../binlog/r/binlog_free_flush_atomic.result | 68 +++
.../binlog/t/binlog_free_flush_atomic.test | 110 +++++
.../encryption/r/binlog_cache_encrypt.result | 18 +
.../t/binlog_cache_encrypt-master.opt | 1 +
.../encryption/t/binlog_cache_encrypt.test | 19 +
.../suite/rpl/r/rpl_binlog_free_flush.result | 117 +++++
.../suite/rpl/t/rpl_binlog_free_flush.test | 217 +++++++++
.../sys_vars/r/sysvars_server_embedded.result | 10 +
.../r/sysvars_server_notembedded.result | 10 +
sql/CMakeLists.txt | 2 +-
sql/log.cc | 432 ++++++++++++++++--
sql/log.h | 21 +-
sql/log_cache.cc | 122 +++++
sql/log_cache.h | 116 ++++-
sql/log_event.h | 13 +
sql/log_event_server.cc | 32 +-
sql/mysqld.cc | 4 +-
sql/sys_vars.cc | 10 +
22 files changed, 1288 insertions(+), 60 deletions(-)
create mode 100644 mysql-test/suite/binlog/r/binlog_free_flush_atomic.result
create mode 100644 mysql-test/suite/binlog/t/binlog_free_flush_atomic.test
create mode 100644 mysql-test/suite/encryption/r/binlog_cache_encrypt.result
create mode 100644 mysql-test/suite/encryption/t/binlog_cache_encrypt-master.opt
create mode 100644 mysql-test/suite/encryption/t/binlog_cache_encrypt.test
create mode 100644 mysql-test/suite/rpl/r/rpl_binlog_free_flush.result
create mode 100644 mysql-test/suite/rpl/t/rpl_binlog_free_flush.test
create mode 100644 sql/log_cache.cc
diff --git a/libmysqld/CMakeLists.txt b/libmysqld/CMakeLists.txt
index 12bcc7ce1aa..f0d837470e2 100644
--- a/libmysqld/CMakeLists.txt
+++ b/libmysqld/CMakeLists.txt
@@ -67,7 +67,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
../sql/item_subselect.cc ../sql/item_sum.cc ../sql/item_timefunc.cc
../sql/item_xmlfunc.cc ../sql/item_jsonfunc.cc
../sql/json_schema.cc ../sql/json_schema_helper.cc
- ../sql/key.cc ../sql/lock.cc ../sql/log.cc
+ ../sql/key.cc ../sql/lock.cc ../sql/log.cc ../sql/log_cache.cc
../sql/log_event.cc ../sql/log_event_server.cc
../sql/mf_iocache.cc ../sql/my_decimal.cc
../sql/net_serv.cc ../sql/opt_range.cc
diff --git a/mysql-test/main/mysqld--help.result b/mysql-test/main/mysqld--help.result
index a1752a356d9..862382b844d 100644
--- a/mysql-test/main/mysqld--help.result
+++ b/mysql-test/main/mysqld--help.result
@@ -95,6 +95,10 @@ The following specify which files/extra groups are read (specified before remain
statement-based binary logging (smaller binary logs),
MIXED for statement-based binary logging when it's safe
with fall back to row-based otherwise
+ --binlog-free-flush-threshold=#
+ Try to rename the binlog cache temporary file of the
+ commiting transaction to a binlog file when its binlog
+ cache size is bigger than the value of this variable
--binlog-gtid-index Enable the creation of a GTID index for every binlog
file, and the use of such index for speeding up GTID
lookup in the binlog
@@ -1620,6 +1624,7 @@ binlog-direct-non-transactional-updates FALSE
binlog-expire-logs-seconds 0
binlog-file-cache-size 16384
binlog-format MIXED
+binlog-free-flush-threshold 134217728
binlog-gtid-index TRUE
binlog-gtid-index-page-size 4096
binlog-gtid-index-span-min 65536
diff --git a/mysql-test/main/tmp_space_usage.result b/mysql-test/main/tmp_space_usage.result
index ff2f58ab437..242227e0783 100644
--- a/mysql-test/main/tmp_space_usage.result
+++ b/mysql-test/main/tmp_space_usage.result
@@ -160,16 +160,17 @@ ERROR HY000: Global temporary space limit reached
#
set @save_max_tmp_total_space_usage=@@global.max_tmp_total_space_usage;
set @@global.max_tmp_total_space_usage=64*1024*1024;
-set @@max_tmp_session_space_usage=1179648;
+set @@max_tmp_session_space_usage=1179648+65536;
select @@max_tmp_session_space_usage;
@@max_tmp_session_space_usage
-1179648
+1245184
set @save_aria_repair_threads=@@aria_repair_threads;
set @@aria_repair_threads=2;
set @save_max_heap_table_size=@@max_heap_table_size;
set @@max_heap_table_size=16777216;
CREATE TABLE t1 (a CHAR(255),b INT,INDEX (b));
INSERT INTO t1 SELECT SEQ,SEQ FROM seq_1_to_100000;
+set @@max_tmp_session_space_usage=1179648;
SELECT * FROM t1 UNION SELECT * FROM t1;
ERROR HY000: Local temporary space limit reached
DROP TABLE t1;
@@ -205,11 +206,13 @@ ERROR HY000: Local temporary space limit reached
#
connect c1, localhost, root,,;
set @@binlog_format=row;
-CREATE OR REPLACE TABLE t1 (a DATETIME) ENGINE=MyISAM;
+CREATE OR REPLACE TABLE t1 (a DATETIME) ENGINE=InnoDB;
+BEGIN;
INSERT INTO t1 SELECT NOW() FROM seq_1_to_6000;
SET max_tmp_session_space_usage = 64*1024;
SELECT * FROM information_schema.ALL_PLUGINS LIMIT 2;
ERROR HY000: Local temporary space limit reached
+ROLLBACK;
drop table t1;
connection default;
disconnect c1;
diff --git a/mysql-test/main/tmp_space_usage.test b/mysql-test/main/tmp_space_usage.test
index af7b295f343..1685dbbc450 100644
--- a/mysql-test/main/tmp_space_usage.test
+++ b/mysql-test/main/tmp_space_usage.test
@@ -215,7 +215,8 @@ select count(distinct concat(seq,repeat('x',1000))) from seq_1_to_1000;
set @save_max_tmp_total_space_usage=@@global.max_tmp_total_space_usage;
set @@global.max_tmp_total_space_usage=64*1024*1024;
-set @@max_tmp_session_space_usage=1179648;
+# Binlog cache reserve 4096 bytes at the begin of the temporary file.
+set @@max_tmp_session_space_usage=1179648+65536;
select @@max_tmp_session_space_usage;
set @save_aria_repair_threads=@@aria_repair_threads;
set @@aria_repair_threads=2;
@@ -224,6 +225,7 @@ set @@max_heap_table_size=16777216;
CREATE TABLE t1 (a CHAR(255),b INT,INDEX (b));
INSERT INTO t1 SELECT SEQ,SEQ FROM seq_1_to_100000;
+set @@max_tmp_session_space_usage=1179648;
--error 200
SELECT * FROM t1 UNION SELECT * FROM t1;
DROP TABLE t1;
@@ -266,11 +268,15 @@ SELECT MIN(VARIABLE_VALUE) OVER (), NTILE(1) OVER (), MAX(VARIABLE_NAME) OVER ()
connect(c1, localhost, root,,);
set @@binlog_format=row;
-CREATE OR REPLACE TABLE t1 (a DATETIME) ENGINE=MyISAM;
+CREATE OR REPLACE TABLE t1 (a DATETIME) ENGINE=InnoDB;
+# Use the transaction to keep binlog cache temporary file large enough
+BEGIN;
INSERT INTO t1 SELECT NOW() FROM seq_1_to_6000;
+
SET max_tmp_session_space_usage = 64*1024;
--error 200
SELECT * FROM information_schema.ALL_PLUGINS LIMIT 2;
+ROLLBACK;
drop table t1;
connection default;
disconnect c1;
diff --git a/mysql-test/suite/binlog/r/binlog_free_flush_atomic.result b/mysql-test/suite/binlog/r/binlog_free_flush_atomic.result
new file mode 100644
index 00000000000..31175d32581
--- /dev/null
+++ b/mysql-test/suite/binlog/r/binlog_free_flush_atomic.result
@@ -0,0 +1,68 @@
+RESET MASTER;
+#
+# binlog cache file is created in #binlog_cache_files directory
+# and it is deleted at disconnect
+#
+connect con1,localhost,root,,;
+CREATE TABLE t1 (c1 LONGTEXT) ENGINE = InnoDB;
+# list binlog_cache_files/
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t1 values(repeat("1", 5242880));
+FLUSH BINARY LOGS;
+# list #binlog_cache_files/
+ML_BINLOG_CACHE_FILE
+SET debug_sync = "thread_end SIGNAL signal.thread_end";
+disconnect con1;
+connection default;
+SET debug_sync = "now WAIT_FOR signal.thread_end";
+# binlog cache file is deleted at disconnection
+# list #binlog_cache_files/
+#
+# Reserved space is not big enough, rename will not happen. But rotate
+# will succeed.
+#
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+SET debug = 'd,simulate_required_size_too_big';
+UPDATE t1 SET c1 = repeat('2', 5242880);
+include/assert.inc [Binlog is rotated, but free flush is not executed.]
+#
+# Error happens when renaming binlog cache to binlog file, rename will
+# not happen. Since the original binlog is delete, the rotate will failed
+# too. binlog will be closed.
+#
+SET debug = 'd,simulate_rename_binlog_cache_to_binlog_error';
+UPDATE t1 SET c1 = repeat('3', 5242880);
+ERROR HY000: Can't open file: './master-bin.000004' (errno: 1 "Operation not permitted")
+SELECT count(*) FROM t1 WHERE c1 like "3%";
+count(*)
+2
+# Binlog is closed
+show master status;
+File Position Binlog_Do_DB Binlog_Ignore_DB
+# restart
+show master status;
+File Position Binlog_Do_DB Binlog_Ignore_DB
+master-bin.000004 # <Binlog_Do_DB> <Binlog_Ignore_DB>
+#
+# Crash happens before rename the file
+#
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+SET debug = 'd,binlog_free_flush_crash_before_rename';
+UPDATE t1 SET c1 = repeat('4', 5242880);
+ERROR HY000: Lost connection to server during query
+# One cache file left afte crash
+# list #binlog_cache_files/
+ML_BINLOG_CACHE_FILE
+non_binlog_cache
+# restart
+# The cache file is deleted at startup.
+# list #binlog_cache_files/
+non_binlog_cache
+include/assert_grep.inc [warning: non_binlog_cache file is in #binlog_cache_files/]
+include/show_binlog_events.inc
+Log_name Pos Event_type Server_id End_log_pos Info
+master-bin.000005 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
+master-bin.000005 # Gtid_list # # [#-#-#]
+call mtr.add_suppression(".*Turning logging off for the whole duration.*");
+call mtr.add_suppression(".*non_binlog_cache is in #binlog_cache_files/.*");
+DROP TABLE t1;
diff --git a/mysql-test/suite/binlog/t/binlog_free_flush_atomic.test b/mysql-test/suite/binlog/t/binlog_free_flush_atomic.test
new file mode 100644
index 00000000000..05b4792c314
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_free_flush_atomic.test
@@ -0,0 +1,110 @@
+################################################################################
+# MDEV-32014 Rename binlog cache to binlog file
+#
+# It verifies that the rename logic is handled correctly if an error happens.
+################################################################################
+--source include/have_binlog_format_row.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+RESET MASTER;
+
+--echo #
+--echo # binlog cache file is created in #binlog_cache_files directory
+--echo # and it is deleted at disconnect
+--echo #
+--connect(con1,localhost,root,,)
+CREATE TABLE t1 (c1 LONGTEXT) ENGINE = InnoDB;
+
+--echo # list binlog_cache_files/
+--let $datadir= `SELECT @@datadir`
+--list_files $datadir/#binlog_cache_files
+
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t1 values(repeat("1", 5242880));
+FLUSH BINARY LOGS;
+
+--echo # list #binlog_cache_files/
+--replace_regex /ML_[0-9]+/ML_BINLOG_CACHE_FILE/
+--list_files $datadir/#binlog_cache_files
+
+SET debug_sync = "thread_end SIGNAL signal.thread_end";
+--disconnect con1
+--connection default
+# Wait until the connection is closed completely.
+SET debug_sync = "now WAIT_FOR signal.thread_end";
+
+--echo # binlog cache file is deleted at disconnection
+--echo # list #binlog_cache_files/
+--list_files $datadir/#binlog_cache_files
+
+--echo #
+--echo # Reserved space is not big enough, rename will not happen. But rotate
+--echo # will succeed.
+--echo #
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+SET debug = 'd,simulate_required_size_too_big';
+UPDATE t1 SET c1 = repeat('2', 5242880);
+
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000002' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos < 4096
+--let $assert_text= Binlog is rotated, but free flush is not executed.
+--source include/assert.inc
+
+--echo #
+--echo # Error happens when renaming binlog cache to binlog file, rename will
+--echo # not happen. Since the original binlog is delete, the rotate will failed
+--echo # too. binlog will be closed.
+--echo #
+SET debug = 'd,simulate_rename_binlog_cache_to_binlog_error';
+--error ER_CANT_OPEN_FILE
+UPDATE t1 SET c1 = repeat('3', 5242880);
+SELECT count(*) FROM t1 WHERE c1 like "3%";
+
+--echo # Binlog is closed
+--source include/show_master_status.inc
+
+--source include/restart_mysqld.inc
+--source include/show_master_status.inc
+
+--echo #
+--echo # Crash happens before rename the file
+--echo #
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+
+SET debug = 'd,binlog_free_flush_crash_before_rename';
+--source include/expect_crash.inc
+--error 2013
+UPDATE t1 SET c1 = repeat('4', 5242880);
+
+--write_file $datadir/#binlog_cache_files/non_binlog_cache
+It is not a binlog cache file
+EOF
+
+--echo # One cache file left afte crash
+--echo # list #binlog_cache_files/
+--replace_regex /ML_[0-9]+/ML_BINLOG_CACHE_FILE/
+--list_files $datadir/#binlog_cache_files
+
+--source include/start_mysqld.inc
+--echo # The cache file is deleted at startup.
+--echo # list #binlog_cache_files/
+--list_files $datadir/#binlog_cache_files
+
+--let $assert_text= warning: non_binlog_cache file is in #binlog_cache_files/
+--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.1.err
+--let $assert_select= non_binlog_cache.*#binlog_cache_files/
+--let $assert_count= 1
+--let $assert_only_after= CURRENT_TEST: binlog.binlog_free_flush_atomic
+--source include/assert_grep.inc
+
+--remove_file $datadir/#binlog_cache_files/non_binlog_cache
+
+--let $binlog_file= LAST
+--let $binlog_start= 4
+--let $skip_checkpoint_events= 1
+--source include/show_binlog_events.inc
+
+call mtr.add_suppression(".*Turning logging off for the whole duration.*");
+call mtr.add_suppression(".*non_binlog_cache is in #binlog_cache_files/.*");
+DROP TABLE t1;
+
diff --git a/mysql-test/suite/encryption/r/binlog_cache_encrypt.result b/mysql-test/suite/encryption/r/binlog_cache_encrypt.result
new file mode 100644
index 00000000000..a479a39cbcf
--- /dev/null
+++ b/mysql-test/suite/encryption/r/binlog_cache_encrypt.result
@@ -0,0 +1,18 @@
+RESET MASTER;
+CREATE TABLE t1 (c1 LONGTEXT) ENGINE = InnoDB;
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t1 values(repeat("1", 5242880));
+FLUSH BINARY LOGS;
+SET @saved_threshold= @@GLOBAL.binlog_free_flush_threshold;
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+UPDATE t1 SET c1 = repeat('2', 5242880);
+include/show_binlog_events.inc
+Log_name Pos Event_type Server_id End_log_pos Info
+master-bin.000002 # Gtid # # BEGIN GTID #-#-#
+master-bin.000002 # Annotate_rows # # UPDATE t1 SET c1 = repeat('2', 5242880)
+master-bin.000002 # Table_map # # table_id: # (test.t1)
+master-bin.000002 # Update_rows_v1 # # table_id: #
+master-bin.000002 # Update_rows_v1 # # table_id: # flags: STMT_END_F
+master-bin.000002 # Xid # # COMMIT /* XID */
+SET GLOBAL binlog_free_flush_threshold = @saved_threshold;
+DROP TABLE t1;
diff --git a/mysql-test/suite/encryption/t/binlog_cache_encrypt-master.opt b/mysql-test/suite/encryption/t/binlog_cache_encrypt-master.opt
new file mode 100644
index 00000000000..469148de64a
--- /dev/null
+++ b/mysql-test/suite/encryption/t/binlog_cache_encrypt-master.opt
@@ -0,0 +1 @@
+--encrypt-tmp-files=on
diff --git a/mysql-test/suite/encryption/t/binlog_cache_encrypt.test b/mysql-test/suite/encryption/t/binlog_cache_encrypt.test
new file mode 100644
index 00000000000..93725ce653e
--- /dev/null
+++ b/mysql-test/suite/encryption/t/binlog_cache_encrypt.test
@@ -0,0 +1,19 @@
+--source include/have_file_key_management_plugin.inc
+--source include/have_binlog_format_row.inc
+--source include/have_innodb.inc
+RESET MASTER;
+CREATE TABLE t1 (c1 LONGTEXT) ENGINE = InnoDB;
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t1 values(repeat("1", 5242880));
+FLUSH BINARY LOGS;
+
+SET @saved_threshold= @@GLOBAL.binlog_free_flush_threshold;
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+UPDATE t1 SET c1 = repeat('2', 5242880);
+
+--let $binlog_file= LAST
+--let $skip_checkpoint_events=1
+--source include/show_binlog_events.inc
+
+SET GLOBAL binlog_free_flush_threshold = @saved_threshold;
+DROP TABLE t1;
diff --git a/mysql-test/suite/rpl/r/rpl_binlog_free_flush.result b/mysql-test/suite/rpl/r/rpl_binlog_free_flush.result
new file mode 100644
index 00000000000..a216e252dd8
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_binlog_free_flush.result
@@ -0,0 +1,117 @@
+include/master-slave.inc
+[connection master]
+# Prepare
+SET @saved_threshold= @@GLOBAL.binlog_free_flush_threshold;
+SET @saved_checksum= @@GLOBAL.binlog_checksum;
+SET GLOBAL binlog_checksum = "NONE";
+CREATE TABLE t1 (c1 LONGTEXT) ENGINE = InnoDB;
+CREATE TABLE t2 (c1 LONGTEXT) ENGINE = MyISAM;
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t2 values(repeat("1", 5242880));
+INSERT INTO t2 values(repeat("1", 5242880));
+FLUSH BINARY LOGS;
+# Not renamed to binlog, since the binlog cache is not larger than the
+# threshold. And it should works well after ROLLBACK TO SAVEPOINT
+BEGIN;
+SAVEPOINT s1;
+UPDATE t1 SET c1 = repeat('1', 5242880);
+ROLLBACK TO SAVEPOINT s1;
+UPDATE t1 SET c1 = repeat('2', 5242880);
+SAVEPOINT s2;
+UPDATE t1 SET c1 = repeat('3', 5242880);
+UPDATE t1 SET c1 = repeat('4', 5242880);
+ROLLBACK TO SAVEPOINT s2;
+COMMIT;
+include/assert.inc [Binlog is not rotated]
+#
+# Test binlog cache rename to binlog file with checksum off
+#
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+# Transaction cache can be renamed and works well with ROLLBACK TO SAVEPOINT
+BEGIN;
+SAVEPOINT s1;
+UPDATE t1 SET c1 = repeat('2', 5242880);
+ROLLBACK TO s1;
+UPDATE t1 SET c1 = repeat('3', 5242880);
+SAVEPOINT s2;
+UPDATE t1 SET c1 = repeat('4', 5242880);
+UPDATE t1 SET c1 = repeat('5', 5242880);
+UPDATE t1 SET c1 = repeat('6', 5242880);
+ROLLBACK TO SAVEPOINT s2;
+COMMIT;
+INSERT INTO t1 VALUES("after_update");
+include/assert.inc [Free flush is executed.]
+# statement cache can be renamed
+BEGIN;
+UPDATE t2 SET c1 = repeat('4', 5242880);
+INSERT INTO t1 VALUES("after_update");
+COMMIT;
+include/assert.inc [Free flush is executed.]
+# CREATE SELECT works well
+CREATE TABLE t3 SELECT * FROM t1;
+include/assert.inc [Free flush is executed.]
+CREATE TABLE t4 SELECT * FROM t2;
+include/assert.inc [Free flush is executed.]
+# XA statement works well
+XA START "test-a-long-xid========================================";
+UPDATE t1 SET c1 = repeat('1', 5242880);
+XA END "test-a-long-xid========================================";
+XA PREPARE "test-a-long-xid========================================";
+XA COMMIT "test-a-long-xid========================================";
+include/assert.inc [Free flush is executed.]
+XA START "test-xid";
+UPDATE t1 SET c1 = repeat('2', 5242880);
+XA END "test-xid";
+XA COMMIT "test-xid" ONE PHASE;
+include/assert.inc [Free flush is executed.]
+#
+# It works well in the situation that binlog header is larger than
+# IO_SIZE and binlog file's buffer.
+#
+FLUSH BINARY LOGS;
+SET SESSION server_id = 1;
+UPDATE t1 SET c1 = repeat('3', 5242880);
+include/assert.inc [Free flush is executed.]
+#
+# RESET MASTER should work well. It also verifies binlog checksum mechanism.
+#
+include/rpl_reset.inc
+#
+# Test binlog cache rename to binlog file with checksum on
+#
+SET GLOBAL binlog_checksum = "CRC32";
+# It will not rename the cache to file, since the cache's checksum was
+# initialized when reset the cache at the end of previous transaction.
+UPDATE t1 SET c1 = repeat('5', 5242880);
+include/assert.inc [Binlog is not rotated]
+#
+# Not rename to binlog file If the cache's checksum is not same
+# to binlog_checksum
+#
+BEGIN;
+UPDATE t1 SET c1 = repeat('6', 5242880);
+SET GLOBAL binlog_checksum = "NONE";
+COMMIT;
+include/assert.inc [Binlog is not rotated]
+BEGIN;
+UPDATE t1 SET c1 = repeat('7', 5242880);
+SET GLOBAL binlog_checksum = "CRC32";
+COMMIT;
+include/assert.inc [Binlog is not rotated]
+#
+# Not rename to binlog file If both stmt and trx cache are not empty
+#
+UPDATE t1, t2 SET t1.c1 = repeat('8', 5242880), t2.c1 = repeat('7', 5242880);
+include/assert.inc [Binlog is not rotated]
+#
+# Not rename to binlog file If binlog_legacy_event_pos is on
+#
+SET GLOBAL binlog_legacy_event_pos = ON;
+UPDATE t1 SET c1 = repeat('9', 5242880);
+SET GLOBAL binlog_legacy_event_pos = OFF;
+include/assert.inc [Binlog is not rotated]
+DROP TABLE t1, t2, t3, t4;
+SET GLOBAL binlog_free_flush_threshold = @saved_threshold;
+SET GLOBAL binlog_checksum = @saved_checksum;
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_binlog_free_flush.test b/mysql-test/suite/rpl/t/rpl_binlog_free_flush.test
new file mode 100644
index 00000000000..5152779dd48
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_binlog_free_flush.test
@@ -0,0 +1,217 @@
+################################################################################
+# MDEV-32014 Rename binlog cache to binlog file
+#
+# It verifies that binlog caches which are larger
+# than binlog_free_flush_threshold can be moved to a binlog file
+# successfully. With a successful rename,
+# - it rotates the binlog and the cache is renamed to the new binlog file
+# - an ignorable event is generated just after the Gtid_log_event of the
+# transaction to take up the reserved space that is left unused.
+#
+# It also verifies that the rename is not done in the cases below,
+# even though the cache is larger than the threshold:
+# - both the statement and the transaction cache have to be flushed.
+# - the cache's checksum option is not the same as binlog_checksum.
+# - binlog_legacy_event_pos is enabled.
+################################################################################
+--source include/master-slave.inc
+--source include/have_binlog_format_row.inc
+--source include/have_innodb.inc
+
+--echo # Prepare
+SET @saved_threshold= @@GLOBAL.binlog_free_flush_threshold;
+SET @saved_checksum= @@GLOBAL.binlog_checksum;
+
+SET GLOBAL binlog_checksum = "NONE";
+
+CREATE TABLE t1 (c1 LONGTEXT) ENGINE = InnoDB;
+CREATE TABLE t2 (c1 LONGTEXT) ENGINE = MyISAM;
+
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t1 values(repeat("1", 5242880));
+INSERT INTO t2 values(repeat("1", 5242880));
+INSERT INTO t2 values(repeat("1", 5242880));
+
+FLUSH BINARY LOGS;
+
+--echo # Not renamed to binlog, since the binlog cache is not larger than the
+--echo # threshold. And it should works well after ROLLBACK TO SAVEPOINT
+BEGIN;
+SAVEPOINT s1;
+UPDATE t1 SET c1 = repeat('1', 5242880);
+ROLLBACK TO SAVEPOINT s1;
+UPDATE t1 SET c1 = repeat('2', 5242880);
+SAVEPOINT s2;
+UPDATE t1 SET c1 = repeat('3', 5242880);
+UPDATE t1 SET c1 = repeat('4', 5242880);
+ROLLBACK TO SAVEPOINT s2;
+COMMIT;
+
+--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $assert_cond= "$binlog_file" = "master-bin.000003"
+--let $assert_text= Binlog is not rotated
+--source include/assert.inc
+
+--echo #
+--echo # Test binlog cache rename to binlog file with checksum off
+--echo #
+SET GLOBAL binlog_free_flush_threshold = 10 * 1024 * 1024;
+
+--echo # Transaction cache can be renamed and works well with ROLLBACK TO SAVEPOINT
+BEGIN;
+SAVEPOINT s1;
+UPDATE t1 SET c1 = repeat('2', 5242880);
+ROLLBACK TO s1;
+UPDATE t1 SET c1 = repeat('3', 5242880);
+SAVEPOINT s2;
+UPDATE t1 SET c1 = repeat('4', 5242880);
+UPDATE t1 SET c1 = repeat('5', 5242880);
+UPDATE t1 SET c1 = repeat('6', 5242880);
+ROLLBACK TO SAVEPOINT s2;
+COMMIT;
+INSERT INTO t1 VALUES("after_update");
+
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000004' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos = 4096
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+--echo # statement cache can be renamed
+BEGIN;
+UPDATE t2 SET c1 = repeat('4', 5242880);
+INSERT INTO t1 VALUES("after_update");
+COMMIT;
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000005' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos = 4096
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+--echo # CREATE SELECT works well
+CREATE TABLE t3 SELECT * FROM t1;
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000006' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos = 4096
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+CREATE TABLE t4 SELECT * FROM t2;
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000007' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos = 4096
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+--echo # XA statement works well
+XA START "test-a-long-xid========================================";
+UPDATE t1 SET c1 = repeat('1', 5242880);
+XA END "test-a-long-xid========================================";
+XA PREPARE "test-a-long-xid========================================";
+XA COMMIT "test-a-long-xid========================================";
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000008' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos = 4096
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+XA START "test-xid";
+UPDATE t1 SET c1 = repeat('2', 5242880);
+XA END "test-xid";
+XA COMMIT "test-xid" ONE PHASE;
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000009' LIMIT 4, End_log_pos, 4)
+--let $assert_cond= $gtid_end_pos = 4096
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+--echo #
+--echo # It works well in the situation that binlog header is larger than
+--echo # IO_SIZE and binlog file's buffer.
+--echo #
+--disable_query_log
+
+# make Gtid_list_event larger than 64K(binlog file's buffer)
+--let $server_id= 100000
+while ($server_id < 104096)
+{
+ eval SET SESSION server_id = $server_id;
+ eval UPDATE t1 SET c1 = "$server_id" LIMIT 1;
+ --inc $server_id
+}
+
+--enable_query_log
+
+# After flush, reserved space should be updated.
+FLUSH BINARY LOGS;
+
+SET SESSION server_id = 1;
+UPDATE t1 SET c1 = repeat('3', 5242880);
+
+--let $gtid_end_pos= query_get_value(SHOW BINLOG EVENTS IN 'master-bin.000011' LIMIT 4, End_log_pos, 4)
+# 69632 is 68K (17 * 4096), which is larger than the binlog's 64K buffer
+--let $assert_cond= $gtid_end_pos = 69632
+--let $assert_text= Free flush is executed.
+--source include/assert.inc
+
+--echo #
+--echo # RESET MASTER should work well. It also verifies binlog checksum mechanism.
+--echo #
+--source include/rpl_reset.inc
+
+--echo #
+--echo # Test binlog cache rename to binlog file with checksum on
+--echo #
+SET GLOBAL binlog_checksum = "CRC32";
+
+--echo # It will not rename the cache to file, since the cache's checksum was
+--echo # initialized when reset the cache at the end of previous transaction.
+UPDATE t1 SET c1 = repeat('5', 5242880);
+--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $assert_cond= "$binlog_file" = "master-bin.000002"
+--let $assert_text= Binlog is not rotated
+--source include/assert.inc
+
+--echo #
+--echo # Not rename to binlog file If the cache's checksum is not same
+--echo # to binlog_checksum
+--echo #
+BEGIN;
+UPDATE t1 SET c1 = repeat('6', 5242880);
+SET GLOBAL binlog_checksum = "NONE";
+COMMIT;
+--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $assert_cond= "$binlog_file" = "master-bin.000003"
+--let $assert_text= Binlog is not rotated
+--source include/assert.inc
+
+BEGIN;
+UPDATE t1 SET c1 = repeat('7', 5242880);
+SET GLOBAL binlog_checksum = "CRC32";
+COMMIT;
+--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $assert_cond= "$binlog_file" = "master-bin.000004"
+--let $assert_text= Binlog is not rotated
+--source include/assert.inc
+
+--echo #
+--echo # Not rename to binlog file If both stmt and trx cache are not empty
+--echo #
+UPDATE t1, t2 SET t1.c1 = repeat('8', 5242880), t2.c1 = repeat('7', 5242880);
+--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $assert_cond= "$binlog_file" = "master-bin.000004"
+--let $assert_text= Binlog is not rotated
+--source include/assert.inc
+
+--echo #
+--echo # Not rename to binlog file If binlog_legacy_event_pos is on
+--echo #
+SET GLOBAL binlog_legacy_event_pos = ON;
+UPDATE t1 SET c1 = repeat('9', 5242880);
+SET GLOBAL binlog_legacy_event_pos = OFF;
+--let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
+--let $assert_cond= "$binlog_file" = "master-bin.000004"
+--let $assert_text= Binlog is not rotated
+--source include/assert.inc
+
+# cleanup
+DROP TABLE t1, t2, t3, t4;
+SET GLOBAL binlog_free_flush_threshold = @saved_threshold;
+SET GLOBAL binlog_checksum = @saved_checksum;
+--let $binlog_file=
+--let $skip_checkpoint_events=0
+--source include/rpl_end.inc
diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result
index 7a113a02b02..8771e4ac87f 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_embedded.result
@@ -432,6 +432,16 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST MIXED,STATEMENT,ROW
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
+VARIABLE_NAME BINLOG_FREE_FLUSH_THRESHOLD
+VARIABLE_SCOPE GLOBAL
+VARIABLE_TYPE BIGINT UNSIGNED
+VARIABLE_COMMENT Try to rename the binlog cache temporary file of the commiting transaction to a binlog file when its binlog cache size is bigger than the value of this variable
+NUMERIC_MIN_VALUE 10485760
+NUMERIC_MAX_VALUE 18446744073709551615
+NUMERIC_BLOCK_SIZE 1
+ENUM_VALUE_LIST NULL
+READ_ONLY NO
+COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_GTID_INDEX
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
diff --git a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
index 16d77d397d4..ceb141cfc7a 100644
--- a/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
+++ b/mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result
@@ -452,6 +452,16 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST MIXED,STATEMENT,ROW
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
+VARIABLE_NAME BINLOG_FREE_FLUSH_THRESHOLD
+VARIABLE_SCOPE GLOBAL
+VARIABLE_TYPE BIGINT UNSIGNED
+VARIABLE_COMMENT Try to rename the binlog cache temporary file of the commiting transaction to a binlog file when its binlog cache size is bigger than the value of this variable
+NUMERIC_MIN_VALUE 10485760
+NUMERIC_MAX_VALUE 18446744073709551615
+NUMERIC_BLOCK_SIZE 1
+ENUM_VALUE_LIST NULL
+READ_ONLY NO
+COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_GTID_INDEX
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 0195555efaf..050cd54d98c 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -107,7 +107,7 @@ SET (SQL_SOURCE
hostname.cc init.cc item.cc item_buff.cc item_cmpfunc.cc
item_create.cc item_func.cc item_geofunc.cc item_row.cc
item_strfunc.cc item_subselect.cc item_sum.cc item_timefunc.cc
- key.cc log.cc lock.cc
+ key.cc log.cc log_cache.cc lock.cc
log_event.cc log_event_server.cc
rpl_record.cc rpl_reporting.cc
mf_iocache.cc my_decimal.cc
diff --git a/sql/log.cc b/sql/log.cc
index 34f9ad745fc..3dc57b21c05 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -163,6 +163,111 @@ static SHOW_VAR binlog_status_vars_detail[]=
{NullS, NullS, SHOW_LONG}
};
+/**
+ This class implements the feature of renaming a binlog cache temporary file
+ to a binlog file. It is used to avoid holding LOCK_log for a long time when
+ writing a huge binlog cache to the binlog file.
+
+ With this feature, temporary files of binlog caches will be created in
+ BINLOG_CACHE_DIR, which is created in the same directory as the binlog files
+ at server startup.
+*/
+class Binlog_free_flush
+{
+public:
+ Binlog_free_flush() {}
+
+ /**
+ Check whether free flush should be executed on the cache_data.
+
+ @param entry group_commit_entry object of current transaction
+
+ @retval true it should do free flush
+ @retval false it should do normal commit.
+ */
+ bool should_free_flush(const MYSQL_BIN_LOG::group_commit_entry *entry) const;
+
+ /**
+ This function is the entry function to do free flush. It first
+ rotates the binlog, then renames the temporary file of the
+ binlog cache to the new binlog file, and after that commits the transaction.
+
+ @param entry group_commit_entry object of current transaction.
+
+ @retval true free flush succeeds.
+ @retval false free flush fails, it should go to normal commit process.
+ */
+ bool commit(MYSQL_BIN_LOG::group_commit_entry *entry);
+
+ /**
+ After rotate has created the new binlog file, it copies the content
+ of the new binlog file to the binlog cache, deletes the new binlog file
+ and then renames the binlog cache to the new binlog file.
+
+ @retval false No fatal error; MYSQL_BIN_LOG::open() carries on normally.
+ @retval true An error happened after the new binlog file was
+ deleted. MYSQL_BIN_LOG::open() treats a true return
+ as a failure, so the rotate process will fail.
+ */
+ bool replace_binlog_file();
+ /**
+ The space left is more than a gtid event requires, thus the extra
+ space is padded into the gtid event as 0. This function is used
+ to calculate the real gtid event size including the pad.
+ */
+ size_t get_gtid_event_pad_size();
+
+ /**
+ Set the space that session binlog caches must reserve: header_len plus
+ the maximum Gtid event length, rounded up to the next multiple of IO_SIZE
+ (a full IO_SIZE is added when the sum is already a multiple).
+
+ @param header_len header length of current binlog file.
+ */
+ void set_reserved_bytes(uint32 header_len)
+ {
+ // Add the maximum length of a Gtid event (header + data + checksum)
+ header_len+= LOG_EVENT_HEADER_LEN + Gtid_log_event::max_data_length +
+ BINLOG_CHECKSUM_LEN;
+ header_len= header_len - (header_len % IO_SIZE) + IO_SIZE;
+ if (header_len != m_reserved_bytes)
+ m_reserved_bytes= header_len;
+ }
+
+ /**
+ Return reserved space required for binlog cache. m_reserved_bytes is NOT
+ defined as an atomic variable, although it is read and set in parallel.
+ Synchronizing between set and get is not really necessary.
+ */
+ uint32 get_reserved_size()
+ {
+ return m_reserved_bytes;
+ }
+private:
+ Binlog_free_flush &operator=(const Binlog_free_flush &);
+ Binlog_free_flush(const Binlog_free_flush &);
+
+ char m_cache_dir[FN_REFLEN];
+
+ /** The commit entry of the current transaction which is doing free flush. */
+ MYSQL_BIN_LOG::group_commit_entry *m_entry{nullptr};
+
+ /** The cache_data which will be renamed to binlog. */
+ binlog_cache_data *m_cache_data{nullptr};
+
+ /** It will be set to true if the rename operation succeeds. */
+ bool m_replaced{false};
+
+ uint32 m_reserved_bytes {IO_SIZE};
+};
+static Binlog_free_flush binlog_free_flush;
+ulonglong opt_binlog_free_flush_threshold= 10 * 1024 * 1024;
+
+uint32 binlog_cache_reserved_size() /* reserved size tracked by the file-local binlog_free_flush object */
+{
+ return binlog_free_flush.get_reserved_size();
+}
+
/*
Variables for the binlog background thread.
Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
@@ -3761,7 +3866,8 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
enum cache_type io_cache_type_arg,
ulong max_size_arg,
bool null_created_arg,
- bool need_mutex)
+ bool need_mutex,
+ bool is_free_flush)
{
xid_count_per_binlog *new_xid_list_entry= NULL, *b;
DBUG_ENTER("MYSQL_BIN_LOG::open");
@@ -4027,14 +4133,20 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
goto err;
bytes_written+= description_event_for_queue->data_written;
}
+
+ // offset must be saved before replace_binlog_file(), it will update the pos
+ my_off_t offset= my_b_tell(&log_file);
+
+ if (is_free_flush && binlog_free_flush.replace_binlog_file())
+ goto err;
+
if (flush_io_cache(&log_file) ||
mysql_file_sync(log_file.file, MYF(MY_WME)))
goto err;
- my_off_t offset= my_b_tell(&log_file);
-
if (!is_relay_log)
{
+ binlog_free_flush.set_reserved_bytes((uint32)offset);
/* update binlog_end_pos so that it can be read by after sync hook */
reset_binlog_end_pos(log_file_name, offset);
@@ -4126,8 +4238,7 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
/* Notify the io thread that binlog is rotated to a new file */
if (is_relay_log)
signal_relay_log_update();
- else
- update_binlog_end_pos();
+
DBUG_RETURN(0);
err:
@@ -5717,9 +5828,9 @@ int MYSQL_BIN_LOG::new_file()
@retval
nonzero - error
*/
-int MYSQL_BIN_LOG::new_file_without_locking()
+int MYSQL_BIN_LOG::new_file_without_locking(bool is_free_flush)
{
- return new_file_impl();
+ return new_file_impl(is_free_flush);
}
@@ -5734,7 +5845,7 @@ int MYSQL_BIN_LOG::new_file_without_locking()
binlog_space_total will be updated if binlog_space_limit is set
*/
-int MYSQL_BIN_LOG::new_file_impl()
+int MYSQL_BIN_LOG::new_file_impl(bool is_free_flush)
{
int error= 0, close_on_error= FALSE;
char new_name[FN_REFLEN], *new_name_ptr, *old_name, *file_to_open;
@@ -5856,7 +5967,8 @@ int MYSQL_BIN_LOG::new_file_impl()
{
/* reopen the binary log file. */
file_to_open= new_name_ptr;
- error= open(old_name, new_name_ptr, 0, io_cache_type, max_size, 1, FALSE);
+ error= open(old_name, new_name_ptr, 0, io_cache_type, max_size, 1, FALSE,
+ is_free_flush);
}
/* handle reopening errors */
@@ -6207,11 +6319,11 @@ static binlog_cache_mngr *binlog_setup_cache_mngr(THD *thd)
sizeof(binlog_cache_mngr),
MYF(MY_ZEROFILL));
if (!cache_mngr ||
- open_cached_file(&cache_mngr->stmt_cache.cache_log, mysql_tmpdir,
- LOG_PREFIX, (size_t)binlog_stmt_cache_size,
+ open_cached_file(&cache_mngr->stmt_cache.cache_log, binlog_cache_dir,
+ LOG_PREFIX, (size_t) binlog_stmt_cache_size,
MYF(MY_WME | MY_TRACK_WITH_LIMIT)) ||
- open_cached_file(&cache_mngr->trx_cache.cache_log, mysql_tmpdir,
- LOG_PREFIX, (size_t)binlog_cache_size,
+ open_cached_file(&cache_mngr->trx_cache.cache_log, binlog_cache_dir,
+ LOG_PREFIX, (size_t) binlog_cache_size,
MYF(MY_WME | MY_TRACK_WITH_LIMIT)))
{
my_free(cache_mngr);
@@ -6866,7 +6978,8 @@ Event_log::prepare_pending_rows_event(THD *thd, TABLE* table,
bool
MYSQL_BIN_LOG::write_gtid_event(THD *thd, bool standalone,
bool is_transactional, uint64 commit_id,
- bool has_xid, bool is_ro_1pc)
+ bool has_xid, bool is_ro_1pc,
+ bool is_free_flush)
{
rpl_gtid gtid;
uint32 domain_id;
@@ -6934,6 +7047,9 @@ MYSQL_BIN_LOG::write_gtid_event(THD *thd, bool standalone,
}
#endif
+ if (unlikely(is_free_flush))
+ gtid_event.pad_to_size= binlog_free_flush.get_gtid_event_pad_size();
+
 if (write_event(&gtid_event))
DBUG_RETURN(true);
status_var_add(thd->status_var.binlog_bytes_written, gtid_event.data_written);
@@ -7623,7 +7739,8 @@ MYSQL_BIN_LOG::do_checkpoint_request(ulong binlog_id)
@retval
nonzero - error in rotating routine.
*/
-int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
+int MYSQL_BIN_LOG::rotate(bool force_rotate, bool *check_purge,
+ bool is_free_flush)
{
int error= 0;
ulonglong binlog_pos;
@@ -7664,7 +7781,7 @@ int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
*/
mark_xids_active(binlog_id, 1);
- if (unlikely((error= new_file_without_locking())))
+ if (unlikely((error= new_file_without_locking(is_free_flush))))
{
/**
Be conservative... There are possible lost events (eg,
@@ -7965,12 +8082,14 @@ int Event_log::write_cache_raw(THD *thd, IO_CACHE *cache)
int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data)
{
- int res;
IO_CACHE *cache= &cache_data->cache_log;
DBUG_ENTER("Event_log::write_cache");
mysql_mutex_assert_owner(&LOCK_log);
+ if (cache_data->init_for_read())
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+
/*
If possible, just copy the cache over byte-by-byte with pre-computed
checksums.
@@ -7979,14 +8098,12 @@ int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data)
likely(!crypto.scheme) &&
likely(!opt_binlog_legacy_event_pos))
{
- int res= my_b_copy_all_to_cache(cache, &log_file);
+ int res=
+ my_b_copy_to_cache(cache, &log_file, cache_data->length_for_read());
status_var_add(thd->status_var.binlog_bytes_written, my_b_tell(cache));
DBUG_RETURN(res ? ER_ERROR_ON_WRITE : 0);
}
- if ((res= reinit_io_cache(cache, READ_CACHE, 0, 0, 0)))
- DBUG_RETURN(ER_ERROR_ON_WRITE);
-
/* Amount of remaining bytes in the IO_CACHE read buffer. */
size_t log_file_pos;
uchar header_buf[LOG_EVENT_HEADER_LEN];
@@ -8301,6 +8418,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
DBUG_RETURN(0);
}
+ entry.next= nullptr;
entry.thd= thd;
entry.cache_mngr= cache_mngr;
entry.error= 0;
@@ -8703,7 +8821,16 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
bool
MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
{
- int is_leader= queue_for_group_commit(entry);
+ int is_leader;
+
+ if (binlog_free_flush.should_free_flush(entry) &&
+ binlog_free_flush.commit(entry))
+ {
+ is_leader= 1;
+ goto commit;
+ }
+
+ is_leader= queue_for_group_commit(entry);
#ifdef WITH_WSREP
/* commit order was released in queue_for_group_commit() call,
here we check if wsrep_commit_ordered() failed or if we are leader */
@@ -8754,6 +8881,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
*/
}
+commit:
if (!opt_optimize_thread_scheduling)
{
/* For the leader, trx_group_commit_leader() already took the lock. */
@@ -8852,7 +8980,8 @@ MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
*/
void
-MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
+MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader,
+ bool is_free_flush)
{
uint xid_count= 0;
my_off_t UNINIT_VAR(commit_offset);
@@ -8863,6 +8992,16 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
uint64 commit_id;
DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
+ /*
+ When moving a binlog cache to a binlog file, the transaction forms
+ a group by itself.
+ */
+ if (unlikely(is_free_flush))
+ {
+ last_in_queue= leader;
+ queue= leader;
+ }
+ else
{
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("inject_binlog_commit_before_get_LOCK_log",
@@ -8888,7 +9027,6 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
current= group_commit_queue;
group_commit_queue= NULL;
mysql_mutex_unlock(&LOCK_prepare_ordered);
- binlog_id= current_binlog_id;
/* As the queue is in reverse order of entering, reverse it. */
last_in_queue= current;
@@ -8908,8 +9046,9 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
/* Now we have in queue the list of transactions to be committed in order. */
}
-
- DBUG_ASSERT(is_open());
+
+ binlog_id= current_binlog_id;
+
if (likely(is_open())) // Should always be true
{
commit_id= (last_in_queue == leader ? 0 : (uint64)leader->thd->query_id);
@@ -8944,10 +9083,11 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
*/
DBUG_ASSERT(!cache_mngr->stmt_cache.empty() ||
!cache_mngr->trx_cache.empty() ||
- current->thd->transaction->xid_state.is_explicit_XA());
+ current->thd->transaction->xid_state.is_explicit_XA() ||
+ is_free_flush);
- if (unlikely((current->error= write_transaction_or_stmt(current,
- commit_id))))
+ if (unlikely((current->error= write_transaction_or_stmt(
+ current, commit_id, is_free_flush))))
current->commit_errno= errno;
strmake_buf(cache_mngr->last_commit_pos_file, log_file_name);
@@ -9203,7 +9343,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
int
MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
- uint64 commit_id)
+ uint64 commit_id, bool is_free_flush)
{
binlog_cache_mngr *mngr= entry->cache_mngr;
bool has_xid= entry->end_event->get_type_code() == XID_EVENT;
@@ -9224,10 +9364,17 @@ MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
DBUG_ASSERT(!(entry->using_stmt_cache && !mngr->stmt_cache.empty() &&
mngr->get_binlog_cache_log(FALSE)->error));
- if (write_gtid_event(entry->thd, is_prepared_xa(entry->thd),
- entry->using_trx_cache, commit_id,
- has_xid, entry->ro_1pc))
- DBUG_RETURN(ER_ERROR_ON_WRITE);
+ /*
+ gtid will be written when renaming the binlog cache to binlog file,
+ if is_free_flush is true. Thus skip write_gtid_event here.
+ */
+ if (likely(!is_free_flush))
+ {
+ if (write_gtid_event(entry->thd, is_prepared_xa(entry->thd),
+ entry->using_trx_cache, commit_id, has_xid,
+ entry->ro_1pc))
+ DBUG_RETURN(ER_ERROR_ON_WRITE);
+ }
if (entry->using_stmt_cache && !mngr->stmt_cache.empty() &&
write_cache(entry->thd, mngr->get_binlog_cache_data(FALSE)))
@@ -12933,3 +13080,220 @@ void wsrep_register_binlog_handler(THD *thd, bool trx)
}
#endif /* WITH_WSREP */
+
+/**
+  Decide whether the committing entry's binlog cache file is a candidate for
+  being renamed into a binlog file instead of being copied.
+
+  @param entry  Group commit entry of the committing transaction/statement.
+  @retval true   All preconditions checked here hold.
+  @retval false  At least one precondition failed.
+*/
+inline bool Binlog_free_flush::should_free_flush(
+    const MYSQL_BIN_LOG::group_commit_entry *entry) const
+{
+  binlog_cache_mngr *mngr= entry->cache_mngr;
+  binlog_cache_data *trx= mngr->get_binlog_cache_data(true);
+  binlog_cache_data *stmt= mngr->get_binlog_cache_data(false);
+  const bool stmt_pending= entry->using_stmt_cache && !stmt->empty();
+  const bool trx_pending= entry->using_trx_cache && !trx->empty();
+
+  /*
+    A cache file is not encrypted the way the binlog is, so it cannot stand
+    in for an encrypted binlog. Renaming is also unsupported when both the
+    statement cache and the transaction cache carry data.
+  */
+  if (unlikely(encrypt_binlog || (stmt_pending && trx_pending)))
+    return false;
+
+  binlog_cache_data *candidate= stmt_pending ? stmt : trx;
+
+  /*
+    Require, in this order: more payload than the configured threshold, a
+    non-zero reserved area at the start of the file, and at least one write
+    that actually reached the temporary file (there is none when the
+    threshold is smaller than the binlog cache size).
+  */
+  return candidate->get_byte_position() > opt_binlog_free_flush_threshold &&
+         candidate->file_reserved_bytes() != 0 &&
+         candidate->cache_log.disk_writes != 0;
+}
+
+/**
+ Try to commit the entry by turning its binlog cache file into the next
+ binlog file (via a forced rotate) instead of copying the cache content.
+
+ @param entry Group commit entry to commit.
+ @retval true The cache file replaced the binlog file and
+ trx_group_commit_leader() was run for the entry.
+ @retval false Nothing was committed here (sync failure, binlog closed,
+ legacy event positions, checksum mismatch, or the rotate
+ did not replace the file). NOTE(review): presumably the
+ caller then falls back to the regular commit path - confirm.
+*/
+bool Binlog_free_flush::commit(MYSQL_BIN_LOG::group_commit_entry *entry)
+{
+ bool check_purge= false;
+ THD *thd= entry->thd;
+ binlog_cache_mngr *cache_mngr= entry->cache_mngr;
+ /* Prefer the transaction cache; use the statement cache when the
+ transaction cache is unused or empty. */
+ binlog_cache_data *cache_data= cache_mngr->get_binlog_cache_data(true);
+ if (unlikely(!entry->using_trx_cache || cache_data->empty()))
+ cache_data= cache_mngr->get_binlog_cache_data(false);
+
+ /* Sync the temp file before taking LOCK_log, to keep the lock hold time short */
+ if (cache_data->sync_temp_file())
+ return false;
+
+ thd->wait_for_prior_commit();
+
+ // On the success path the lock is released by trx_group_commit_leader
+ mysql_mutex_lock(&mysql_bin_log.LOCK_log);
+
+ /* Checksum algorithm the binlog file uses: a pending reset value wins
+ over the configured option. */
+ enum enum_binlog_checksum_alg expected_alg=
+ mysql_bin_log.checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF
+ ? mysql_bin_log.checksum_alg_reset
+ : (enum_binlog_checksum_alg) binlog_checksum_options;
+
+ /*
+ In legacy mode every event must carry a valid position, which is done by
+ updating the log_pos field while copying events from the binlog cache to
+ the binlog file. Renaming the binlog cache to a binlog file is therefore
+ not supported in legacy mode.
+
+ If the cache's checksum algorithm differs from the binlog's, the checksums
+ would have to be recalculated, so renaming is not supported in that case
+ either.
+ */
+ if (!mysql_bin_log.is_open() || opt_binlog_legacy_event_pos ||
+ (expected_alg != cache_data->checksum_opt))
+ {
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_log);
+ return false;
+ }
+
+ /* State consumed by replace_binlog_file() during the rotate below. */
+ m_entry= entry;
+ m_replaced= false;
+ m_cache_data= cache_data;
+ ulong prev_binlog_id= mysql_bin_log.current_binlog_id;
+
+ /*
+ It will call replace_binlog_file() to rename the transaction's binlog cache
+ to the new binlog file.
+ */
+ if (mysql_bin_log.rotate(true, &check_purge, true /* is_free_flush */))
+ {
+ DBUG_ASSERT(!m_replaced);
+ DBUG_ASSERT(!mysql_bin_log.is_open());
+ }
+
+ /* The rotate did not swap in the cache file: release the lock taken above
+ and report that no free flush happened. */
+ if (!m_replaced)
+ {
+ mysql_mutex_unlock(&mysql_bin_log.LOCK_log);
+ if (check_purge)
+ mysql_bin_log.checkpoint_and_purge(prev_binlog_id);
+ return false;
+ }
+
+ /* Seek binlog file to the end */
+ reinit_io_cache(&mysql_bin_log.log_file, WRITE_CACHE,
+ cache_data->temp_file_length(), false, true);
+ status_var_add(m_entry->thd->status_var.binlog_bytes_written,
+ cache_data->get_byte_position());
+ /* The file now belongs to the binlog; drop it from the cache object. */
+ m_cache_data->detach_temp_file();
+
+ mysql_bin_log.trx_group_commit_leader(entry, true /* is_free_flush */);
+
+ if (check_purge)
+ mysql_bin_log.checkpoint_and_purge(prev_binlog_id);
+ return true;
+}
+
+/**
+  Make the committing transaction's cache file become the current binlog
+  file: copy what was already flushed to the new binlog file into the
+  reserved area at the start of the cache file, switch the binlog IO_CACHE
+  to the cache file, write the GTID event and rename the cache file to the
+  binlog file name. On success m_replaced is set to true.
+
+  @retval false  Success, or a failure that happened before the original
+                 binlog file was deleted (m_replaced tells them apart).
+  @retval true   A failure that happened after the original binlog file was
+                 deleted. NOTE(review): presumably the caller treats this as
+                 a fatal rotate error - confirm.
+*/
+bool Binlog_free_flush::replace_binlog_file()
+{
+  size_t binlog_size= my_b_tell(&mysql_bin_log.log_file);
+  size_t required_size= binlog_size;
+  // space for Gtid_log_event
+  required_size+= LOG_EVENT_HEADER_LEN + Gtid_log_event::max_data_length +
+                  BINLOG_CHECKSUM_LEN;
+
+  DBUG_EXECUTE_IF("simulate_required_size_too_big", required_size= 10000;);
+  if (required_size > m_cache_data->file_reserved_bytes())
+  {
+    /* %llu needs ulonglong arguments; size_t and uint32 must be cast. */
+    sql_print_information("Could not rename binlog cache to binlog, "
+                          "require %llu bytes but only %llu bytes reserved.",
+                          (ulonglong) required_size,
+                          (ulonglong) m_cache_data->file_reserved_bytes());
+    return false;
+  }
+
+  File new_log_fd= -1;
+  bool ret= false;
+
+  /* Create fd for the cache file as a new binlog file fd */
+  new_log_fd= mysql_file_open(key_file_binlog, m_cache_data->temp_file_name(),
+                              O_BINARY | O_CLOEXEC | O_WRONLY, MYF(MY_WME));
+  if (new_log_fd == -1)
+    return false;
+
+  /* Copy the part which has been flushed to binlog file to binlog cache */
+  if (mysql_bin_log.log_file.pos_in_file > 0)
+  {
+    size_t copy_len= 0;
+    uchar buf[IO_SIZE];
+
+    int read_fd=
+        mysql_file_open(key_file_binlog, mysql_bin_log.get_log_fname(),
+                        O_RDONLY | O_BINARY | O_SHARE, MYF(MY_WME));
+    if (read_fd == -1)
+      goto err;
+
+    while (copy_len < mysql_bin_log.log_file.pos_in_file)
+    {
+      int read_len= (int) mysql_file_read(read_fd, buf, IO_SIZE, MYF(MY_WME));
+      /*
+        A zero-length read means the file ended before pos_in_file bytes
+        were copied; treat it as a failure, otherwise the loop would never
+        make progress.
+      */
+      if (read_len <= 0 ||
+          mysql_file_write(new_log_fd, buf, read_len,
+                           MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+      {
+        mysql_file_close(read_fd, MYF(MY_WME));
+        goto err;
+      }
+      copy_len+= read_len;
+    }
+
+    mysql_file_close(read_fd, MYF(MY_WME));
+  }
+
+  // Set the cache file as binlog file.
+  mysql_file_close(mysql_bin_log.log_file.file, MYF(MY_WME));
+  mysql_bin_log.log_file.file= new_log_fd;
+  new_log_fd= -1;
+  my_delete(mysql_bin_log.get_log_fname(), MYF(0));
+
+  /* Any error that happens after the file is deleted should return true. */
+  ret= true;
+
+  if (mysql_bin_log.write_gtid_event(
+          m_entry->thd, is_prepared_xa(m_entry->thd), m_entry->using_trx_cache,
+          0 /* commit_id */, m_entry->end_event->get_type_code() == XID_EVENT,
+          m_entry->ro_1pc, true /* is_free_flush */))
+    goto err;
+
+  DBUG_EXECUTE_IF("binlog_free_flush_crash_before_rename", DBUG_SUICIDE(););
+
+  if (DBUG_IF("simulate_rename_binlog_cache_to_binlog_error") ||
+      my_rename(m_cache_data->temp_file_name(), mysql_bin_log.get_log_fname(),
+                MYF(MY_WME)))
+    goto err;
+
+  sql_print_information("Renamed binlog cache to binlog %s",
+                        mysql_bin_log.get_log_fname());
+  m_replaced= true;
+  return false;
+err:
+  /* new_log_fd is -1 once ownership moved to mysql_bin_log.log_file. */
+  if (new_log_fd != -1)
+    mysql_file_close(new_log_fd, MYF(MY_WME));
+  return ret;
+}
+
+/**
+  Compute the value the GTID event is padded to so that it fills the rest of
+  the reserved area: reserved bytes minus the current binlog write position,
+  minus the event header, minus the checksum when checksums are enabled.
+*/
+size_t Binlog_free_flush::get_gtid_event_pad_size()
+{
+  const size_t used_bytes= my_b_tell(&mysql_bin_log.log_file);
+  const size_t checksum_bytes=
+      binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF ? BINLOG_CHECKSUM_LEN
+                                                         : 0;
+
+  return m_cache_data->file_reserved_bytes() - used_bytes -
+         LOG_EVENT_HEADER_LEN - checksum_bytes;
+}
diff --git a/sql/log.h b/sql/log.h
index 3ee06e17264..117f7638f2b 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -600,9 +600,12 @@ class binlog_cache_mngr;
class binlog_cache_data;
struct rpl_gtid;
struct wait_for_commit;
+class Binlog_free_flush;
class MYSQL_BIN_LOG: public TC_LOG, private Event_log
{
+ friend Binlog_free_flush;
+
#ifdef HAVE_PSI_INTERFACE
/** The instrumentation key to use for @ LOCK_index. */
PSI_mutex_key m_key_LOCK_index;
@@ -756,18 +759,20 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
new_file() is locking. new_file_without_locking() does not acquire
LOCK_log.
*/
- int new_file_impl();
+ int new_file_impl(bool is_free_flush= false);
void do_checkpoint_request(ulong binlog_id);
- int write_transaction_or_stmt(group_commit_entry *entry, uint64 commit_id);
+ int write_transaction_or_stmt(group_commit_entry *entry, uint64 commit_id,
+ bool is_free_flush= false);
int queue_for_group_commit(group_commit_entry *entry);
bool write_transaction_to_binlog_events(group_commit_entry *entry);
- void trx_group_commit_leader(group_commit_entry *leader);
+ void trx_group_commit_leader(group_commit_entry *leader,
+ bool is_free_flush= false);
bool is_xidlist_idle_nolock();
void update_gtid_index(uint32 offset, rpl_gtid gtid);
public:
void purge(bool all);
- int new_file_without_locking();
+ int new_file_without_locking(bool is_free_flush= false);
/*
A list of struct xid_count_per_binlog is used to keep track of how many
XIDs are in prepared, but not committed, state in each binlog. And how
@@ -997,7 +1002,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
enum cache_type io_cache_type_arg,
ulong max_size,
bool null_created,
- bool need_mutex);
+ bool need_mutex,
+ bool is_free_flush = false);
bool open_index_file(const char *index_file_name_arg,
const char *log_name, bool need_mutex);
/* Use this to start writing a new log file */
@@ -1037,7 +1043,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
bool is_active(const char* log_file_name);
bool can_purge_log(const char *log_file_name, bool interactive);
int update_log_index(LOG_INFO* linfo, bool need_update_threads);
- int rotate(bool force_rotate, bool* check_purge);
+ int rotate(bool force_rotate, bool* check_purge, bool is_free_flush= false);
void checkpoint_and_purge(ulong binlog_id);
int rotate_and_purge(bool force_rotate, DYNAMIC_ARRAY* drop_gtid_domain= NULL);
/**
@@ -1117,7 +1123,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
bool is_xidlist_idle();
bool write_gtid_event(THD *thd, bool standalone, bool is_transactional,
uint64 commit_id,
- bool has_xid= false, bool ro_1pc= false);
+ bool has_xid= false, bool ro_1pc= false,
+ bool is_free_flush= false);
int read_state_from_file();
int write_state_to_file();
int get_most_recent_gtid_list(rpl_gtid **list, uint32 *size);
diff --git a/sql/log_cache.cc b/sql/log_cache.cc
new file mode 100644
index 00000000000..014e76611cf
--- /dev/null
+++ b/sql/log_cache.cc
@@ -0,0 +1,122 @@
+#include "my_global.h"
+#include "log_cache.h"
+#include "handler.h"
+#include "my_sys.h"
+#include "mysql/psi/mysql_file.h"
+#include "mysql/service_wsrep.h"
+
+const char *BINLOG_CACHE_DIR= "#binlog_cache_files";
+char binlog_cache_dir[FN_REFLEN];
+extern uint32 binlog_cache_reserved_size();
+
+/**
+  Create the cache's temporary file if it does not exist yet and, unless
+  WSREP is on for the current thread, reserve space at its beginning by
+  moving the cache's file position past the reserved area.
+
+  @retval false  Success.
+  @retval true   The temporary file could not be opened.
+*/
+bool binlog_cache_data::init_file_reserved_bytes()
+{
+  if (cache_log.file == -1)
+  {
+    /* No file yet for this session's cache: create it here. */
+    char path[FN_REFLEN];
+
+    /* The file is named PREFIX + '_' + this object's address. */
+    snprintf(path, FN_REFLEN, "%s/%s_%llu", cache_log.dir, cache_log.prefix,
+             (ulonglong) this);
+
+    cache_log.file= mysql_file_open(0, path, O_CREAT | O_RDWR, MYF(MY_WME));
+    if (cache_log.file < 0)
+    {
+      sql_print_error("Failed to open binlog cache temporary file %s", path);
+      cache_log.error= -1;
+      return true;
+    }
+  }
+
+#ifdef WITH_WSREP
+  /* WSREP code accesses cache_log directly, so nothing is reserved for it. */
+  if (unlikely(wsrep_on(current_thd)))
+    return false;
+#endif
+
+  const uint32 reserved= binlog_cache_reserved_size();
+  m_file_reserved_bytes= reserved;
+  /* Data written from now on lands after the reserved area. */
+  cache_log.pos_in_file= reserved;
+  cache_log.seek_not_done= 1;
+  return false;
+}
+
+/**
+  Give up ownership of the temporary file: trim it to the cache's current
+  position, close the descriptor and reset the cache object.
+*/
+void binlog_cache_data::detach_temp_file()
+{
+  /*
+    After an earlier rollback to savepoint the file on disk can be longer
+    than the cache's logical end, so cut it back to that position first.
+  */
+  const my_off_t logical_end= my_b_tell(&cache_log);
+  my_chsize(cache_log.file, logical_end, 0, MYF(MY_WME));
+
+  mysql_file_close(cache_log.file, MYF(0));
+  cache_log.file= -1;
+  reset();
+}
+
+extern void ignore_db_dirs_append(const char *dirname_arg);
+
+/**
+  Set up the directory holding binlog cache temporary files, located next to
+  the binlog files. The directory is created if missing; if it already
+  exists, files in it whose names start with LOG_PREFIX are deleted and any
+  other file is reported with a warning and left alone.
+
+  @retval false  The directory is ready.
+  @retval true   The resulting path would be too long, or the directory
+                 could not be created.
+*/
+bool init_binlog_cache_dir()
+{
+  size_t length;
+  /*
+    Cache files are named "<dir>/<prefix>_<address>", with the address of
+    the binlog_cache_data object printed as a decimal 64-bit number (see
+    binlog_cache_data::init_file_reserved_bytes()), so up to 20 digits are
+    budgeted, plus the directory separator.
+  */
+  const size_t max_tmp_file_name_len=
+      1 /* separator */ + 2 /* prefix */ + 1 /* underline */ +
+      20 /* max decimal digits of a 64-bit value */;
+  const size_t dir_name_len= strlen(BINLOG_CACHE_DIR);
+
+  ignore_db_dirs_append(BINLOG_CACHE_DIR);
+
+  dirname_part(binlog_cache_dir, log_bin_basename, &length);
+  /*
+    Must ensure the full name of the tmp file is shorter than FN_REFLEN, to
+    avoid overflowing the name buffer in write and commit.
+  */
+  if (length + dir_name_len + max_tmp_file_name_len >= FN_REFLEN)
+  {
+    sql_print_error("Could not create binlog cache dir %s%s. It is too long.",
+                    binlog_cache_dir, BINLOG_CACHE_DIR);
+    return true;
+  }
+
+  memcpy(binlog_cache_dir + length, BINLOG_CACHE_DIR, dir_name_len);
+  binlog_cache_dir[length + dir_name_len]= 0;
+
+  MY_DIR *dir_info= my_dir(binlog_cache_dir, MYF(0));
+
+  if (!dir_info)
+  {
+    /* Make a dir for binlog cache temp files if not exist. */
+    if (my_mkdir(binlog_cache_dir, 0777, MYF(0)) < 0)
+    {
+      sql_print_error("Could not create binlog cache dir %s.",
+                      binlog_cache_dir);
+      return true;
+    }
+    return false;
+  }
+
+  /* Try to delete all cache files in the directory. */
+  for (uint i= 0; i < dir_info->number_of_files; i++)
+  {
+    FILEINFO *file= dir_info->dir_entry + i;
+
+    if (strncmp(file->name, LOG_PREFIX, strlen(LOG_PREFIX)))
+    {
+      sql_print_warning("%s is in %s/, but it is not a binlog cache file",
+                        file->name, BINLOG_CACHE_DIR);
+      continue;
+    }
+
+    char file_path[FN_REFLEN];
+    fn_format(file_path, file->name, binlog_cache_dir, "",
+              MYF(MY_REPLACE_DIR));
+
+    /* Best effort: a file that cannot be removed is simply left behind. */
+    my_delete(file_path, MYF(0));
+  }
+
+  my_dirend(dir_info);
+  return false;
+}
diff --git a/sql/log_cache.h b/sql/log_cache.h
index 79a9b94d8bc..a16e85b4b73 100644
--- a/sql/log_cache.h
+++ b/sql/log_cache.h
@@ -22,6 +22,16 @@ static constexpr my_off_t MY_OFF_T_UNDEF= ~0ULL;
/** Truncate cache log files bigger than this */
static constexpr my_off_t CACHE_FILE_TRUNC_SIZE = 65536;
+/**
+ Create binlog cache directory if it doesn't exist, otherwise delete all
+ files existing in the directory.
+
+ @retval false Succeeds to initialize the directory.
+ @retval true Failed to initialize the directory.
+*/
+bool init_binlog_cache_dir();
+
+extern char binlog_cache_dir[FN_REFLEN];
/*
Helper classes to store non-transactional and transactional data
@@ -35,7 +45,7 @@ class binlog_cache_data
before_stmt_pos(MY_OFF_T_UNDEF), m_pending(0), status(0),
incident(FALSE), precompute_checksums(precompute_checksums),
saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
- ptr_binlog_cache_disk_use(0)
+ ptr_binlog_cache_disk_use(0), m_file_reserved_bytes(0)
{
/*
Read the current checksum setting. We will use this setting to decide
@@ -47,9 +57,13 @@ class binlog_cache_data
(enum_binlog_checksum_alg)binlog_checksum_options;
}
- ~binlog_cache_data()
+ virtual ~binlog_cache_data()
{
DBUG_ASSERT(empty());
+
+ if (cache_log.file != -1 && !encrypt_tmp_files)
+ unlink(my_filename(cache_log.file));
+
close_cached_file(&cache_log);
}
@@ -67,7 +81,7 @@ class binlog_cache_data
bool empty() const
{
return (pending() == NULL &&
- (my_b_write_tell(&cache_log) == 0 ||
+ (my_b_write_tell(&cache_log) - m_file_reserved_bytes == 0 ||
((status & (LOGGED_ROW_EVENT | LOGGED_CRITICAL)) == 0)));
}
@@ -97,6 +111,8 @@ class binlog_cache_data
bool truncate_file= (cache_log.file != -1 &&
my_b_write_tell(&cache_log) >
MY_MIN(CACHE_FILE_TRUNC_SIZE, binlog_stmt_cache_size));
+ // m_file_reserved_bytes must be reset to 0, before truncate.
+ m_file_reserved_bytes= 0;
truncate(0,1); // Forget what's in cache
checksum_opt= !precompute_checksums ? BINLOG_CHECKSUM_ALG_OFF :
(enum_binlog_checksum_alg)binlog_checksum_options;
@@ -112,7 +128,8 @@ class binlog_cache_data
my_off_t get_byte_position() const
{
- return my_b_tell(&cache_log);
+ DBUG_ASSERT(cache_log.type == WRITE_CACHE);
+ return my_b_tell(&cache_log) - m_file_reserved_bytes;
}
my_off_t get_prev_position() const
@@ -172,6 +189,81 @@ class binlog_cache_data
status|= status_arg;
}
+  /**
+    Hook invoked before every write into cache_log. When the write is about
+    to overflow the in-memory buffer while the file position is still 0, a
+    session binlog cache gets its reserved space set up first, which is what
+    makes renaming the cache file to a binlog file possible.
+
+    @param write_length  Number of bytes about to be written.
+    @retval false  Nothing to do, or the reserved space was set up.
+    @retval true   Setting up the reserved space failed.
+  */
+  bool write_prepare(size_t write_length)
+  {
+    /* The write still fits in the buffer, or the file is already in use. */
+    if (likely(cache_log.write_pos + write_length <= cache_log.write_end ||
+               cache_log.pos_in_file != 0))
+      return false;
+
+    /* Only session's binlog cache need to reserve space. */
+    if (cache_log.dir != binlog_cache_dir || encrypt_tmp_files)
+      return false;
+
+    return init_file_reserved_bytes();
+  }
+
+ /**
+ Switch cache_log to READ_CACHE mode, positioned just past the reserved
+ space (m_file_reserved_bytes), so that reading starts at the first byte
+ of actual data. Returns the result of reinit_io_cache().
+ */
+ bool init_for_read()
+ {
+ return reinit_io_cache(&cache_log, READ_CACHE, m_file_reserved_bytes, 0, 0);
+ }
+
+ /**
+ Length of the actual data in the cache, i.e. end_of_file minus the
+ reserved space. Must only be called while cache_log is a READ_CACHE.
+ */
+ my_off_t length_for_read() const
+ {
+ DBUG_ASSERT(cache_log.type == READ_CACHE);
+ return cache_log.end_of_file - m_file_reserved_bytes;
+ }
+
+ /**
+ Returns the current position of cache_log, which includes the reserved
+ space (unlike get_byte_position(), which subtracts it).
+ */
+ my_off_t temp_file_length()
+ {
+ return my_b_tell(&cache_log);
+ }
+
+ /** Number of bytes reserved at the beginning of the cache file (0 if none). */
+ uint32 file_reserved_bytes() { return m_file_reserved_bytes; }
+
+  /**
+    Write out the buffered part of the cache and sync the temporary file.
+    The sync is only attempted when the flush succeeded.
+
+    @retval true   Flushing or syncing failed.
+    @retval false  Both succeeded.
+  */
+  bool sync_temp_file()
+  {
+    DBUG_ASSERT(cache_log.file != -1);
+
+    const bool flush_failed= my_b_flush_io_cache(&cache_log, 1) != 0;
+    return flush_failed ||
+           mysql_file_sync(cache_log.file, MYF(MY_WME)) != 0;
+  }
+
+ /**
+ Returns the name of the cache file, as reported by my_filename() for the
+ cache's file descriptor. Nothing is copied; the pointer is returned as is.
+ */
+ const char *temp_file_name() { return my_filename(cache_log.file); }
+
+ /**
+ It is called after renaming the cache file to a binlog file. The file
+ now is a binlog file, so detach it from the binlog cache.
+ */
+ void detach_temp_file();
+
/*
Cache to store data before copying it to the binary log.
*/
@@ -253,6 +345,12 @@ class binlog_cache_data
*/
ulong *ptr_binlog_cache_disk_use;
+ /*
+ Stores the bytes reserved at the begin of the cache file. It could be
+ 0 for cases that reserved space are not supported. see write_prepare().
+ */
+ uint32 m_file_reserved_bytes {0};
+
/*
It truncates the cache to a certain position. This includes deleting the
pending event.
@@ -266,12 +364,18 @@ class binlog_cache_data
delete pending();
set_pending(0);
}
- my_bool res __attribute__((unused))=
- reinit_io_cache(&cache_log, WRITE_CACHE, pos, 0, reset_cache);
+ my_bool res __attribute__((unused))= reinit_io_cache(
+ &cache_log, WRITE_CACHE, pos + m_file_reserved_bytes, 0, reset_cache);
DBUG_ASSERT(res == 0);
cache_log.end_of_file= saved_max_binlog_cache_size;
}
+ /**
+ Reserve the required space at the beginning of the temporary file. The
+ temporary file is created first if it doesn't exist yet.
+ */
+ bool init_file_reserved_bytes();
+
binlog_cache_data& operator=(const binlog_cache_data& info);
binlog_cache_data(const binlog_cache_data& info);
};
diff --git a/sql/log_event.h b/sql/log_event.h
index fdbd46f8d0d..8c1edfe2c0b 100644
--- a/sql/log_event.h
+++ b/sql/log_event.h
@@ -3340,6 +3340,14 @@ class Gtid_log_event: public Log_event
uint64 sa_seq_no; // start alter identifier for CA/RA
#ifdef MYSQL_SERVER
event_xid_t xid;
+ /*
+ If non-zero, the event is padded to this size. It is only used when
+ renaming a binlog cache to a binlog file: space is reserved for the gtid
+ event and for the events at the beginning of the binlog file, and some of
+ it is left over once those events have been written. The leftover space
+ is filled by padding the gtid event with zero bytes.
+ */
+ uint64 pad_to_size;
#else
event_mysql_xid_t xid;
#endif
@@ -3404,6 +3412,11 @@ class Gtid_log_event: public Log_event
static const uchar FL_EXTRA_THREAD_ID= 16; // thread_id like in BEGIN Query
#ifdef MYSQL_SERVER
+ static const uint max_data_length= GTID_HEADER_LEN + 2 + sizeof(XID)
+ + 1 /* flags_extra: */
+ + 4 /* Extra Engines */
+ + 4 /* FL_EXTRA_THREAD_ID */;
+
Gtid_log_event(THD *thd_arg, uint64 seq_no, uint32 domain_id, bool standalone,
uint16 flags, bool is_transactional, uint64 commit_id,
bool has_xid= false, bool is_ro_1pc= false);
diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
index 9b179836651..44cec8b15fd 100644
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -29,6 +29,7 @@
#include "unireg.h"
#include "log_event.h"
+#include "log_cache.h"
#include "sql_base.h" // close_thread_tables
#include "sql_cache.h" // QUERY_CACHE_FLAGS_SIZE
#include "sql_locale.h" // MY_LOCALE, my_locale_by_number, my_locale_en_US
@@ -690,6 +691,9 @@ void Log_event::init_show_field_list(THD *thd, List<Item>* field_list)
int Log_event_writer::write_internal(const uchar *pos, size_t len)
{
DBUG_ASSERT(!ctx || encrypt_or_write == &Log_event_writer::encrypt_and_write);
+ if (cache_data && cache_data->write_prepare(len))
+ return 1;
+
if (my_b_safe_write(file, pos, len))
{
DBUG_PRINT("error", ("write to log failed: %d", my_errno));
@@ -2839,7 +2843,7 @@ Gtid_log_event::Gtid_log_event(THD *thd_arg, uint64 seq_no_arg,
bool ro_1pc)
: Log_event(thd_arg, flags_arg, is_transactional),
seq_no(seq_no_arg), commit_id(commit_id_arg), domain_id(domain_id_arg),
- flags2((standalone ? FL_STANDALONE : 0) |
+ pad_to_size(0), flags2((standalone ? FL_STANDALONE : 0) |
(commit_id_arg ? FL_GROUP_COMMIT_ID : 0)),
flags_extra(0), extra_engines(0),
thread_id(thd_arg->variables.pseudo_thread_id)
@@ -2959,10 +2963,7 @@ Gtid_log_event::peek(const uchar *event_start, size_t event_len,
bool
Gtid_log_event::write(Log_event_writer *writer)
{
- uchar buf[GTID_HEADER_LEN + 2 + sizeof(XID)
- + 1 /* flags_extra: */
- + 4 /* Extra Engines */
- + 4 /* FL_EXTRA_THREAD_ID */];
+ uchar buf[max_data_length];
size_t write_len= 13;
int8store(buf, seq_no);
@@ -3042,6 +3043,27 @@ Gtid_log_event::write(Log_event_writer *writer)
bzero(buf+write_len, GTID_HEADER_LEN-write_len);
write_len= GTID_HEADER_LEN;
}
+
+  if (unlikely(pad_to_size > write_len))
+  {
+    /*
+      Padded form: the header is written for pad_to_size bytes of data, the
+      real payload follows, and the remainder is filled with zero bytes.
+    */
+    if (write_header(writer, pad_to_size) ||
+        write_data(writer, buf, write_len))
+      return true;
+
+    pad_to_size-= write_len;
+
+    /*
+      Clear the whole chunk buffer, not pad_to_size bytes: the remaining
+      padding may be larger than IO_SIZE (the loop below emits it in
+      IO_SIZE-sized pieces), and clearing pad_to_size bytes would then write
+      past the end of pad_buf.
+    */
+    char pad_buf[IO_SIZE];
+    bzero(pad_buf, sizeof(pad_buf));
+    while (pad_to_size)
+    {
+      uint64 size= pad_to_size >= IO_SIZE ? IO_SIZE : pad_to_size;
+      if (write_data(writer, pad_buf, size))
+        return true;
+      pad_to_size-= size;
+    }
+    return write_footer(writer);
+  }
+
return write_header(writer, write_len) ||
write_data(writer, buf, write_len) ||
write_footer(writer);
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index e938e8f6cfa..a2ef34a28db 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -120,7 +120,7 @@
#include "sp_cache.h"
#include "sql_reload.h" // reload_acl_and_cache
#include "sp_head.h" // init_sp_psi_keys
-
+#include "log_cache.h"
#include <mysqld_default_groups.h>
#ifdef HAVE_POLL_H
@@ -5609,6 +5609,8 @@ static int init_server_components()
mysql_mutex_unlock(log_lock);
if (unlikely(error))
unireg_abort(1);
+ if (unlikely(init_binlog_cache_dir()))
+ unireg_abort(1);
}
#ifdef HAVE_REPLICATION
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index d4997793428..cd7bae8ab12 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -7380,3 +7380,13 @@ static Sys_var_enum Sys_block_encryption_mode(
"AES_ENCRYPT() and AES_DECRYPT() functions",
SESSION_VAR(block_encryption_mode), CMD_LINE(REQUIRED_ARG),
block_encryption_mode_values, DEFAULT(0));
+
+/*
+ Global variable binlog_free_flush_threshold: a binlog cache is considered
+ for being renamed to a binlog file only when its size exceeds this value.
+ Allowed range is 10MB .. ULLONG_MAX, default 128MB.
+*/
+extern ulonglong opt_binlog_free_flush_threshold;
+static Sys_var_ulonglong Sys_binlog_free_flush_threshold(
+ "binlog_free_flush_threshold",
+ "Try to rename the binlog cache temporary file of the commiting "
+ "transaction to a binlog file when its binlog cache size "
+ "is bigger than the value of this variable",
+ GLOBAL_VAR(opt_binlog_free_flush_threshold),
+ CMD_LINE(REQUIRED_ARG), VALID_RANGE(10 * 1024 * 1024, ULLONG_MAX),
+ DEFAULT(128 * 1024 * 1024), BLOCK_SIZE(1));
--
2.39.2
1
0