Re: [Maria-developers] e92037989f7: MDEV-21117: refine the server binlog-based recovery for semisync

25 Apr 2021

Hi, Andrei!

Don't be confused by the subject, this is a review of

   git diff 450c017c2d 2fa526e26e

that is, of everything combined, not just the single e92037989f7 commit.

On Apr 25, Sujatha wrote:
...
revision-id: e92037989f7 (mariadb-10.3.26-128-ge92037989f7)
parent(s): 450c017c2d9
author: Sujatha <sujatha.sivakumar@mariadb.com>
committer: Andrei Elkin <andrei.elkin@mariadb.com>
timestamp: 2021-04-13 12:26:12 +0300
message:
MDEV-21117: refine the server binlog-based recovery for semisync
...

diff --git a/libmariadb b/libmariadb
index fc431a035a2..e3824422064 160000
--- a/libmariadb
+++ b/libmariadb
@@ -1 +1 @@
-Subproject commit fc431a035a21ac1d4ef25d9d3cd8c4d7e64a8ee7
+Subproject commit e38244220646a7e95c9be22576460aa7a4eb715f
This is clearly a mistake: you erroneously checked in an
old libmariadb commit, rolling back a bunch of changes.
See your commit 4bc83b2749
...
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_multi_engine-master.opt b/mysql-test/suite/binlog/t/binlog_truncate_multi_engine-master.opt
new file mode 100644
index 00000000000..df675545bf9
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_multi_engine-master.opt
please rename this file to mysql-test/include/have_rocksdb.opt
...
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_active_log.test b/mysql-test/suite/binlog/t/binlog_truncate_active_log.test
new file mode 100644
index 00000000000..2b794d02dd0
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_active_log.test
@@ -0,0 +1,74 @@
+# ==== Purpose ====
+#
+# Test verifies the truncation of single binary log file.
+#
+# ==== References ====
+#
+# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
+
+--source include/have_innodb.inc
+--source include/have_aria.inc
+# File: binlog_truncate_active_log.inc included in test makes use of
+#       'debug_sync' facility.
+--source include/have_debug_sync.inc
you wouldn't need this comment if you included have_debug_sync.inc
directly in binlog_truncate_active_log.inc. but ok, whatever you like
...
+--source include/have_binlog_format_statement.inc
+
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
+
+# The following cases are tested:
+# A. 2pc transaction is followed by a blank "zero-engines" one
+# B. 2pc transaction follows the blank one
+# C. Similarly to A, with the XA blank transaction
+
+RESET MASTER;
+CREATE TABLE t (f INT) ENGINE=INNODB;
+CREATE TABLE t2 (f INT) ENGINE=INNODB;
+CREATE TABLE tm (f INT) ENGINE=Aria;
could you add a comment, explaining why you're using Aria here.
(you wrote that in an email, but please add a comment too)
...
+
+--echo # Case A.
+# Using 'debug_sync' hold 'query1' execution after 'query1' is flushed and
+# synced to binary log but not yet committed. In an another connection hold
+# 'query2' execution after 'query2' is flushed and synced to binlog.
+# Crash and restart server with --rpl-semi-sync-slave-enabled=1
+#
+# During recovery of binary log 'query1' status is checked with InnoDB engine,
+# it will be in prepared but not yet commited. All transactions starting from
+# 'query1' onwards will be removed from the binary log.
+
+--let $truncate_gtid_pos = 0-1-6
+--let $query1 = INSERT INTO t VALUES (20)
+--let $query2 = DELETE FROM t2 WHERE f = 0 /* no such record */
+--source binlog_truncate_active_log.inc
+
+--echo # Case B.
+# The inverted sequence ends up to truncate only $query2
+--let $truncate_gtid_pos = 0-1-10
+--let $query1 = DELETE FROM t2 WHERE f = 0
+--let $query2 = INSERT INTO t VALUES (20)
+--source binlog_truncate_active_log.inc
+
+
+delimiter |;
+CREATE PROCEDURE sp_blank_xa()
+BEGIN
+ XA START   'blank';
+ DELETE FROM t2 WHERE f = 0 /* no such record */;
+ XA END     'blank';
+ XA PREPARE 'blank';
+END|
+delimiter ;|
+
+
+--echo # Case C.
+--let $truncate_gtid_pos = 0-1-14
+--let $query1 = INSERT INTO t VALUES (20)
+--let $pre_q2 = CALL sp_blank_xa
+--let $query2 =  XA COMMIT  'blank'
+--source binlog_truncate_active_log.inc
what was truncated here?
a comment explains it for cases A and B, but not here.

maybe it'd make sense to run SHOW BINLOG EVENTS after every restart,
just to see the state of the binlog after truncation?
...
+DROP PROCEDURE sp_blank_xa;
+
+--echo # Cleanup
+DROP TABLE t,t2,tm;
+
+--echo # End of the tests
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_active_log.inc b/mysql-test/suite/binlog/t/binlog_truncate_active_log.inc
new file mode 100644
index 00000000000..bbc464066fc
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_active_log.inc
@@ -0,0 +1,55 @@
+connect(master1,localhost,root,,);
+connect(master2,localhost,root,,);
+connect(master3,localhost,root,,);
+
+--connection default
+
+# First to commit few transactions
+INSERT INTO t  VALUES (10);
+INSERT INTO tm VALUES (10);
+
+--connection master1
+# Hold insert after write to binlog and before "run_commit_ordered" in engine
+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL master1_ready WAIT_FOR signal_never_arrives";
+--send_eval $query1
+
+--connection master2
+SET DEBUG_SYNC= "now WAIT_FOR master1_ready";
+if ($pre_q2)
+{
+  eval $pre_q2;
+}
+SET DEBUG_SYNC= "commit_before_get_LOCK_after_binlog_sync SIGNAL master2_ready";
+# To binlog non-xid transactional group which will be truncated all right
+--send_eval $query2
+
+--connection master3
+SET DEBUG_SYNC= "now WAIT_FOR master2_ready";
+SELECT @@global.gtid_binlog_pos as 'Before the crash';
+
+--connection default
+--source include/kill_mysqld.inc
+--disconnect master1
+--disconnect master2
+--disconnect master3
+
+#
+# Server restart
+#
+--let $restart_parameters= --rpl-semi-sync-slave-enabled=1
+--source include/start_mysqld.inc
+
+# Check error log for a successful truncate message.
+--let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err
+
+--let SEARCH_FILE=$log_error_
why not set SEARCH_FILE directly? but ok, as you like
...
+--let SEARCH_PATTERN=Successfully truncated.*to remove transactions starting from GTID $truncate_gtid_pos
+--replace_regex /FOUND [0-9]+/FOUND #/
can it be found multiple times? Why would binlog be truncated more than once?
...
+--source include/search_pattern_in_file.inc
+
+SELECT @@global.gtid_binlog_pos as 'After the crash';
+--echo "One row should be present in table 't'"
+SELECT * FROM t;
+
+# Local cleanup
+DELETE FROM t;
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_multi_engine.test b/mysql-test/suite/binlog/t/binlog_truncate_multi_engine.test
new file mode 100644
index 00000000000..94837e3c3ea
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_multi_engine.test
@@ -0,0 +1,56 @@
+# ==== Purpose ====
+#
+# Test verifies truncation of multiple binary logs with multiple transactional
+# storage engines
+#
+# ==== References ====
+#
+# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
+
+--source include/have_innodb.inc
+--source include/have_rocksdb.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+--source include/have_binlog_format_row.inc
+
+--let $old_max_binlog_size= `select @@global.max_binlog_size`
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
+--let $MYSQLD_DATADIR= `SELECT @@datadir`
+
+CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
+CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=rocksdb;
+
+--let $case = A: neither engine committed => rollback & binlog truncate
+# Hold off engine commits after write to binlog and its rotation.
+# The transaction is killed along with the server after that.
+--let $shutdown_timeout=0
+--let $debug_sync_action = "commit_after_release_LOCK_log SIGNAL con1_ready WAIT_FOR signal_no_signal"
+--let $restart_parameters = --rpl-semi-sync-slave-enabled=1
+--let $test_outcome= 1 row should be present in both tables; binlog is truncated; number of binlogs at reconnect - 3
+--source binlog_truncate_multi_engine.inc
+--echo Proof of the truncated binlog file is readable (two transactions must be seen):
+--exec $MYSQL_BINLOG --short-form --skip-annotate-row-events $MYSQLD_DATADIR/master-bin.000002
+
+--let $case = B: one engine has committed its transaction branch
+# Hold off after one engine has committed.
+--let $shutdown_timeout=0
+--let $debug_sync_action = "commit_after_run_commit_ordered SIGNAL con1_ready WAIT_FOR signal_no_signal"
+# Both debug_sync and debug-dbug are required to make sure Engines remember the commit state
+# debug_sync alone will not help.
+--let $restart_parameters = --rpl-semi-sync-slave-enabled=1 --debug-dbug=d,binlog_truncate_partial_commit
in the first review I wrote

  this seems to be a rather crude way of faking a partially committed
  transaction. better to crash after the first engine has committed,
  that'd be much more natural.

and you replied

  This simulation aimed at (allows for) more complicated recovery time
  event sequences.
  In this case, indeed, crashing by demand is about of the same efforts.
  I can convert to that.

  [x]
...
+--let $test_outcome= 2 rows should be present in both tables; no binlog truncation; one extra binlog file compare with A; number of binlogs at reconnect - 4
+--source binlog_truncate_multi_engine.inc
+
+--let $case = C: both engines have committed its transaction branch
+# Hold off after both engines have committed. The server is shut down.
+--let $shutdown_timeout=
+--let $restart_parameters = --rpl-semi-sync-slave-enabled=1
+--let $test_outcome= 2 rows should be present in both tables; no binlog truncation; the same # of binlog files as in B; number of binlogs at reconnect - 4
+--source binlog_truncate_multi_engine.inc
+
+
+
+DROP TABLE t1, t2;
+
+--echo # End of the tests
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_multi_engine.inc b/mysql-test/suite/binlog/t/binlog_truncate_multi_engine.inc
new file mode 100644
index 00000000000..41ae856dd9d
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_multi_engine.inc
@@ -0,0 +1,54 @@
+#
+# Loop body of binlog_truncate_multi_engine.test
what do you mean "loop body"?
...
+# Parameters:
+#   $debug_sync_action  describes debug-sync actions
+#   $kill_server        1 when to crash, 0 for regular restart
+#   $restart_parameters the caller may simulate partial commit at recovery
+#   $test_outcome       summary of extected results
+#   $MYSQLD_DATADIR
+
+--echo #
+--echo #
+--echo # Case $case
+--echo #
+RESET MASTER;
+FLUSH LOGS;
+SET GLOBAL max_binlog_size= 4096;
+
+connect(con1,localhost,root,,);
+--echo List of binary logs before rotation
+--source include/show_binary_logs.inc
+INSERT INTO t1 VALUES (1, REPEAT("x", 1));
+INSERT INTO t2 VALUES (1, REPEAT("x", 1));
I'm not sure I understand the point of REPEAT(..., 1)
but sure, if you like it that way... :)
...
+BEGIN;
+  INSERT INTO t1 VALUES (2, REPEAT("x", 4100));
+  INSERT INTO t2 VALUES (2, REPEAT("x", 4100));
+
+--eval SET DEBUG_SYNC= $debug_sync_action
+send COMMIT;
+
+--connection default
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+--echo List of binary logs after rotation
+--source include/show_binary_logs.inc
+
+--echo # restart the server with $restart_parameters
+--echo # the server is restarted
+--source include/restart_mysqld.inc
+
+--connection default
+--echo #
+--echo # *** Summary: $test_outcome:
+--echo #
+SELECT COUNT(*) FROM t1;
+SELECT COUNT(*) FROM t2;
+SELECT @@GLOBAL.gtid_binlog_state;
+SELECT @@GLOBAL.gtid_binlog_pos;
+--echo List of binary logs at the end of the tests
+--source include/show_binary_logs.inc
+--echo # ***
+# cleanup
+DELETE FROM t1;
+DELETE FROM t2;
+--disconnect con1
+--echo #
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_multi_log.test b/mysql-test/suite/binlog/t/binlog_truncate_multi_log.test
new file mode 100644
index 00000000000..3b557bc89b8
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_multi_log.test
@@ -0,0 +1,78 @@
+# ==== Purpose ====
+#
+# Test verifies truncation of multiple binary logs.
+#
+# ==== References ====
+# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
+
+--source include/have_innodb.inc
+--source include/have_debug_sync.inc
+--source include/have_binlog_format_row.inc
+
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
+
+SET @@global.max_binlog_size= 4096;
+
+RESET MASTER;
+FLUSH LOGS;
+CREATE TABLE ti (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
+CREATE TABLE tm (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=MyISAM;
+
+connect(master1,localhost,root,,);
+--echo "List of binary logs before rotation"
+--source include/show_binary_logs.inc
+
+# Some load to either non- and transactional egines
+# that should not affect the following recovery:
+INSERT INTO ti VALUES(1,"I am gonna survive");
+INSERT INTO tm VALUES(1,"me too!");
+
+# hold on near engine commit
+SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL master1_ready WAIT_FOR con1_go";
+--send_eval INSERT INTO ti VALUES (2, REPEAT("x", 4100))
+
+connect(master2,localhost,root,,);
+# The 2nd trx for recovery, it does not rotate binlog
+SET DEBUG_SYNC= "now WAIT_FOR master1_ready";
+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL master2_ready WAIT_FOR master2_go";
+--send_eval INSERT INTO ti VALUES (3, "not gonna survive")
send_eval? what are you evaluating here?
...
+
+--connection default
+SET DEBUG_SYNC= "now WAIT_FOR master2_ready";
+--echo "List of binary logs before crash"
+--source include/show_binary_logs.inc
+--echo # The gtid binlog state prior the crash will be truncated at the end of the test
+SELECT @@global.gtid_binlog_state;
+
+--connection default
+--source include/kill_mysqld.inc
+--disconnect master1
+--disconnect master2
+
+#
+# Server restart
+#
+--let $restart_parameters= --rpl-semi-sync-slave-enabled=1
+--source include/start_mysqld.inc
+
+# Check error log for a successful truncate message.
+let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err;
+
+--let SEARCH_FILE=$log_error_
+--let SEARCH_PATTERN=truncated binlog file:.*master.*000002
+--replace_regex /FOUND [0-9]+/FOUND #/
can it be found multiple times? Why would binlog be truncated more than once?
...
+--source include/search_pattern_in_file.inc
+
+
+--echo "One record should be present in table"
+SELECT * FROM ti;
+
+--echo # The truncated gtid binlog state
+SELECT @@global.gtid_binlog_state;
+SELECT @@global.gtid_binlog_pos;
+
+--echo # Cleanup
+DROP TABLE ti;
+
+--echo # End of the tests
diff --git a/mysql-test/suite/binlog/t/binlog_truncate_multi_log_unsafe.test b/mysql-test/suite/binlog/t/binlog_truncate_multi_log_unsafe.test
new file mode 100644
index 00000000000..38a9c0832f4
--- /dev/null
+++ b/mysql-test/suite/binlog/t/binlog_truncate_multi_log_unsafe.test
@@ -0,0 +1,120 @@
+# ==== Purpose ====
+# The test verifies attempt to recover by the semisync slave server whose
+# binlog is unsafe for truncation.
+#
+# ==== Implementation ====
+# 2 binlog files are created with the 1st one destined to be the binlog
+# checkpoint file for recovery.
+# The final group of events is replication unsafe (myisam INSERT).
+# Therefore the semisync slave recovery may not.
+#
+# Steps:
+#    0 - Set max_binlog_size= 4096, to help an insert into a
+#        transaction table 'ti' get binlog rotated while the
+#        transaction won't be committed, being stopped at
+#        a prior to commit debug_sync point
+#    1 - insert into a non-transactional 'tm' table completes with
+#        binary logging as well
+#    2 - kill and attempt to restart the server as semisync slave that
+#        must produce an expected unsafe-to-recover error
+#    3 - complete the test with a normal restart that successfully finds and
+#        commits the transaction in doubt.
+#
+# ==== References ====
+#
+# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
+#
+
+--source include/have_innodb.inc
+--source include/have_debug_sync.inc
+--source include/have_binlog_format_row.inc
+
+SET @@global.max_binlog_size= 4096;
+
+call mtr.add_suppression("Table '.*tm' is marked as crashed and should be repaired");
+call mtr.add_suppression("Got an error from unknown thread");
+call mtr.add_suppression("Checking table:   '.*tm'");
+call mtr.add_suppression("Recovering table: '.*tm'");
+call mtr.add_suppression("Cannot trim the binary log to file");
+call mtr.add_suppression("Crash recovery failed");
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
+call mtr.add_suppression("Found 1 prepared transactions");
+call mtr.add_suppression("mysqld: Table.*tm.*is marked as crashed");
+call mtr.add_suppression("Checking table.*tm");
+
+RESET MASTER;
+FLUSH LOGS;
+CREATE TABLE ti (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
+CREATE TABLE tm (f INT) ENGINE=MYISAM;
+
+--let $row_count = 5
+--let $i = `select $row_count-2`
+--disable_query_log
+while ($i)
+{
+  --eval INSERT INTO ti VALUES ($i, REPEAT("x", 1))
+  --dec $i
+}
+--enable_query_log
+INSERT INTO tm VALUES(1);
+
+connect(master1,localhost,root,,);
+connect(master2,localhost,root,,);
+connect(master3,localhost,root,,);
+
+--connection master1
+
+# The 1st trx binlogs, rotate binlog and hold on before committing at engine
+SET DEBUG_SYNC= "commit_after_release_LOCK_after_binlog_sync SIGNAL master1_ready WAIT_FOR master1_go";
+--send_eval INSERT INTO ti VALUES ($row_count - 1, REPEAT("x", 4100))
+
+--connection master2
+
+# The 2nd trx for recovery, it does not rotate binlog
+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL master2_ready WAIT_FOR master2_go";
+--send_eval INSERT INTO ti VALUES ($row_count, REPEAT("x", 1))
+
+--connection master3
+SET DEBUG_SYNC= "now WAIT_FOR master2_ready";
+SET DEBUG_SYNC= "commit_before_get_LOCK_after_binlog_sync SIGNAL master3_ready";
+--send INSERT INTO tm VALUES (2)
+
+--connection default
+SET DEBUG_SYNC= "now WAIT_FOR master3_ready";
+--echo # The gtid binlog state prior the crash must be restored at the end of the test;
+SELECT @@global.gtid_binlog_state;
+--source include/kill_mysqld.inc
+
+#
+# Server restarts
+#
+--echo # Failed restart as the semisync slave
+--error 1
+--exec $MYSQLD_LAST_CMD  --rpl-semi-sync-slave-enabled=1 >> $MYSQLTEST_VARDIR/log/mysqld.1.err 2>&1
+
+--echo # Normal restart
+--source include/start_mysqld.inc
+
+# Check error log for correct messages.
+let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.1.err;
+
+--let SEARCH_FILE=$log_error_
+--let SEARCH_PATTERN=Cannot trim the binary log to file
+--replace_regex /FOUND [0-9]+/FOUND #/
can it be found multiple times? Why would binlog be truncated more than once?
...
+--source include/search_pattern_in_file.inc
+
+--echo # Proof that the in-doubt transactions are recovered by the 2nd normal server restart
+--eval SELECT COUNT(*) = $row_count as 'True' FROM ti
+# myisam table may require repair (which is not tested here)
+--disable_warnings
+SELECT COUNT(*) <= 1 FROM tm;
+--enable_warnings
+
+--echo # The gtid binlog state prior the crash is restored now
+SELECT @@GLOBAL.gtid_binlog_state;
+SELECT @@GLOBAL.gtid_binlog_pos;
+
+--echo # Cleanup
+DROP TABLE ti, tm;
+--echo End of test
diff --git a/mysql-test/suite/rpl/t/rpl_semi_sync_fail_over.cnf b/mysql-test/suite/rpl/t/rpl_semi_sync_fail_over.cnf
new file mode 100644
index 00000000000..f8312bdc5b8
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_semi_sync_fail_over.cnf
@@ -0,0 +1,11 @@
+!include suite/rpl/rpl_1slave_base.cnf
+!include include/default_client.cnf
+
+
+[mysqld.1]
+log-slave-updates
+gtid-strict-mode=1
+
+[mysqld.2]
+log-slave-updates
+gtid-strict-mode=1
generally opt files (rpl_semi_sync_fail_over.opt in this case) are preferred,
because mtr will know what options to apply, while cnf files are more opaque
...
diff --git a/mysql-test/suite/rpl/t/rpl_semi_sync_fail_over.test b/mysql-test/suite/rpl/t/rpl_semi_sync_fail_over.test
new file mode 100644
index 00000000000..a8b40d6ed05
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_semi_sync_fail_over.test
@@ -0,0 +1,143 @@
+# ==== Purpose ====
+#
+# Test verifies replication failover scenario.
+#
+# ==== Implementation ====
+#
+# Steps:
+#    0 - Having two servers 1 and 2 enable semi-sync replication with
+#        with the master wait 'after_sync'.
+#    1 - Insert a row. While inserting second row simulate
+#        a server crash at once the transaction is written to binlog, flushed
+#        and synced but the binlog position is not updated.
+#    2 - Post crash-recovery on the old master execute there CHANGE MASTER
+#        TO command to connect to server id 2.
+#    3 - The old master new slave server 1 must connect to the new
+#        master server 2.
+#    4 - repeat the above to crash the new master and restore in role the old one
+#
+# ==== References ====
+#
+# MDEV-21117: recovery for --rpl-semi-sync-slave-enabled server
+
+
+--source include/have_innodb.inc
+--source include/have_debug_sync.inc
+--source include/have_binlog_format_row.inc
+--let $rpl_topology=1->2
+--source include/rpl_init.inc
why not source master-slave.inc if you're using a standard master-slave
topology anyway?
...
+
+--connection server_2
+--source include/stop_slave.inc
+
+--connection server_1
+RESET MASTER;
+SET @@global.max_binlog_size= 4096;
+
+--connection server_2
+RESET MASTER;
+SET @@global.max_binlog_size= 4096;
+set @@global.rpl_semi_sync_slave_enabled = 1;
+set @@global.gtid_slave_pos = "";
+CHANGE MASTER TO master_use_gtid= slave_pos;
+--source include/start_slave.inc
+
+
+--connection server_1
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
here I asked "why?" and you replied

 Actually not need to. There's no crash in the middle of a slave
 transaction.
 So it must be a copy-paste leftover.

 [x]
...
+set @@global.rpl_semi_sync_master_enabled = 1;
+set @@global.rpl_semi_sync_master_wait_point=AFTER_SYNC;
+
+call mtr.add_suppression("Can.t init tc log");
+call mtr.add_suppression("Aborting");
+call mtr.add_suppression("1 client is using or hasn.t closed the table properly");
+call mtr.add_suppression("Table './mtr/test_suppressions' is marked as crashed and should be repaired");
+
+CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
+INSERT INTO t1 VALUES (1, 'dummy1');
+
+#
+# CRASH the original master, and FAILOVER to the new
+#
+
+# value 1 for server id 1 -> 2 failover
+--let $failover_to_slave=1
+--let $query_to_crash= INSERT INTO t1 VALUES (2, REPEAT("x", 4100))
+--let $log_search_pattern=truncated binlog file:.*master.*000001
+--source rpl_semi_sync_crash.inc
+
+--connection server_2
+--let $rows_so_far=3
+--eval INSERT INTO t1 VALUES ($rows_so_far, 'dummy3')
+--save_master_pos
+--echo # The gtid state on current master must be equal to ...
+SHOW VARIABLES LIKE 'gtid_binlog_pos';
+
+--connection server_1
+--sync_with_master
+--eval SELECT COUNT(*) = $rows_so_far as 'true' FROM t1
+--echo # ... the gtid states on the slave:
+SHOW VARIABLES LIKE 'gtid_slave_pos';
+SHOW VARIABLES LIKE 'gtid_binlog_pos';
+
+--connection server_2
+#
+# CRASH the new master and FAILOVER back to the original
+#
+
+# value 0 for the reverse server id 2 -> 1 failover
+--let $failover_to_slave=0
+--let $query_to_crash = INSERT INTO t1 VALUES (4, REPEAT("x", 4100))
+--let $query2_to_crash= INSERT INTO t1 VALUES (5, REPEAT("x", 4100))
+--let $log_search_pattern=truncated binlog file:.*slave.*000001
+--source rpl_semi_sync_crash.inc
+
+--connection server_1
+--let $rows_so_far=6
+--eval INSERT INTO t1 VALUES ($rows_so_far, 'Done')
+--save_master_pos
+--echo # The gtid state on current master must be equal to ...
+SHOW VARIABLES LIKE 'gtid_binlog_pos';
+
+--connection server_2
+--sync_with_master
+--eval SELECT COUNT(*) = $rows_so_far as 'true' FROM t1
+--echo # ... the gtid states on the slave:
+SHOW VARIABLES LIKE 'gtid_slave_pos';
+SHOW VARIABLES LIKE 'gtid_binlog_pos';
+
+
+--let $diff_tables=server_1:t1, server_2:t1
+--source include/diff_tables.inc
+
+#
+--echo # Cleanup
+#
+--connection server_1
+DROP TABLE t1;
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+
+--connection server_1
+set @@global.rpl_semi_sync_master_enabled = 0;
+set @@global.rpl_semi_sync_slave_enabled = 0;
+set @@global.rpl_semi_sync_master_wait_point=default;
+RESET SLAVE;
+RESET MASTER;
+
+--connection server_2
+set @@global.rpl_semi_sync_master_enabled = 0;
+set @@global.rpl_semi_sync_slave_enabled = 0;
+set @@global.rpl_semi_sync_master_wait_point=default;
+
+evalp CHANGE MASTER TO master_host='127.0.0.1', master_port=$SERVER_MYPORT_1, master_user='root', master_use_gtid=no;
+--source include/start_slave.inc
+
+connection default;
+--enable_reconnect
+--source include/wait_until_connected_again.inc
+
+--source include/rpl_end.inc
diff --git a/sql/handler.h b/sql/handler.h
index fc69d9423b4..05a62ed0021 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -873,6 +874,15 @@ typedef struct xid_t XID;
 /* The 'buf' has to have space for at least SQL_XIDSIZE bytes. */
 uint get_sql_xid(XID *xid, char *buf);
+/* struct for semisync slave binlog truncate recovery */
+struct xid_recovery_member
+{
+  my_xid xid;
+  uint in_engine_prepare;  // number of engines that have xid prepared
+  bool decided_to_commit;
+  std::pair<uint, my_off_t> binlog_coord; // semisync recovery binlog offset
wouldn't it be clearer to have a struct with named members?
in fact, I'm somewhat surprised there's no such struct for binlog coords
already.
...
+};
+
 /* for recover() handlerton call */
 #define MIN_XID_LIST_SIZE  128
 #define MAX_XID_LIST_SIZE  (1024*128)
@@ -4820,7 +4830,8 @@ int ha_commit_one_phase(THD *thd, bool all);
 int ha_commit_trans(THD *thd, bool all);
 int ha_rollback_trans(THD *thd, bool all);
 int ha_prepare(THD *thd);
-int ha_recover(HASH *commit_list);
+int ha_recover(HASH *commit_list, MEM_ROOT *mem_root= NULL);
+uint ha_recover_complete(HASH *commit_list, std::pair<uint, my_off_t> *coord= NULL);
is coord a truncation position?
...
/* transactions: these functions never call handlerton functions directly */
 int ha_enable_transaction(THD *thd, bool on);
diff --git a/sql/log_event.h b/sql/log_event.h
index 8a342cb5cd3..1036e9a44d4 100644
--- a/sql/log_event.h
+++ b/sql/log_event.h
@@ -482,6 +482,16 @@ class String;
 */
 #define LOG_EVENT_IGNORABLE_F 0x80
+/**
+   @def LOG_EVENT_ACCEPT_OWN_F
+
+   Flag sets by the semisync slave for accepting
+   the same server_id ("own") events which the slave must not have
+   in its state. Typically such events were never committed by
+   their originator (this server) and discared at its semisync-slave recovery.
+*/
+#define LOG_EVENT_ACCEPT_OWN_F 0x4000
maybe add an assert on all received events that such a flag is not set?
it can only be set on events in the relay log.

also, consider the case when this event is read from a relay log, applied,
and then sent to further slaves. In this case the flag must be removed
before sending, otherwise they might mistakenly apply it if the server_id
matches.
...
+
 /**
    @def LOG_EVENT_SKIP_REPLICATION_F
@@ -3357,6 +3367,12 @@ class Gtid_log_event: public Log_event
   uint64 commit_id;
   uint32 domain_id;
   uchar flags2;
+  uint  flags_extra; // more flags area placed after the regular flags2's one
+  /*
+    Extra to a "base" engine recoverable engines participating
+    in the transaction. Zero, when the base engine only is present.
what's a "base engine"?
...
+  */
+  uint8 extra_engines;
/* Flags2. */
diff --git a/sql/handler.cc b/sql/handler.cc
index c0a810a72bc..a46cef6b64c 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -1637,9 +1672,17 @@ commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
     DEBUG_SYNC(thd, "commit_one_phase_2");
   if (ha_info)
   {
+    int err;
+
+    if (has_binlog_hton(ha_info) &&
can you replace has_binlog_hton() with a direct check, e.g. that the trx cache
is not empty or that binlog is enabled, or something like that?
...
+        (err= binlog_commit(thd, all,
+                            is_ro_1pc_trans(thd, ha_info, all, is_real_trans))))
+    {
+      my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
+      error= 1;
+    }
     for (; ha_info; ha_info= ha_info_next)
     {
-      int err;
       handlerton *ht= ha_info->ht();
       if ((err= ht->commit(ht, thd, all)))
       {
@@ -1962,8 +2008,177 @@ struct xarecover_st
   XID *list;
   HASH *commit_list;
   bool dry_run;
+  MEM_ROOT *mem_root;
+  bool error;
 };
+/**
+  Inserts a new hash member.
+
+  returns a successfully created and inserted @c xid_recovery_member
+           into hash @c hash_arg,
+           or NULL.
+*/
+static xid_recovery_member*
+xid_member_insert(HASH *hash_arg, my_xid xid_arg, MEM_ROOT *ptr_mem_root)
+{
+  xid_recovery_member *member= (xid_recovery_member*)
+    alloc_root(ptr_mem_root, sizeof(xid_recovery_member));
+  if (!member)
+    return NULL;
+
+  member->xid= xid_arg;
+  member->in_engine_prepare= 1;
+  member->decided_to_commit= false;
+
+  return my_hash_insert(hash_arg, (uchar*) member) ? NULL : member;
+}
+
+/*
+  Inserts a new or updates an existing hash member to increment
+  the member's prepare counter.
+
+  returns false  on success,
+           true   otherwise.
+*/
+static bool xid_member_replace(HASH *hash_arg, my_xid xid_arg,
+                               MEM_ROOT *ptr_mem_root)
+{
+  xid_recovery_member* member;
+  if ((member= (xid_recovery_member *)
+       my_hash_search(hash_arg, (uchar *)& xid_arg, sizeof(xid_arg))))
+    member->in_engine_prepare++;
+  else
+    member= xid_member_insert(hash_arg, xid_arg, ptr_mem_root);
+
+  return member == NULL;
+}
+
+/*
+  Decision to commit returns true, otherwise false for rollback.
+  Flagged to commit member is destined to commit. If it is in doubt in case
+  A. the caller does not specify coord_ptr (always so in the normal recovery), or
+  B. coord_ptr is not NULL (can only be so in the semisync slave case) and its
+     offset is greater than that of the member's the decision is rollback.
+  If both A,B do not hold - which is the semisync slave recovery case -
+  the decision is to rollback.
+*/
+static bool xarecover_decide(xid_recovery_member* member,
+                             xid_t x, std::pair<uint, my_off_t> *coord_ptr)
+{
+  return
+    member->decided_to_commit ? true :
+    !coord_ptr ? false :
+    (member->binlog_coord < *coord_ptr ?  // semisync slave recovery
+     true : false);
+}
+
+struct xarecover_iterate_arg
+{
+  handlerton *hton;
+  std::pair<uint, my_off_t> *binlog_coord;
+};
+
+/*
+  Hash iterate function to complete with commit or rollback as either
+  has been decided already or decide now (in the semisync recovery)
+  via comparison against passed offset.
+  Commit when the offset is greater than that of the member.
+*/
+static my_bool xarecover_do_commit_or_rollback(void *member_arg,
+                                               void *iter_arg)
+{
+  xid_recovery_member *member= (xid_recovery_member*) member_arg;
+  handlerton *hton= ((xarecover_iterate_arg*) iter_arg)->hton;
+  std::pair<uint, my_off_t> *max_coord_ptr=
+    ((xarecover_iterate_arg*) iter_arg)->binlog_coord;
+  xid_t x;
+  my_bool rc;
+
+  x.set(member->xid);
+
+  rc= xarecover_decide(member, x, max_coord_ptr) ?
+    hton->commit_by_xid(hton, &x) : hton->rollback_by_xid(hton, &x);
+
+  DBUG_ASSERT(rc || member->in_engine_prepare > 0);
+
+  if (!rc)
+  {
+    /*
+      This block relies on Engine to report XAER_NOTA at
+      "complete"_by_xid for unknown xid.
+    */
+    member->in_engine_prepare--;
+    if (global_system_variables.log_warnings > 2)
+      sql_print_warning("%s transaction with xid %llu",
maybe not a sql_print_warning, but a sql_print_information?
it's just an informational message
...
+                        member->decided_to_commit ?
+                        "Committed" : "Rolled back", (ulonglong) member->xid);
+  }
+
+  return false;
+}
+
+static my_bool xarecover_do_count_in_prepare(void *member_arg,
+                                             void *ptr_count)
+{
+  xid_recovery_member *member= (xid_recovery_member*) member_arg;
+  if (member->in_engine_prepare)
+  {
+    (*(uint*) ptr_count)++;
+    if (global_system_variables.log_warnings > 2)
+      sql_print_warning("Found prepared transaction with xid %llu",
+                        (ulonglong) member->xid);
+  }
+
+  return false;
+}
+
+struct xarecover_complete_arg
+{
+  HASH *commit_list;
+  std::pair<uint, my_off_t> *binlog_coord;
+};
+
+/*
+  Completes binlog recovery to invoke a decider function for
+  each transaction in doubt.
+*/
+static my_bool xarecover_binlog_handlerton(THD *unused,
+                                           plugin_ref plugin,
+                                           void *arg)
+{
+    handlerton *hton= plugin_hton(plugin);
+
+    if (hton->state == SHOW_OPTION_YES && hton->recover)
+    {
+      xarecover_iterate_arg iter_arg=
+        {
+          hton,
+          ((xarecover_complete_arg*) arg)->binlog_coord
+        };
+      my_hash_iterate(((xarecover_complete_arg*) arg)->commit_list,
+                      xarecover_do_commit_or_rollback, &iter_arg);
+    }
+
+    return FALSE;
+}
+
+/*
+  Completes binlog recovery to invoke decider functions for
+  each handerton.
+  Returns the number of transactions remained doubtful.
+*/
+uint ha_recover_complete(HASH *commit_list, std::pair<uint, my_off_t> *coord)
+{
+  uint count= 0;
+  xarecover_complete_arg complete_arg= { commit_list, coord };
+  plugin_foreach(NULL, xarecover_binlog_handlerton,
+                 MYSQL_STORAGE_ENGINE_PLUGIN, &complete_arg);
+  my_hash_iterate(commit_list, xarecover_do_count_in_prepare, &count);
wouldn't it be cleaner to do everything in one commit_list scan?

for every xid_recovery_member:
  run plugin_foreach, commit or rollback as needed
  increment a counter, if still in doubt
...
+
+  return count;
+}
+
 static my_bool xarecover_handlerton(THD *unused, plugin_ref plugin,
                                     void *arg)
 {
@@ -1973,6 +2188,9 @@ static my_bool xarecover_handlerton(THD *unused, plugin_ref plugin,
   if (hton->state == SHOW_OPTION_YES && hton->recover)
   {
+#ifndef DBUG_OFF
+    my_xid dbug_xid_list[128] __attribute__((unused)) = {0};
+#endif
What do you use it for?
...
     while ((got= hton->recover(hton, info->list, info->len)) > 0 )
     {
       sql_print_information("Found %d prepared transaction(s) in %s",
diff --git a/sql/log.cc b/sql/log.cc
index 8073f09ab88..a90d1e757e8 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -2050,14 +2058,17 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
     Otherwise, we accumulate the changes.
   */
   if (likely(!error) && ending_trans(thd, all))
+  {
+    cache_mngr->ro_1pc= ro_1pc;
     error= binlog_commit_flush_trx_cache(thd, all, cache_mngr);
+    cache_mngr->ro_1pc= false;
Why do you put it in cache_mngr instead of passing it down, like you pass `all`?
...
+  }
   /*
     This is part of the stmt rollback.
   */
   if (!all)
     cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
-
   THD_STAGE_INFO(thd, org_stage);
   DBUG_RETURN(error);
 }
@@ -9609,6 +9626,147 @@ int TC_LOG::using_heuristic_recover()
 /****** transaction coordinator log for 2pc - binlog() based solution ******/
 #define TC_LOG_BINLOG MYSQL_BIN_LOG
+/**
+  Truncates the current binlog to specified position. Removes the rest of binlogs
+  which are present after this binlog file.
+
+  @param  truncate_file    Holds the binlog name to be truncated
+  @param  truncate_pos     Position within binlog from where it needs to
+                           truncated.
+
+  @retval true             ok
+  @retval false            error
+
+*/
+bool MYSQL_BIN_LOG::truncate_and_remove_binlogs(const char *file_name,
+                                                my_off_t pos,
+                                                rpl_gtid *ptr_gtid)
+{
+  int error= 0;
+#ifdef HAVE_REPLICATION
+  LOG_INFO log_info;
+  THD *thd= current_thd;
+  my_off_t index_file_offset= 0;
+  File file= -1;
+  MY_STAT s;
+
+  if ((error= find_log_pos(&log_info, file_name, 1)))
+  {
+    sql_print_error("Failed to locate binary log file:%s."
+                    "Error:%d", file_name, error);
+    goto end;
+  }
+
+  while (!(error= find_next_log(&log_info, 1)))
+  {
+    if (!index_file_offset)
+    {
+      index_file_offset= log_info.index_file_start_offset;
+      if ((error= open_purge_index_file(TRUE)))
+      {
+        sql_print_error("Failed to open purge index "
+                        "file:%s. Error:%d", purge_index_file_name, error);
+        goto end;
+      }
+    }
+    if ((error= register_purge_index_entry(log_info.log_file_name)))
+    {
+      sql_print_error("Failed to copy %s to purge index"
+                      " file. Error:%d", log_info.log_file_name, error);
+      goto end;
+    }
+  }
+
+  if (error != LOG_INFO_EOF)
+  {
+    sql_print_error("Failed to find the next binlog to "
+                    "add to purge index register. Error:%d", error);
+    goto end;
+  }
+
+  if (is_inited_purge_index_file())
+  {
+    if (!index_file_offset)
+      index_file_offset= log_info.index_file_start_offset;
+
+    if ((error= sync_purge_index_file()))
+    {
+      sql_print_error("Failed to flush purge index "
+                      "file. Error:%d", error);
+      goto end;
+    }
+
+    // Trim index file
+    if ((error=
+         mysql_file_chsize(index_file.file, index_file_offset, '\n',
+                           MYF(MY_WME))) ||
+         (error=
+         mysql_file_sync(index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE))))
+    {
+      sql_print_error("Failed to trim binlog index "
+                      "file:%s to offset:%llu. Error:%d", index_file_name,
+                      index_file_offset, error);
+      goto end;
+    }
+
+    /* Reset data in old index cache */
+    if ((error= reinit_io_cache(&index_file, READ_CACHE, (my_off_t) 0, 0, 1)))
+    {
+      sql_print_error("Failed to reinit binlog index "
+                      "file. Error:%d", error);
+      goto end;
+    }
+
+    /* Read each entry from purge_index_file and delete the file. */
+    if ((error= purge_index_entry(thd, NULL, TRUE)))
+    {
+      sql_print_error("Failed to process registered "
+                      "files that would be purged.");
+      goto end;
+    }
+  }
+
+  DBUG_ASSERT(pos);
+
+  if ((file= mysql_file_open(key_file_binlog, file_name,
+                             O_RDWR | O_BINARY, MYF(MY_WME))) < 0)
+  {
+    error= 1;
+    sql_print_error("Failed to open binlog file:%s for "
+                    "truncation.", file_name);
+    goto end;
+  }
+  my_stat(file_name, &s, MYF(0));
+
+  /* Change binlog file size to truncate_pos */
+  if ((error=
+       mysql_file_chsize(file, pos, 0, MYF(MY_WME))) ||
+      (error= mysql_file_sync(file, MYF(MY_WME|MY_SYNC_FILESIZE))))
+  {
+    sql_print_error("Failed to trim the "
+                    "binlog file:%s to size:%llu. Error:%d",
+                    file_name, pos, error);
+    goto end;
+  }
+  else
+  {
+    char buf[21];
+
+    longlong10_to_str(ptr_gtid->seq_no, buf, 10);
+    sql_print_information("Successfully truncated binlog file:%s "
+                          "to pos:%llu to remove transactions starting from "
+                          "GTID %u-%u-%s", file_name, pos,
+                          ptr_gtid->domain_id, ptr_gtid->server_id, buf);
+  }
+
+end:
+  if (file >= 0)
+    mysql_file_close(file, MYF(MY_WME));
Why don't you clean the inuse flag here? You used to do it in the previous version of the patch.
...
+
+  error= error || close_purge_index_file();
+#endif
+  return error > 0;
+}
 int TC_LOG_BINLOG::open(const char *opt_name)
 {
   int      error= 1;
@@ -10215,34 +10914,50 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
       end_io_cache(&log);
       mysql_file_close(file, MYF(MY_WME));
       file= -1;
+      /*
+        NOTE: reading other binlog's FD is necessary for finding out
+        the checksum status of the respective binlog file.
+      */
Okay, but where do you read the other binlogs' FD?
In the previous patch I reviewed you had
`case FORMAT_DESCRIPTION_EVENT:` which read the FD and replaced fdle.
Now you don't have it anymore.
...
+      if (find_next_log(linfo, 1))
+      {
+        sql_print_error("Error reading binlog files during recovery. "
+                        "Aborting.");
+        goto err2;
+      }
     }
+#ifdef HAVE_REPLICATION
+    int rc= ctx.next_binlog_or_round(round, last_log_name,
+                                     binlog_checkpoint_name, linfo, this);
+    if (rc == -1)
+      goto err2;
+    else if (rc == 1)
+      break;                                     // all rounds done
+#else
     if (!strcmp(linfo->log_file_name, last_log_name))
       break;                                    // No more files to do
+    round++;
+#endif
+
     if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0)
     {
       sql_print_error("%s", errmsg);
       goto err2;
     }
-    /*
-      We do not need to read the Format_description_log_event of other binlog
-      files. It is not possible for a binlog checkpoint to span multiple
-      binlog files written by different versions of the server. So we can use
-      the first one read for reading from all binlog files.
-    */
-    if (find_next_log(linfo, 1))
-    {
-      sql_print_error("Error reading binlog files during recovery. Aborting.");
-      goto err2;
-    }
     fdle->reset_crypto();
-  }
+  } // end of for
   if (do_xa)
   {
-    if (ha_recover(&xids))
-      goto err2;
-
+    if (binlog_checkpoint_found)
+    {
+#ifndef HAVE_REPLICATION
+      if (ha_recover_complete(&xids))
So, ha_recover_complete() is for the no-semisync, no-replication case?
Basically, it should be the old behavior, exactly as before?
Why do you need ha_recover_complete() then, if it didn't exist before?
...
+#else
+      if (ctx.complete(this, xids))
+#endif
+        goto err2;
+    }
     free_root(&mem_root, MYF(0));
     my_hash_free(&xids);
   }
Regards,
Sergei
VP of MariaDB Server Engineering
and security@mariadb.org

    

Re: [Maria-developers] e92037989f7: MDEV-21117: refine the server binlog-based recovery for semisync

Sergei Golubchik