Hi Serg, As we discussed under review of MDEV-232, here is a separate patch that makes InnoDB/XtraDB commit checkpointing be more asynchroneous. See MDEV-532 for further description of this task. I hope you will review this, at your convenience. - Kristian. knielsen@knielsen-hq.org writes:
At http://bazaar.launchpad.net/~maria-captains/maria/10.0
------------------------------------------------------------ revno: 3435 revision-id: knielsen@knielsen-hq.org-20120914124453-zsap6hjclq3vrb6n parent: knielsen@knielsen-hq.org-20120913123129-kaujy4cw0jc9o08k committer: knielsen@knielsen-hq.org branch nick: work-10.0-mdev225-181-232 timestamp: Fri 2012-09-14 14:44:53 +0200 message: MDEV-532: Async InnoDB commit checkpoint.
Make the commit checkpoint inside InnoDB be asynchroneous. Implement a background thread in binlog to do the writing and flushing of binlog checkpoint events to disk. === modified file 'mysql-test/suite/binlog/r/binlog_checkpoint.result' --- a/mysql-test/suite/binlog/r/binlog_checkpoint.result 2012-09-13 12:31:29 +0000 +++ b/mysql-test/suite/binlog/r/binlog_checkpoint.result 2012-09-14 12:44:53 +0000 @@ -70,8 +70,14 @@ show binlog events in 'master-bin.000003 Log_name Pos Event_type Server_id End_log_pos Info master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION master-bin.000003 # Binlog_checkpoint # # master-bin.000001 +SET DEBUG_SYNC= "RESET"; +SET @old_dbug= @@global.DEBUG_DBUG; +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed"; SET DEBUG_SYNC= "now SIGNAL con2_continue"; con1 is still pending, no new binlog checkpoint should have been logged. +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed"; +SET GLOBAL debug_dbug= @old_dbug; +SET DEBUG_SYNC= "RESET"; show binlog events in 'master-bin.000003' from <binlog_start>; Log_name Pos Event_type Server_id End_log_pos Info master-bin.000003 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
=== modified file 'mysql-test/suite/binlog/r/binlog_xa_recover.result' --- a/mysql-test/suite/binlog/r/binlog_xa_recover.result 2012-09-13 12:31:29 +0000 +++ b/mysql-test/suite/binlog/r/binlog_xa_recover.result 2012-09-14 12:44:53 +0000 @@ -118,7 +118,11 @@ master-bin.00000<binlog_start> # Table_m master-bin.00000<binlog_start> # Write_rows # # table_id: # flags: STMT_END_F master-bin.00000<binlog_start> # Xid # # COMMIT /* XID */ SET DEBUG_SYNC= "now SIGNAL con10_cont"; +SET @old_dbug= @@global.DEBUG_DBUG; +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed"; SET DEBUG_SYNC= "now SIGNAL con12_cont"; +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed"; +SET GLOBAL debug_dbug= @old_dbug; SET DEBUG_SYNC= "now SIGNAL con11_cont"; Checking that master-bin.000004 is the last binlog checkpoint show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
=== modified file 'mysql-test/suite/binlog/t/binlog_checkpoint.test' --- a/mysql-test/suite/binlog/t/binlog_checkpoint.test 2012-09-13 12:31:29 +0000 +++ b/mysql-test/suite/binlog/t/binlog_checkpoint.test 2012-09-14 12:44:53 +0000 @@ -71,6 +71,12 @@ SET DEBUG_SYNC= "now WAIT_FOR con2_ready --let $binlog_file= master-bin.000003 --source include/show_binlog_events.inc
+# We need to sync the test case with the background processing of the +# commit checkpoint, otherwise we get nondeterministic results. +SET DEBUG_SYNC= "RESET"; +SET @old_dbug= @@global.DEBUG_DBUG; +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed"; + SET DEBUG_SYNC= "now SIGNAL con2_continue";
connection con2; @@ -78,6 +84,12 @@ reap;
connection default; --echo con1 is still pending, no new binlog checkpoint should have been logged. +# Make sure commit checkpoint is processed before we check that no checkpoint +# event has been binlogged. +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed"; +SET GLOBAL debug_dbug= @old_dbug; +SET DEBUG_SYNC= "RESET"; + --let $binlog_file= master-bin.000003 --source include/show_binlog_events.inc
=== modified file 'mysql-test/suite/binlog/t/binlog_xa_recover.test' --- a/mysql-test/suite/binlog/t/binlog_xa_recover.test 2012-09-13 12:31:29 +0000 +++ b/mysql-test/suite/binlog/t/binlog_xa_recover.test 2012-09-14 12:44:53 +0000 @@ -14,8 +14,24 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b ME # Insert some data to force a couple binlog rotations (3), so we get some # normal binlog checkpoints before starting the test. INSERT INTO t1 VALUES (100, REPEAT("x", 4100)); +# Wait for the master-bin.000002 binlog checkpoint to appear. +--let $wait_for_all= 0 +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000002" +--let $field= Info +--let $condition= = "master-bin.000002" +--source include/wait_show_condition.inc INSERT INTO t1 VALUES (101, REPEAT("x", 4100)); +--let $wait_for_all= 0 +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003" +--let $field= Info +--let $condition= = "master-bin.000003" +--source include/wait_show_condition.inc INSERT INTO t1 VALUES (102, REPEAT("x", 4100)); +--let $wait_for_all= 0 +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004" +--let $field= Info +--let $condition= = "master-bin.000004" +--source include/wait_show_condition.inc
# Now start a bunch of transactions that span multiple binlog # files. Leave then in the state prepared-but-not-committed in the engine @@ -153,10 +169,19 @@ SET DEBUG_SYNC= "now SIGNAL con10_cont"; connection con10; reap; connection default; + +# We need to sync the test case with the background processing of the +# commit checkpoint, otherwise we get nondeterministic results. +SET @old_dbug= @@global.DEBUG_DBUG; +SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed"; + SET DEBUG_SYNC= "now SIGNAL con12_cont"; connection con12; reap; connection default; +SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed"; +SET GLOBAL debug_dbug= @old_dbug; + SET DEBUG_SYNC= "now SIGNAL con11_cont"; connection con11; reap; @@ -210,7 +235,20 @@ RESET MASTER; # crash recovery fails due to the error insert used for previous test. INSERT INTO t1 VALUES (21, REPEAT("x", 4100)); INSERT INTO t1 VALUES (22, REPEAT("x", 4100)); +# Wait for the master-bin.000003 binlog checkpoint to appear. +--let $wait_for_all= 0 +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003" +--let $field= Info +--let $condition= = "master-bin.000003" +--source include/wait_show_condition.inc INSERT INTO t1 VALUES (23, REPEAT("x", 4100)); +# Wait for the last (master-bin.000004) binlog checkpoint to appear. +--let $wait_for_all= 0 +--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004" +--let $field= Info +--let $condition= = "master-bin.000004" +--source include/wait_show_condition.inc + --write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect wait-binlog_xa_recover.test EOF
=== modified file 'mysql-test/suite/perfschema/r/all_instances.result' --- a/mysql-test/suite/perfschema/r/all_instances.result 2012-06-22 09:46:28 +0000 +++ b/mysql-test/suite/perfschema/r/all_instances.result 2012-09-14 12:44:53 +0000 @@ -76,6 +76,7 @@ wait/synch/mutex/sql/Master_info::run_lo wait/synch/mutex/sql/Master_info::sleep_lock wait/synch/mutex/sql/MDL_map::mutex wait/synch/mutex/sql/MDL_wait::LOCK_wait_status +wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index @@ -129,6 +130,8 @@ wait/synch/cond/sql/Master_info::sleep_c wait/synch/cond/sql/Master_info::start_cond wait/synch/cond/sql/Master_info::stop_cond wait/synch/cond/sql/MDL_context::COND_wait_status +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond
=== modified file 'mysql-test/suite/perfschema/r/relaylog.result' --- a/mysql-test/suite/perfschema/r/relaylog.result 2012-06-22 09:46:28 +0000 +++ b/mysql-test/suite/perfschema/r/relaylog.result 2012-09-14 12:44:53 +0000 @@ -56,8 +56,11 @@ where event_name like "%MYSQL_BIN_LOG%" and event_name not like "%MYSQL_BIN_LOG::update_cond" order by event_name; EVENT_NAME COUNT_STAR +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread NONE +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE +wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY "Expect no slave relay log" @@ -131,8 +134,11 @@ where event_name like "%MYSQL_BIN_LOG%" and event_name not like "%MYSQL_BIN_LOG::update_cond" order by event_name; EVENT_NAME COUNT_STAR +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread MANY +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end NONE wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy NONE -wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list NONE +wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list MANY +wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index MANY wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list MANY "Expect a slave relay log"
=== modified file 'sql/debug_sync.cc' --- a/sql/debug_sync.cc 2012-03-28 17:26:00 +0000 +++ b/sql/debug_sync.cc 2012-09-14 12:44:53 +0000 @@ -984,6 +984,7 @@ static bool debug_sync_eval_action(THD * DBUG_ENTER("debug_sync_eval_action"); DBUG_ASSERT(thd); DBUG_ASSERT(action_str); + DBUG_PRINT("debug_sync", ("action_str='%s'", action_str));
/* Get debug sync point name. Or a special command.
=== modified file 'sql/log.cc' --- a/sql/log.cc 2012-09-13 12:31:29 +0000 +++ b/sql/log.cc 2012-09-14 12:44:53 +0000 @@ -53,6 +53,7 @@ #include "rpl_handler.h" #include "debug_sync.h" #include "sql_show.h" +#include "my_pthread.h"
/* max size of the log message */ #define MAX_LOG_BUFFER_SIZE 1024 @@ -106,6 +107,14 @@ static SHOW_VAR binlog_status_vars_detai {NullS, NullS, SHOW_LONG} };
+/* Variables for the binlog background thread. */ +static bool binlog_thread_started= false; +static bool binlog_background_thread_stop= false; +static MYSQL_BIN_LOG::xid_count_per_binlog * + binlog_background_thread_queue= NULL; + +static bool start_binlog_background_thread(); +
/** purge logs, master and slave sides both, related error code @@ -2957,12 +2966,27 @@ void MYSQL_BIN_LOG::cleanup() my_free(b); }
+ /* Wait for the binlog thread to stop. */ + if (!is_relay_log && binlog_thread_started) + { + mysql_mutex_lock(&LOCK_binlog_thread); + binlog_background_thread_stop= true; + mysql_cond_signal(&COND_binlog_thread); + while (binlog_background_thread_stop) + mysql_cond_wait(&COND_binlog_thread_end, &LOCK_binlog_thread); + mysql_mutex_unlock(&LOCK_binlog_thread); + binlog_thread_started= false; + } + mysql_mutex_destroy(&LOCK_log); mysql_mutex_destroy(&LOCK_index); mysql_mutex_destroy(&LOCK_xid_list); + mysql_mutex_destroy(&LOCK_binlog_thread); mysql_cond_destroy(&update_cond); mysql_cond_destroy(&COND_queue_busy); mysql_cond_destroy(&COND_xid_list); + mysql_cond_destroy(&COND_binlog_thread); + mysql_cond_destroy(&COND_binlog_thread_end); } DBUG_VOID_RETURN; } @@ -2988,6 +3012,11 @@ void MYSQL_BIN_LOG::init_pthread_objects mysql_cond_init(m_key_update_cond, &update_cond, 0); mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0); mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0); + + mysql_mutex_init(key_BINLOG_LOCK_binlog_thread, + &LOCK_binlog_thread, MY_MUTEX_INIT_FAST); + mysql_cond_init(key_BINLOG_COND_binlog_thread, &COND_binlog_thread, 0); + mysql_cond_init(key_BINLOG_COND_binlog_thread_end, &COND_binlog_thread_end, 0); }
@@ -3085,6 +3114,10 @@ bool MYSQL_BIN_LOG::open(const char *log DBUG_ENTER("MYSQL_BIN_LOG::open"); DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
+ if (!is_relay_log && !binlog_thread_started && + start_binlog_background_thread()) + DBUG_RETURN(1); + if (init_and_set_log_file_name(log_name, new_name, log_type_arg, io_cache_type_arg)) { @@ -5540,11 +5573,7 @@ bool general_log_write(THD *thd, enum en }
-/* - I would like to make this function static, but this causes compiler warnings - when it is declared as friend function in log.h. -*/ -void +static void binlog_checkpoint_callback(void *cookie) { MYSQL_BIN_LOG::xid_count_per_binlog *entry= @@ -8116,9 +8145,128 @@ int TC_LOG_BINLOG::unlog(ulong cookie, m void TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie) { - mark_xid_done(((xid_count_per_binlog *)cookie)->binlog_id, true); + xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie); + mysql_mutex_lock(&LOCK_binlog_thread); + entry->next_in_queue= binlog_background_thread_queue; + binlog_background_thread_queue= entry; + mysql_cond_signal(&COND_binlog_thread); + mysql_mutex_unlock(&LOCK_binlog_thread); }
+/* + Binlog service thread. + + This thread is used to log binlog checkpoints in the background, rather than + in the context of random storage engine threads that happen to call + commit_checkpoint_notify_ha() and may not like the delays while syncing + binlog to disk or may not be setup with all my_thread_init() and other + necessary stuff. + + In the future, this thread could also be used to do log rotation in the + background, which could elimiate all stalls around binlog rotations. +*/ +pthread_handler_t +binlog_background_thread(void *arg __attribute__((unused))) +{ + bool stop; + MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next; + THD *thd; + + my_thread_init(); + thd= new THD; + thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND; + my_pthread_setspecific_ptr(THR_THD, thd); + mysql_mutex_lock(&LOCK_thread_count); + thd->thread_id= thread_id++; + mysql_mutex_unlock(&LOCK_thread_count); + + for (;;) + { + /* + Wait until there is something in the queue to process, or we are asked + to shut down. + */ + thd_proc_info(thd, "Waiting for background binlog tasks"); + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_thread); + for (;;) + { + stop= binlog_background_thread_stop; + queue= binlog_background_thread_queue; + if (stop || queue) + break; + mysql_cond_wait(&mysql_bin_log.COND_binlog_thread, + &mysql_bin_log.LOCK_binlog_thread); + } + /* Grab the queue, if any. */ + binlog_background_thread_queue= NULL; + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_thread); + + /* Process any incoming commit_checkpoint_notify() calls. */ + while (queue) + { + thd_proc_info(thd, "Processing binlog checkpoint notification"); + /* Grab next pointer first, as mark_xid_done() may free the element. */ + next= queue->next_in_queue; + mysql_bin_log.mark_xid_done(queue->binlog_id, true); + queue= next; + + DBUG_EXECUTE_IF("binlog_background_checkpoint_processed", + DBUG_ASSERT(!debug_sync_set_action( + thd, + STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed"))); + ); + } + + if (stop) + break; + } + + thd_proc_info(thd, "Stopping binlog background thread"); + + mysql_mutex_lock(&LOCK_thread_count); + delete thd; + mysql_mutex_unlock(&LOCK_thread_count); + + my_thread_end(); + + /* Signal that we are (almost) stopped. */ + mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_thread); + binlog_background_thread_stop= false; + mysql_cond_signal(&mysql_bin_log.COND_binlog_thread_end); + mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_thread); + + return 0; +} + +#ifdef HAVE_PSI_INTERFACE +static PSI_thread_key key_thread_binlog; + +static PSI_thread_info all_binlog_threads[]= +{ + { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL}, +}; +#endif /* HAVE_PSI_INTERFACE */ + +static bool +start_binlog_background_thread() +{ + pthread_t th; + +#ifdef HAVE_PSI_INTERFACE + if (PSI_server) + PSI_server->register_thread("sql", all_binlog_threads, + array_elements(all_binlog_threads)); +#endif + + if (mysql_thread_create(key_thread_binlog, &th, NULL, + binlog_background_thread, NULL)) + return 1; + + binlog_thread_started= true; + return 0; +} + + int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, IO_CACHE *first_log, Format_description_log_event *fdle)
=== modified file 'sql/log.h' --- a/sql/log.h 2012-09-13 12:31:29 +0000 +++ b/sql/log.h 2012-09-14 12:44:53 +0000 @@ -395,8 +395,6 @@ class MYSQL_QUERY_LOG: public MYSQL_LOG #define BINLOG_COOKIE_IS_DUMMY(c) \ ( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID )
-void binlog_checkpoint_callback(void *cookie); - class binlog_cache_mngr; class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG { @@ -451,27 +449,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv };
/* - A list of struct xid_count_per_binlog is used to keep track of how many - XIDs are in prepared, but not committed, state in each binlog. And how - many commit_checkpoint_request()'s are pending. - - When count drops to zero in a binlog after rotation, it means that there - are no more XIDs in prepared state, so that binlog is no longer needed - for XA crash recovery, and we can log a new binlog checkpoint event. - - The list is protected against simultaneous access from multiple - threads by LOCK_xid_list. - */ - struct xid_count_per_binlog : public ilink { - char *binlog_name; - uint binlog_name_len; - ulong binlog_id; - /* Total prepared XIDs and pending checkpoint requests in this binlog. */ - long xid_count; - xid_count_per_binlog(); /* Give link error if constructor used. */ - }; - I_List<xid_count_per_binlog> binlog_xid_count_list; - /* When this is set, a RESET MASTER is in progress.
Then we should not write any binlog checkpoints into the binlog (that @@ -480,7 +457,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv checkpoint arrives - when all have arrived, RESET MASTER will complete. */ bool reset_master_pending; - friend void binlog_checkpoint_callback(void *cookie);
/* LOCK_log and LOCK_index are inited by init_pthread_objects() */ mysql_mutex_t LOCK_index; @@ -553,10 +529,35 @@ class MYSQL_BIN_LOG: public TC_LOG, priv int write_transaction_or_stmt(group_commit_entry *entry); bool write_transaction_to_binlog_events(group_commit_entry *entry); void trx_group_commit_leader(group_commit_entry *leader); - void mark_xid_done(ulong cookie, bool write_checkpoint); - void mark_xids_active(ulong cookie, uint xid_count);
public: + /* + A list of struct xid_count_per_binlog is used to keep track of how many + XIDs are in prepared, but not committed, state in each binlog. And how + many commit_checkpoint_request()'s are pending. + + When count drops to zero in a binlog after rotation, it means that there + are no more XIDs in prepared state, so that binlog is no longer needed + for XA crash recovery, and we can log a new binlog checkpoint event. + + The list is protected against simultaneous access from multiple + threads by LOCK_xid_list. + */ + struct xid_count_per_binlog : public ilink { + char *binlog_name; + uint binlog_name_len; + ulong binlog_id; + /* Total prepared XIDs and pending checkpoint requests in this binlog. */ + long xid_count; + /* For linking in requests to the binlog background thread. */ + xid_count_per_binlog *next_in_queue; + xid_count_per_binlog(); /* Give link error if constructor used. */ + }; + I_List<xid_count_per_binlog> binlog_xid_count_list; + mysql_mutex_t LOCK_binlog_thread; + mysql_cond_t COND_binlog_thread; + mysql_cond_t COND_binlog_thread_end; + using MYSQL_LOG::generate_name; using MYSQL_LOG::is_open;
@@ -712,6 +713,8 @@ class MYSQL_BIN_LOG: public TC_LOG, priv bool appendv(const char* buf,uint len,...); bool append(Log_event* ev);
+ void mark_xids_active(ulong cookie, uint xid_count); + void mark_xid_done(ulong cookie, bool write_checkpoint); void make_log_name(char* buf, const char* log_ident); bool is_active(const char* log_file_name); bool can_purge_log(const char *log_file_name);
=== modified file 'sql/mysqld.cc' --- a/sql/mysqld.cc 2012-09-13 12:31:29 +0000 +++ b/sql/mysqld.cc 2012-09-14 12:44:53 +0000 @@ -724,6 +724,7 @@ PSI_mutex_key key_LOCK_des_key_file; #endif /* HAVE_OPENSSL */
PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, + key_BINLOG_LOCK_binlog_thread, key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi, key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create, key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log, @@ -766,6 +767,7 @@ static PSI_mutex_info all_server_mutexes
{ &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0}, { &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0}, + { &key_BINLOG_LOCK_binlog_thread, "MYSQL_BIN_LOG::LOCK_binlog_thread", 0}, { &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0}, { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0}, { &key_hash_filo_lock, "hash_filo::lock", 0}, @@ -834,6 +836,7 @@ PSI_cond_key key_PAGE_cond, key_COND_act #endif /* HAVE_MMAP */
PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond, + key_BINLOG_COND_binlog_thread, key_BINLOG_COND_binlog_thread_end, key_COND_cache_status_changed, key_COND_manager, key_COND_rpl_status, key_COND_server_started, key_delayed_insert_cond, key_delayed_insert_cond_client, @@ -863,6 +866,8 @@ static PSI_cond_info all_server_conds[]= #endif /* HAVE_MMAP */ { &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0}, { &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0}, + { &key_BINLOG_COND_binlog_thread, "MYSQL_BIN_LOG::COND_binlog_thread", 0}, + { &key_BINLOG_COND_binlog_thread_end, "MYSQL_BIN_LOG::COND_binlog_thread_end", 0}, { &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0}, { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0}, { &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0},
=== modified file 'sql/mysqld.h' --- a/sql/mysqld.h 2012-09-13 12:31:29 +0000 +++ b/sql/mysqld.h 2012-09-14 12:44:53 +0000 @@ -226,6 +226,7 @@ extern PSI_mutex_key key_LOCK_des_key_fi #endif
extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list, + key_BINLOG_LOCK_binlog_thread, key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi, key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create, key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log, @@ -257,6 +258,7 @@ extern PSI_cond_key key_PAGE_cond, key_C #endif /* HAVE_MMAP */
extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond, + key_BINLOG_COND_binlog_thread, key_BINLOG_COND_binlog_thread_end, key_COND_cache_status_changed, key_COND_manager, key_COND_rpl_status, key_COND_server_started, key_delayed_insert_cond, key_delayed_insert_cond_client,
=== modified file 'sql/rpl_rli.cc' --- a/sql/rpl_rli.cc 2012-09-13 12:31:29 +0000 +++ b/sql/rpl_rli.cc 2012-09-14 12:44:53 +0000 @@ -58,6 +58,7 @@ Relay_log_info::Relay_log_info(bool is_s { DBUG_ENTER("Relay_log_info::Relay_log_info");
+ relay_log.is_relay_log= TRUE; #ifdef HAVE_PSI_INTERFACE relay_log.set_psi_keys(key_RELAYLOG_LOCK_index, key_RELAYLOG_update_cond, @@ -206,8 +207,6 @@ a file name for --relay-log-index option name_warning_sent= 1; }
- rli->relay_log.is_relay_log= TRUE; - /* note, that if open() fails, we'll still have index file open but a destructor will take care of that
=== modified file 'sql/sql_class.h' --- a/sql/sql_class.h 2012-09-13 12:31:29 +0000 +++ b/sql/sql_class.h 2012-09-14 12:44:53 +0000 @@ -1244,7 +1244,8 @@ enum enum_thread_type SYSTEM_THREAD_SLAVE_SQL= 4, SYSTEM_THREAD_NDBCLUSTER_BINLOG= 8, SYSTEM_THREAD_EVENT_SCHEDULER= 16, - SYSTEM_THREAD_EVENT_WORKER= 32 + SYSTEM_THREAD_EVENT_WORKER= 32, + SYSTEM_THREAD_BINLOG_BACKGROUND= 64 };
inline char const *
=== modified file 'storage/innobase/handler/ha_innodb.cc' --- a/storage/innobase/handler/ha_innodb.cc 2012-09-13 12:31:29 +0000 +++ b/storage/innobase/handler/ha_innodb.cc 2012-09-14 12:44:53 +0000 @@ -106,6 +106,7 @@ static ulong commit_threads = 0; static mysql_mutex_t commit_threads_m; static mysql_cond_t commit_cond; static mysql_mutex_t commit_cond_m; +static mysql_mutex_t pending_checkpoint_mutex; static bool innodb_inited = 0;
#define INSIDE_HA_INNOBASE_CC @@ -222,11 +223,13 @@ static mysql_pfs_key_t innobase_share_mu static mysql_pfs_key_t commit_threads_m_key; static mysql_pfs_key_t commit_cond_mutex_key; static mysql_pfs_key_t commit_cond_key; +static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = { {&commit_threads_m_key, "commit_threads_m", 0}, {&commit_cond_mutex_key, "commit_cond_mutex", 0}, - {&innobase_share_mutex_key, "innobase_share_mutex", 0} + {&innobase_share_mutex_key, "innobase_share_mutex", 0}, + {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0} };
static PSI_cond_info all_innodb_conds[] = { @@ -2592,6 +2595,9 @@ innobase_init( mysql_mutex_init(commit_cond_mutex_key, &commit_cond_m, MY_MUTEX_INIT_FAST); mysql_cond_init(commit_cond_key, &commit_cond, NULL); + mysql_mutex_init(pending_checkpoint_mutex_key, + &pending_checkpoint_mutex, + MY_MUTEX_INIT_FAST); innodb_inited= 1; #ifdef MYSQL_DYNAMIC_PLUGIN if (innobase_hton != p) { @@ -2639,6 +2645,7 @@ innobase_end( mysql_mutex_destroy(&commit_threads_m); mysql_mutex_destroy(&commit_cond_m); mysql_cond_destroy(&commit_cond); + mysql_mutex_destroy(&pending_checkpoint_mutex); }
DBUG_RETURN(err); @@ -3008,6 +3015,16 @@ innobase_rollback_trx( DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); }
+ +struct pending_checkpoint { + struct pending_checkpoint *next; + handlerton *hton; + void *cookie; + ib_uint64_t lsn; +}; +static struct pending_checkpoint *pending_checkpoint_list; +static struct pending_checkpoint *pending_checkpoint_list_end; + /*****************************************************************//** Handle a commit checkpoint request from server layer. We simply flush the redo log immediately and do the notify call.*/ @@ -3017,8 +3034,113 @@ innobase_checkpoint_request( handlerton *hton, void *cookie) { - log_buffer_flush_to_disk(); - commit_checkpoint_notify_ha(hton, cookie); + ib_uint64_t lsn; + ib_uint64_t flush_lsn; + struct pending_checkpoint * entry; + + /* Do the allocation outside of lock to reduce contention. The normal + case is that not everything is flushed, so we will need to enqueue. */ + entry = static_cast<struct pending_checkpoint *> + (my_malloc(sizeof(*entry), MYF(MY_WME))); + if (!entry) { + sql_print_error("Failed to allocate %u bytes." + " Commit checkpoint will be skipped.", + static_cast<unsigned>(sizeof(*entry))); + return; + } + + entry->next = NULL; + entry->hton = hton; + entry->cookie = cookie; + + mysql_mutex_lock(&pending_checkpoint_mutex); + lsn = log_get_lsn(); + flush_lsn = log_get_flush_lsn(); + if (lsn > flush_lsn) { + /* Put the request in queue. + When the log gets flushed past the lsn, we will remove the + entry from the queue and notify the upper layer. */ + entry->lsn = lsn; + if (pending_checkpoint_list_end) { + pending_checkpoint_list_end->next = entry; + } else { + pending_checkpoint_list = entry; + } + pending_checkpoint_list_end = entry; + entry = NULL; + } + mysql_mutex_unlock(&pending_checkpoint_mutex); + + if (entry) { + /* We are already flushed. Notify the checkpoint immediately. */ + commit_checkpoint_notify_ha(entry->hton, entry->cookie); + my_free(entry); + } +} + +/*****************************************************************//** +Log code calls this whenever log has been written and/or flushed up +to a new position. We use this to notify upper layer of a new commit +checkpoint when necessary.*/ +extern "C" UNIV_INTERN +void +innobase_mysql_log_notify( +/*===============*/ + ib_uint64_t write_lsn, /*!< in: LSN written to log file */ + ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */ +{ + struct pending_checkpoint * pending; + struct pending_checkpoint * entry; + struct pending_checkpoint * last_ready; + + /* It is safe to do a quick check for NULL first without lock. + Even if we should race, we will at most skip one checkpoint and + take the next one, which is harmless. */ + if (!pending_checkpoint_list) + return; + + mysql_mutex_lock(&pending_checkpoint_mutex); + pending = pending_checkpoint_list; + if (!pending) + { + mysql_mutex_unlock(&pending_checkpoint_mutex); + return; + } + + last_ready = NULL; + for (entry = pending; entry != NULL; entry = entry -> next) + { + if (entry->lsn > flush_lsn) + break; + last_ready = entry; + } + + if (last_ready) + { + /* We found some pending checkpoints that are now flushed to + disk. So remove them from the list. */ + pending_checkpoint_list = entry; + if (!entry) + pending_checkpoint_list_end = NULL; + } + + mysql_mutex_unlock(&pending_checkpoint_mutex); + + if (!last_ready) + return; + + /* Now that we have released the lock, notify upper layer about all + commit checkpoints that have now completed. */ + for (;;) { + entry = pending; + pending = pending->next; + + commit_checkpoint_notify_ha(entry->hton, entry->cookie); + + my_free(entry); + if (entry == last_ready) + break; + } }
/*****************************************************************//**
=== modified file 'storage/innobase/include/ha_prototypes.h' --- a/storage/innobase/include/ha_prototypes.h 2011-04-26 17:55:52 +0000 +++ b/storage/innobase/include/ha_prototypes.h 2012-09-14 12:44:53 +0000 @@ -136,6 +136,17 @@ innobase_mysql_print_thd( uint max_query_len); /*!< in: max query length to print, or 0 to use the default max length */
+/*****************************************************************//** +Log code calls this whenever log has been written and/or flushed up +to a new position. We use this to notify upper layer of a new commit +checkpoint when necessary.*/ +UNIV_INTERN +void +innobase_mysql_log_notify( +/*===============*/ + ib_uint64_t write_lsn, /*!< in: LSN written to log file */ + ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */ + /**************************************************************//** Converts a MySQL type to an InnoDB type. Note that this function returns the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
=== modified file 'storage/innobase/include/log0log.h' --- a/storage/innobase/include/log0log.h 2012-06-07 13:44:26 +0000 +++ b/storage/innobase/include/log0log.h 2012-09-14 12:44:53 +0000 @@ -151,6 +151,13 @@ UNIV_INLINE ib_uint64_t log_get_lsn(void); /*=============*/ +/************************************************************//** +Gets the last lsn that is fully flushed to disk. +@return last flushed lsn */ +UNIV_INLINE +ib_uint64_t +log_get_flush_lsn(void); +/*=============*/ /**************************************************************** Gets the log group capacity. It is OK to read the value without holding log_sys->mutex because it is constant.
=== modified file 'storage/innobase/include/log0log.ic' --- a/storage/innobase/include/log0log.ic 2011-04-05 07:18:43 +0000 +++ b/storage/innobase/include/log0log.ic 2012-09-14 12:44:53 +0000 @@ -411,6 +411,25 @@ log_get_lsn(void) return(lsn); }
+/************************************************************//** +Gets the last lsn that is fully flushed to disk. +@return last flushed lsn */ +UNIV_INLINE +ib_uint64_t +log_get_flush_lsn(void) +/*=============*/ +{ + ib_uint64_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->flushed_to_disk_lsn; + + mutex_exit(&(log_sys->mutex)); + + return(lsn); +} + /**************************************************************** Gets the log group capacity. It is OK to read the value without holding log_sys->mutex because it is constant.
=== modified file 'storage/innobase/log/log0log.c' --- a/storage/innobase/log/log0log.c 2012-03-21 03:48:12 +0000 +++ b/storage/innobase/log/log0log.c 2012-09-14 12:44:53 +0000 @@ -1353,6 +1353,8 @@ log_write_up_to( ulint loop_count = 0; #endif /* UNIV_DEBUG */ ulint unlock; + ib_uint64_t write_lsn; + ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) { /* Recovery is running and no operations on the log files are @@ -1530,8 +1532,13 @@ log_write_up_to(
log_flush_do_unlocks(unlock);
+ write_lsn = log_sys->write_lsn; + flush_lsn = log_sys->flushed_to_disk_lsn; + mutex_exit(&(log_sys->mutex));
+ innobase_mysql_log_notify(write_lsn, flush_lsn); + return;
do_waits:
=== modified file 'storage/xtradb/handler/ha_innodb.cc' --- a/storage/xtradb/handler/ha_innodb.cc 2012-09-13 12:31:29 +0000 +++ b/storage/xtradb/handler/ha_innodb.cc 2012-09-14 12:44:53 +0000 @@ -120,6 +120,7 @@ static ulong commit_threads = 0; static mysql_mutex_t commit_threads_m; static mysql_cond_t commit_cond; static mysql_mutex_t commit_cond_m; +static mysql_mutex_t pending_checkpoint_mutex; static bool innodb_inited = 0;
@@ -253,11 +254,13 @@ static mysql_pfs_key_t innobase_share_mu static mysql_pfs_key_t commit_threads_m_key; static mysql_pfs_key_t commit_cond_mutex_key; static mysql_pfs_key_t commit_cond_key; +static mysql_pfs_key_t pending_checkpoint_mutex_key;
static PSI_mutex_info all_pthread_mutexes[] = { {&commit_threads_m_key, "commit_threads_m", 0}, {&commit_cond_mutex_key, "commit_cond_mutex", 0}, - {&innobase_share_mutex_key, "innobase_share_mutex", 0} + {&innobase_share_mutex_key, "innobase_share_mutex", 0}, + {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0} };
static PSI_cond_info all_innodb_conds[] = { @@ -3060,6 +3063,9 @@ innobase_init( mysql_mutex_init(commit_cond_mutex_key, &commit_cond_m, MY_MUTEX_INIT_FAST); mysql_cond_init(commit_cond_key, &commit_cond, NULL); + mysql_mutex_init(pending_checkpoint_mutex_key, + &pending_checkpoint_mutex, + MY_MUTEX_INIT_FAST); innodb_inited= 1; #ifdef MYSQL_DYNAMIC_PLUGIN if (innobase_hton != p) { @@ -3107,6 +3113,7 @@ innobase_end( mysql_mutex_destroy(&commit_threads_m); mysql_mutex_destroy(&commit_cond_m); mysql_cond_destroy(&commit_cond); + mysql_mutex_destroy(&pending_checkpoint_mutex); }
DBUG_RETURN(err); @@ -3500,6 +3507,16 @@ innobase_rollback_trx( DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); }
+ +struct pending_checkpoint { + struct pending_checkpoint *next; + handlerton *hton; + void *cookie; + ib_uint64_t lsn; +}; +static struct pending_checkpoint *pending_checkpoint_list; +static struct pending_checkpoint *pending_checkpoint_list_end; + /*****************************************************************//** Handle a commit checkpoint request from server layer. We simply flush the redo log immediately and do the notify call.*/ @@ -3509,8 +3526,113 @@ innobase_checkpoint_request( handlerton *hton, void *cookie) { - log_buffer_flush_to_disk(); - commit_checkpoint_notify_ha(hton, cookie); + ib_uint64_t lsn; + ib_uint64_t flush_lsn; + struct pending_checkpoint * entry; + + /* Do the allocation outside of lock to reduce contention. The normal + case is that not everything is flushed, so we will need to enqueue. */ + entry = static_cast<struct pending_checkpoint *> + (my_malloc(sizeof(*entry), MYF(MY_WME))); + if (!entry) { + sql_print_error("Failed to allocate %u bytes." + " Commit checkpoint will be skipped.", + static_cast<unsigned>(sizeof(*entry))); + return; + } + + entry->next = NULL; + entry->hton = hton; + entry->cookie = cookie; + + mysql_mutex_lock(&pending_checkpoint_mutex); + lsn = log_get_lsn(); + flush_lsn = log_get_flush_lsn(); + if (lsn > flush_lsn) { + /* Put the request in queue. + When the log gets flushed past the lsn, we will remove the + entry from the queue and notify the upper layer. */ + entry->lsn = lsn; + if (pending_checkpoint_list_end) { + pending_checkpoint_list_end->next = entry; + } else { + pending_checkpoint_list = entry; + } + pending_checkpoint_list_end = entry; + entry = NULL; + } + mysql_mutex_unlock(&pending_checkpoint_mutex); + + if (entry) { + /* We are already flushed. Notify the checkpoint immediately. */ + commit_checkpoint_notify_ha(entry->hton, entry->cookie); + my_free(entry); + } +} + +/*****************************************************************//** +Log code calls this whenever log has been written and/or flushed up +to a new position. We use this to notify upper layer of a new commit +checkpoint when necessary.*/ +extern "C" UNIV_INTERN +void +innobase_mysql_log_notify( +/*===============*/ + ib_uint64_t write_lsn, /*!< in: LSN written to log file */ + ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */ +{ + struct pending_checkpoint * pending; + struct pending_checkpoint * entry; + struct pending_checkpoint * last_ready; + + /* It is safe to do a quick check for NULL first without lock. + Even if we should race, we will at most skip one checkpoint and + take the next one, which is harmless. */ + if (!pending_checkpoint_list) + return; + + mysql_mutex_lock(&pending_checkpoint_mutex); + pending = pending_checkpoint_list; + if (!pending) + { + mysql_mutex_unlock(&pending_checkpoint_mutex); + return; + } + + last_ready = NULL; + for (entry = pending; entry != NULL; entry = entry -> next) + { + if (entry->lsn > flush_lsn) + break; + last_ready = entry; + } + + if (last_ready) + { + /* We found some pending checkpoints that are now flushed to + disk. So remove them from the list. */ + pending_checkpoint_list = entry; + if (!entry) + pending_checkpoint_list_end = NULL; + } + + mysql_mutex_unlock(&pending_checkpoint_mutex); + + if (!last_ready) + return; + + /* Now that we have released the lock, notify upper layer about all + commit checkpoints that have now completed. */ + for (;;) { + entry = pending; + pending = pending->next; + + commit_checkpoint_notify_ha(entry->hton, entry->cookie); + + my_free(entry); + if (entry == last_ready) + break; + } }
/*****************************************************************//**
=== modified file 'storage/xtradb/include/ha_prototypes.h' --- a/storage/xtradb/include/ha_prototypes.h 2012-02-21 19:51:56 +0000 +++ b/storage/xtradb/include/ha_prototypes.h 2012-09-14 12:44:53 +0000 @@ -136,6 +136,17 @@ innobase_mysql_print_thd( uint max_query_len); /*!< in: max query length to print, or 0 to use the default max length */
+/*****************************************************************//** +Log code calls this whenever log has been written and/or flushed up +to a new position. We use this to notify upper layer of a new commit +checkpoint when necessary.*/ +UNIV_INTERN +void +innobase_mysql_log_notify( +/*===============*/ + ib_uint64_t write_lsn, /*!< in: LSN written to log file */ + ib_uint64_t flush_lsn); /*!< in: LSN flushed to disk */ + /**************************************************************//** Converts a MySQL type to an InnoDB type. Note that this function returns the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
=== modified file 'storage/xtradb/include/log0log.h' --- a/storage/xtradb/include/log0log.h 2012-08-27 16:13:17 +0000 +++ b/storage/xtradb/include/log0log.h 2012-09-14 12:44:53 +0000 @@ -151,6 +151,13 @@ UNIV_INLINE ib_uint64_t log_get_lsn(void); /*=============*/ +/************************************************************//** +Gets the last lsn that is fully flushed to disk. +@return last flushed lsn */ +UNIV_INLINE +ib_uint64_t +log_get_flush_lsn(void); +/*=============*/ /**************************************************************** Gets the log group capacity. It is OK to read the value without holding log_sys->mutex because it is constant.
=== modified file 'storage/xtradb/include/log0log.ic' --- a/storage/xtradb/include/log0log.ic 2011-07-14 19:22:41 +0000 +++ b/storage/xtradb/include/log0log.ic 2012-09-14 12:44:53 +0000 @@ -411,6 +411,25 @@ log_get_lsn(void) return(lsn); }
+/************************************************************//** +Gets the last lsn that is fully flushed to disk. +@return last flushed lsn */ +UNIV_INLINE +ib_uint64_t +log_get_flush_lsn(void) +/*=============*/ +{ + ib_uint64_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->flushed_to_disk_lsn; + + mutex_exit(&(log_sys->mutex)); + + return(lsn); +} + /**************************************************************** Gets the log group capacity. It is OK to read the value without holding log_sys->mutex because it is constant.
=== modified file 'storage/xtradb/log/log0log.c' --- a/storage/xtradb/log/log0log.c 2012-08-27 16:13:17 +0000 +++ b/storage/xtradb/log/log0log.c 2012-09-14 12:44:53 +0000 @@ -1390,6 +1390,8 @@ log_write_up_to( ulint loop_count = 0; #endif /* UNIV_DEBUG */ ulint unlock; + ib_uint64_t write_lsn; + ib_uint64_t flush_lsn;
if (recv_no_ibuf_operations) { /* Recovery is running and no operations on the log files are @@ -1568,8 +1570,13 @@ log_write_up_to(
log_flush_do_unlocks(unlock);
+ write_lsn = log_sys->write_lsn; + flush_lsn = log_sys->flushed_to_disk_lsn; + mutex_exit(&(log_sys->mutex));
+ innobase_mysql_log_notify(write_lsn, flush_lsn); + return;
do_waits:
_______________________________________________ commits mailing list commits@mariadb.org https://lists.askmonty.org/cgi-bin/mailman/listinfo/commits