[PATCH] MDEV-34481 optimize away waiting for owned by prepared xa non-unique index record
From: Andrei <andrei.elkin@mariadb.com> This work partly implements a ROW binlog format MDEV-33455 part that is makes non-unique-index-only table XA replication safe in RBR. Existence of at least one non-null unique index has always guaranteed safety (no hang to error). Two transaction that update a non-unique-only index table could not be isolated on slave when on slave they used different non-unique indexes than on master. Unsolvable hang could be seen in case the first of the two is a prepared XA --connection slave_worker_1 xa start 'xid'; /* ... lock here ... */ ; xa prepare 'xid' while the 2nd being of any kind including of normal type --connection slave_worker_2 begin; /* ... get lock ... => wait/hang...error out */ was unable to wait up for the conflicting lock, even though the XA transaction did not really lock target records of the 2nd. This type of hang was caused by a chosen method the 2nd transaction employs to reach the target record, which is the index scan. The scanning orthodoxically just could not step over a record in the way that was locked by the XA. However as the in-the-way record can not be targeted by the 2nd transaction, otherwise the transactions would have sensed the conflict back on master *and* the other possibility of collecting extra locks by the 'xid' on non-modified records is tacked by MDEV-33454/MDEV-34466, the non-unique index scanning server/slave-applier layer must not panic at seeing a timeout error from the engine. Instead the scanning would just proceed to next possibly free index records of the same key value and ultimately must reach the target one. More generally, on the way to its target all busy records belonging to earlier (binlog order) prepared XA transactions need not be tried locking by the current non-unique index scanning transaction. This patch implements the plan for Innodb. The server layer expects the engine to mark an attempt to wait for a conflicting lock that belongs to a transaction in prepared state. The engine won't exercise, need not to, the timeout wait. When marking is done the timeout error is ignored by the server and next index record is tried out. An mtr test checks a scenario in sequential and parallel modes. --- sql/log_event_server.cc | 37 +++++++++++++++++++++++++----- sql/rpl_rli.cc | 4 +++- sql/rpl_rli.h | 14 +++++++++++ sql/sql_class.cc | 30 +++++++++++++++++++----- storage/innobase/lock/lock0lock.cc | 30 +++++++++++++++++------- 5 files changed, 94 insertions(+), 21 deletions(-) diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index f3d01b7d9ed..166295ed263 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -8105,6 +8105,11 @@ int Rows_log_event::find_row(rpl_group_info *rgi) if (m_key_info) { + bool skip_first_compare= false; + bool is_index_unique= + (table->key_info->flags & HA_NOSAME) && + !(table->key_info->flags & (HA_NULL_PART_KEY)); + DBUG_PRINT("info",("locating record using key #%u [%s] (index_read)", m_key_nr, m_key_info->name.str)); /* We use this to test that the correct key is used in test cases. */ @@ -8149,11 +8154,20 @@ int Rows_log_event::find_row(rpl_group_info *rgi) HA_READ_KEY_EXACT)))) { DBUG_PRINT("info",("no record matching the key found in the table")); - if (error == HA_ERR_KEY_NOT_FOUND) - error= row_not_found_error(rgi); - table->file->print_error(error, MYF(0)); - table->file->ha_index_end(); - goto end; + if (error == HA_ERR_LOCK_WAIT_TIMEOUT && !is_index_unique && + (rgi->exec_flags & (1 << rpl_group_info::HIT_BUSY_INDEX))) + { + rgi->exec_flags &= ~(1 << rpl_group_info::HIT_BUSY_INDEX); + skip_first_compare= true; + } + else + { + if (error == HA_ERR_KEY_NOT_FOUND) + error= row_not_found_error(rgi); + table->file->print_error(error, MYF(0)); + table->file->ha_index_end(); + goto end; + } } /* @@ -8222,17 +8236,28 @@ int Rows_log_event::find_row(rpl_group_info *rgi) /* We use this to test that the correct key is used in test cases. */ DBUG_EXECUTE_IF("slave_crash_if_index_scan", abort();); - while (record_compare(table, m_vers_from_plain)) + while (skip_first_compare || record_compare(table, m_vers_from_plain)) { + if (skip_first_compare) + skip_first_compare= false; while ((error= table->file->ha_index_next(table->record[0]))) { DBUG_PRINT("info",("no record matching the given row found")); if (error == HA_ERR_END_OF_FILE) error= end_of_file_error(rgi); + else if (error == HA_ERR_LOCK_WAIT_TIMEOUT && !is_index_unique && + (rgi->exec_flags & (1 << rpl_group_info::HIT_BUSY_INDEX))) + { + rgi->exec_flags &= ~(1 << rpl_group_info::HIT_BUSY_INDEX); + continue; + } + DBUG_ASSERT(!(rgi->exec_flags & (1 << rpl_group_info::HIT_BUSY_INDEX))); + table->file->print_error(error, MYF(0)); table->file->ha_index_end(); goto end; } + DBUG_ASSERT(!(rgi->exec_flags & (1 << rpl_group_info::HIT_BUSY_INDEX))); } } else diff --git a/sql/rpl_rli.cc b/sql/rpl_rli.cc index 3b036cdaa79..2839b6e68b0 100644 --- a/sql/rpl_rli.cc +++ b/sql/rpl_rli.cc @@ -2154,12 +2154,14 @@ rpl_group_info::reinit(Relay_log_info *rli) gtid_ignore_duplicate_state= GTID_DUPLICATE_NULL; speculation= SPECULATE_NO; commit_orderer.reinit(); + exec_flags= 0; } rpl_group_info::rpl_group_info(Relay_log_info *rli) : thd(0), wait_commit_sub_id(0), wait_commit_group_info(0), parallel_entry(0), - deferred_events(NULL), m_annotate_event(0), is_parallel_exec(false) + deferred_events(NULL), m_annotate_event(0), is_parallel_exec(false), + exec_flags(0) { reinit(rli); bzero(¤t_gtid, sizeof(current_gtid)); diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index 3f24481839b..a9ec44b58cb 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -833,6 +833,15 @@ struct rpl_group_info RETRY_KILL_KILLED }; uchar killed_for_retry; + enum enum_exec_flags { + HIT_BUSY_INDEX= 0 + }; + /* + Being executed replication event's context. It could contain + notification from engine such as attempts to lock already acquired + index record. + */ + uint32 exec_flags; rpl_group_info(Relay_log_info *rli_); ~rpl_group_info(); @@ -960,6 +969,11 @@ struct rpl_group_info if (!is_parallel_exec) rli->event_relay_log_pos= future_event_relay_log_pos; } + + bool is_row_event_execution() + { + return m_table_map.count() > 0; + } }; diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 6419a58fbe4..b2321cfd865 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -5418,7 +5418,9 @@ thd_need_wait_reports(const MYSQL_THD thd) /* Used by storage engines (currently InnoDB) to report that one transaction THD is about to go to wait for a transactional lock held by - another transactions OTHER_THD. + another transactions OTHER_THD. Also in case of a non-unique index locking + the slave applier is informed on a prepared XA transaction holding + an offending lock. This is used for parallel replication, where transactions are required to commit in the same order on the slave as they did on the master. If the @@ -5432,6 +5434,9 @@ thd_need_wait_reports(const MYSQL_THD thd) slave in the first place. However, it is possible in case when the optimizer chooses a different plan on the slave than on the master (eg. table scan instead of index scan). + When the latter takes places and the conflict on a non-unique index involves + the slave applier the latter needs to know whether the lock onwer is + a prepared XA and this is provided. Storage engines report lock waits using this call. If a lock wait causes a deadlock with the pre-determined commit order, we kill the later @@ -5444,20 +5449,33 @@ thd_need_wait_reports(const MYSQL_THD thd) transaction. */ extern "C" int -thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd) +thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd, + bool is_other_prepared, bool is_index_unique, + uint *flagged) { - rpl_group_info *rgi; - rpl_group_info *other_rgi; + rpl_group_info *rgi= thd ? thd->rgi_slave : NULL; + rpl_group_info *other_rgi= other_thd ? other_thd->rgi_slave : NULL; if (!thd) return 0; DEBUG_SYNC(thd, "thd_report_wait_for"); thd->transaction->stmt.mark_trans_did_wait(); + + /* + Return to the caller the fact of the non-unique index wait lock + conflicts with one of a prepared state transaction. + */ + if (!is_index_unique && rgi && rgi->is_row_event_execution()) { + if (is_other_prepared && !other_thd) + { + rgi->exec_flags |= 1 << rpl_group_info::HIT_BUSY_INDEX; + (*flagged)++; + } + } + if (!other_thd) return 0; binlog_report_wait_for(thd, other_thd); - rgi= thd->rgi_slave; - other_rgi= other_thd->rgi_slave; if (!rgi || !other_rgi) return 0; if (!rgi->is_parallel_exec) diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 7b7e7ef3e46..2aeb4774631 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -61,7 +61,10 @@ my_bool innodb_deadlock_detect; ulong innodb_deadlock_report; #ifdef HAVE_REPLICATION -extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd); +extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd, + bool is_prepared= false, + bool is_index_uniq= false, + uint *flagged= NULL); extern "C" int thd_need_wait_reports(const MYSQL_THD thd); extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); @@ -1874,10 +1877,10 @@ ATTRIBUTE_NOINLINE MY_ATTRIBUTE((nonnull, warn_unused_result)) trx->error_state= DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set when lock_sys.wait_mutex was unlocked. @param trx transaction that may be waiting for a lock -@param wait_lock lock that is being waited for +@param wait_timeout either remains or can be zeroed @return lock being waited for (may have been replaced by an equivalent one) @retval nullptr if no lock is being waited for */ -static lock_t *lock_wait_rpl_report(trx_t *trx) +static lock_t *lock_wait_rpl_report(trx_t *trx, ulong& wait_timeout) { mysql_mutex_assert_owner(&lock_sys.wait_mutex); ut_ad(trx->state == TRX_STATE_ACTIVE); @@ -1886,6 +1889,7 @@ static lock_t *lock_wait_rpl_report(trx_t *trx) lock_t *wait_lock= trx->lock.wait_lock; if (!wait_lock) return nullptr; + uint count_non_unique_index_hit= 0; /* This would likely be too large to attempt to use a memory transaction, even for wait_lock->is_table(). */ const bool nowait= lock_sys.wr_lock_try(); @@ -1903,6 +1907,8 @@ static lock_t *lock_wait_rpl_report(trx_t *trx) lock_sys.wait_mutex was unlocked, let's check it. */ if (!nowait && trx->lock.was_chosen_as_deadlock_victim) trx->error_state= DB_DEADLOCK; + if (count_non_unique_index_hit > 0) + wait_timeout= 0; return wait_lock; } ut_ad(wait_lock->is_waiting()); @@ -1940,7 +1946,11 @@ static lock_t *lock_wait_rpl_report(trx_t *trx) lock= lock_rec_get_next(heap_no, lock); do if (lock->trx->mysql_thd != thd) - thd_rpl_deadlock_check(thd, lock->trx->mysql_thd); + thd_rpl_deadlock_check(thd, lock->trx->mysql_thd, + lock->trx->state == TRX_STATE_PREPARED, + (lock->index->is_unique() && + !lock->index->n_nullable), + &count_non_unique_index_hit); while ((lock= lock_rec_get_next(heap_no, lock))); } } @@ -1971,7 +1981,7 @@ dberr_t lock_wait(que_thr_t *thr) /* InnoDB system transactions may use the global value of innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */ - const ulong innodb_lock_wait_timeout= trx_lock_wait_timeout_get(trx); + ulong innodb_lock_wait_timeout= trx_lock_wait_timeout_get(trx); const my_hrtime_t suspend_time= my_hrtime_coarse(); ut_ad(!trx->dict_operation_lock_mode); @@ -2034,7 +2044,6 @@ dberr_t lock_wait(que_thr_t *thr) const bool row_lock_wait= thr->lock_state == QUE_THR_LOCK_ROW; timespec abstime; set_timespec_time_nsec(abstime, suspend_time.val * 1000); - abstime.MY_tv_sec+= innodb_lock_wait_timeout; /* Dictionary transactions must wait be immune to lock wait timeouts for locks on data dictionary tables. Here we check only for SYS_TABLES, SYS_COLUMNS, SYS_INDEXES, SYS_FIELDS. Locks on further @@ -2094,10 +2103,15 @@ dberr_t lock_wait(que_thr_t *thr) lock_sys.wait_start(); #ifdef HAVE_REPLICATION + /* + innodb_lock_wait_timeout can be zeroed in a specific case of non-unique + index locking by slave applier. When that happens no actual waiting for + wait_lock is done before the server layer has received the timeout error. + */ if (rpl) - wait_lock= lock_wait_rpl_report(trx); + wait_lock= lock_wait_rpl_report(trx, innodb_lock_wait_timeout); #endif - + abstime.MY_tv_sec+= innodb_lock_wait_timeout; switch (trx->error_state) { case DB_SUCCESS: break; -- 2.39.2
participants (1)
-
Kristian Nielsen