revision-id: e4d7dbc47038638ed3d199c3d959ed5ead84f353 (v5.8-1025-ge4d7dbc47) parent(s): e3e66e6e0af211cb2b963e4cd9c0539b746e0b8d author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2019-01-14 21:42:02 +0300 message: Make range lock escalation update transaction's list of owned locks --- utilities/transactions/transaction_lock_mgr.cc | 142 +++++++++++++++++-------- utilities/transactions/transaction_lock_mgr.h | 2 +- 2 files changed, 98 insertions(+), 46 deletions(-) diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 395bcbaaf..ed382cefd 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -327,26 +327,36 @@ void RangeLockMgr::KillLockWait(void *cdata) We store them in toku::range_buffer because toku::locktree::release_locks() accepts that as an argument. - TODO: lock escalation in the lock table should affect this structure, too? + Note: the list of locks may differ slighly from the contents of the lock + tree, due to concurrency between lock acquisition, lock release, and lock + escalation. See MDEV-18227 and RangeLockMgr::UnLockAll for details. + This property is currently harmless. */ class RangeLockList: public PessimisticTransaction::LockStorage { public: - virtual ~RangeLockList() - { + virtual ~RangeLockList() { buffer.destroy(); } - RangeLockList() - { + RangeLockList() : releasing_locks(false) { buffer.create(); } void append(const DBT *left_key, const DBT *right_key) { + MutexLock l(&mutex_); + // there's only one thread that calls this function. + // the same thread will do lock release. + assert(!releasing_locks); buffer.append(left_key, right_key); } + /* Ranges that we are holding the locks on. */ toku::range_buffer buffer; + + /* Synchronization. See RangeLockMgr::UnLockAll for details */ + port::Mutex mutex_; + bool releasing_locks; }; // Get a range lock on [start_key; end_key] range @@ -364,8 +374,8 @@ Status RangeLockMgr::TryRangeLock(PessimisticTransaction* txn, toku_fill_dbt(&start_key_dbt, start_key.data(), start_key.size()); toku_fill_dbt(&end_key_dbt, end_key.data(), end_key.size()); - request.set(lt, txn->GetID(), &start_key_dbt, &end_key_dbt, toku::lock_request::WRITE, - false /* not a big txn */, (void*)txn->GetID() /*client_extra*/); + request.set(lt, (TXNID)txn, &start_key_dbt, &end_key_dbt, toku::lock_request::WRITE, + false /* not a big txn */, (void*)txn->GetID()/*client_extra, for KILL*/); uint64_t killed_time_msec = 0; // TODO: what should this have? uint64_t wait_time_msec = txn->GetLockTimeout(); @@ -806,7 +816,7 @@ range_lock_mgr_release_lock_int(toku::locktree *lt, toku::range_buffer range_buf; range_buf.create(); range_buf.append(&key_dbt, &key_dbt); - lt->release_locks(txn->GetID(), &range_buf); + lt->release_locks((TXNID)txn, &range_buf); range_buf.destroy(); } @@ -836,21 +846,6 @@ void RangeLockMgr::UnLock(const PessimisticTransaction* txn, void RangeLockMgr::UnLockAll(const PessimisticTransaction* txn, const TransactionKeyMap* key_map, Env* env) { -#if 0 - //TODO: collecting multiple locks into a buffer and then making one call - // to lock_tree::release_locks() will be faster. - for (auto& key_map_iter : *key_map) { - uint32_t column_family_id = key_map_iter.first; - auto& keys = key_map_iter.second; - - for (auto& key_iter : keys) { - const std::string& key = key_iter.first; - range_lock_mgr_release_lock_int(lt, txn, column_family_id, key, true); - } - } - - toku::lock_request::retry_all_lock_requests(lt, nullptr /* lock_wait_needed_callback */); -#endif // owned_locks may hold nullptr if the transaction has never acquired any // locks. @@ -858,33 +853,49 @@ void RangeLockMgr::UnLockAll(const PessimisticTransaction* txn, { RangeLockList* range_list= (RangeLockList*)txn->owned_locks.get(); + { + MutexLock l(&range_list->mutex_); + /* + The lt->release_locks() call below will walk range_list->buffer. We + need to prevent lock escalation callback from replacing + range_list->buffer while we are doing that. + + Additional complication here is internal mutex(es) in the locktree + (let's call them latches): + - Lock escalation first obtains latches on the lock tree + - Then, it calls RangeLockMgr::on_escalate to replace transaction's + range_list->buffer. + = Access to that buffer must be synchronized, so it will want to + acquire the range_list->mutex_. + + While in this function we would want to do the reverse: + - Acquire range_list->mutex_ to prevent access to the range_list. + - Then, lt->release_locks() call will walk through the range_list + - and acquire latches on parts of the lock tree to remove locks from + it. + + How do we avoid the deadlock? Thei ideas is that here we set + releasing_locks=true, and release the mutex. + All other users of the range_list must: + - Acquire the mutex, then check that releasing_locks=false. + (the code in this function doesnt do that as there's only one thread + that does lock release) + */ + range_list->releasing_locks= true; + } + // Don't try to call release_locks() if the buffer is empty! if we are - // not holding any locks, other transaction may be in STO-mode currently - // and our attempt to release an empty set of locks will cause an - // assertion failure. + // not holding any locks, the lock tree might be on STO-mode with another + // transaction, and our attempt to release an empty set of locks will + // cause an assertion failure. if (range_list->buffer.get_num_ranges()) - lt->release_locks(txn->GetID(), &range_list->buffer, true); + lt->release_locks((TXNID)txn, &range_list->buffer, true); range_list->buffer.destroy(); range_list->buffer.create(); - toku::lock_request::retry_all_lock_requests(lt, nullptr /* lock_wait_needed_callback */); - } + range_list->releasing_locks= false; -#if 0 - Original usage: - void release_locks(TXNID txnid, const range_buffer *ranges); - - // release all of the locks this txn has ever successfully - // acquired and stored in the range buffer for this locktree - lt->release_locks(txnid, ranges->buffer); - lt->get_manager()->note_mem_released(ranges->buffer->total_memory_size()); - ranges->buffer->destroy(); - toku_free(ranges->buffer); - - // all of our locks have been released, so first try to wake up - // pending lock requests, then release our reference on the lt toku::lock_request::retry_all_lock_requests(lt, nullptr /* lock_wait_needed_callback */); - -#endif + } } int RangeLockMgr::compare_dbt_endpoints(__toku_db*, void *arg, @@ -903,6 +914,46 @@ RangeLockMgr::RangeLockMgr(TransactionDB* txn_db) : my_txn_db(txn_db) lt= nullptr; } + +/* + @brief Lock Escalation Callback function + + @param txnid Transaction whose locks got escalated + @param lt Lock Tree where escalation is happening (currently there is only one) + @param buffer Escalation result: list of locks that this transaction now + owns in this lock tree. + @param extra Callback context +*/ + +void RangeLockMgr::on_escalate(TXNID txnid, const locktree *lt, + const range_buffer &buffer, void *extra) +{ + auto txn= (PessimisticTransaction*)txnid; + + RangeLockList* trx_locks= (RangeLockList*)txn->owned_locks.get(); + + MutexLock l(&trx_locks->mutex_); + if (trx_locks->releasing_locks) { + /* + Do nothing. The transaction is releasing its locks, so it will not care + about having a correct list of ranges. (In TokuDB, + toku_db_txn_escalate_callback() makes use of this property, too) + */ + return; + } + + // TODO: are we tracking this mem: lt->get_manager()->note_mem_released(trx_locks->ranges.buffer->total_memory_size()); + trx_locks->buffer.destroy(); + trx_locks->buffer.create(); + toku::range_buffer::iterator iter(&buffer); + toku::range_buffer::iterator::record rec; + while (iter.current(&rec)) { + trx_locks->buffer.append(rec.get_left_key(), rec.get_right_key()); + iter.next(); + } + // TODO: same as above: lt->get_manager()->note_mem_used(ranges.buffer->total_memory_size()); +} + void RangeLockMgr::set_endpoint_cmp_functions(convert_key_to_endpoint_func cvt_func, compare_endpoints_func cmp_func) { @@ -997,7 +1048,7 @@ struct LOCK_PRINT_CONTEXT { static void push_into_lock_status_data(void* param, const DBT *left, - const DBT *right, TXNID txnid) + const DBT *right, TXNID txnid_arg) { struct LOCK_PRINT_CONTEXT *ctx= (LOCK_PRINT_CONTEXT*)param; struct KeyLockInfo info; @@ -1013,6 +1064,7 @@ void push_into_lock_status_data(void* param, const DBT *left, info.key2.append((const char*)right->data, right->size); } + TXNID txnid= ((PessimisticTransaction*)txnid_arg)->GetID(); info.ids.push_back(txnid); ctx->data->insert({ctx->cfh_id, info}); } diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h index 59234c26f..8777983e4 100644 --- a/utilities/transactions/transaction_lock_mgr.h +++ b/utilities/transactions/transaction_lock_mgr.h @@ -256,7 +256,7 @@ class RangeLockMgr : static int on_create(locktree *lt, void *extra) { return 0; /* no error */ } static void on_destroy(locktree *lt) {} static void on_escalate(TXNID txnid, const locktree *lt, - const range_buffer &buffer, void *extra) {} + const range_buffer &buffer, void *extra); };