[Commits] 2f0ee8975: Initial support for shared point locks.
revision-id: 2f0ee897552bb4a8aa66b933c0d6f8529a82e2e8 (v5.8-1042-g2f0ee8975)
parent(s): 1b423e3954d932c4624337308e3e1cd98481a495
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-05-13 17:09:36 +0300
message:

Initial support for shared point locks.

- Locks can now be shared.
- Sharing only supported as long as the locked ranges are an exact match.
  If this requirement is not met, the locks behave as exclusive locks.
- TODO Lock escalation is not supported yet (the intent is to not escalate
  the shared locks at all; but even this is not yet implemented, current
  code will exhibit incorrect behaviour if escalation hits a shared lock)

---
 utilities/transactions/range_locking/db.h            |   3 +
 .../range_locking/locktree/concurrent_tree.cc        |  17 ++-
 .../range_locking/locktree/concurrent_tree.h         |  18 ++-
 .../range_locking/locktree/locktree.cc               | 166 +++++++++++++++++----
 .../transactions/range_locking/locktree/locktree.h   |  19 ++-
 .../range_locking/locktree/range_buffer.cc           |  20 ++-
 .../range_locking/locktree/range_buffer.h            |  15 +-
 .../range_locking/locktree/treenode.cc               |  74 ++++++---
 .../transactions/range_locking/locktree/treenode.h   |  41 ++++-
 .../range_locking/portability/txn_subst.h            |  16 ++
 utilities/transactions/transaction_lock_mgr.cc       |  24 ++-
 11 files changed, 334 insertions(+), 79 deletions(-)

diff --git a/utilities/transactions/range_locking/db.h b/utilities/transactions/range_locking/db.h
index f9349c6ae..64ea28345 100644
--- a/utilities/transactions/range_locking/db.h
+++ b/utilities/transactions/range_locking/db.h
@@ -6,6 +6,8 @@
 typedef struct __toku_db DB;
 typedef struct __toku_dbt DBT;
+
+// port: this is currently not used
 struct simple_dbt {
   uint32_t len;
   void *data;
@@ -72,6 +74,7 @@ struct __toku_dbt {
   void*data;
   uint32_t size;
   uint32_t ulen;
+  // One of DB_DBT_XXX flags
   uint32_t flags;
 };
 typedef struct __toku_descriptor {
diff --git a/utilities/transactions/range_locking/locktree/concurrent_tree.cc b/utilities/transactions/range_locking/locktree/concurrent_tree.cc
index a35a9e40b..74d65f710 100644
--- a/utilities/transactions/range_locking/locktree/concurrent_tree.cc
+++ b/utilities/transactions/range_locking/locktree/concurrent_tree.cc
@@ -97,6 +97,12 @@ void concurrent_tree::locked_keyrange::acquire(const keyrange &range) {
     m_subtree = subtree;
 }
 
+void concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range,
+                                                        TXNID new_owner)
+{
+    m_subtree->insert(range, new_owner, /*is_shared*/ true);
+}
+
 void concurrent_tree::locked_keyrange::release(void) {
     m_subtree->mutex_unlock();
 }
@@ -110,18 +116,19 @@ void concurrent_tree::locked_keyrange::iterate(F *function) const {
     }
 }
 
-void concurrent_tree::locked_keyrange::insert(const keyrange &range, TXNID txnid) {
+void concurrent_tree::locked_keyrange::insert(const keyrange &range,
+                                              TXNID txnid, bool is_shared) {
     // empty means no children, and only the root should ever be empty
     if (m_subtree->is_empty()) {
-        m_subtree->set_range_and_txnid(range, txnid);
+        m_subtree->set_range_and_txnid(range, txnid, is_shared);
    } else {
-        m_subtree->insert(range, txnid);
+        m_subtree->insert(range, txnid, is_shared);
    }
 }
 
-void concurrent_tree::locked_keyrange::remove(const keyrange &range) {
+void concurrent_tree::locked_keyrange::remove(const keyrange &range, TXNID txnid) {
     invariant(!m_subtree->is_empty());
-    treenode *new_subtree = m_subtree->remove(range);
+    treenode *new_subtree = m_subtree->remove(range, txnid);
 
     // if removing range changed the root of the subtree,
     // then the subtree must be the root of the entire tree.
if (new_subtree == nullptr) { diff --git a/utilities/transactions/range_locking/locktree/concurrent_tree.h b/utilities/transactions/range_locking/locktree/concurrent_tree.h index 66a7ff176..fabda7294 100644 --- a/utilities/transactions/range_locking/locktree/concurrent_tree.h +++ b/utilities/transactions/range_locking/locktree/concurrent_tree.h @@ -106,15 +106,25 @@ public: template <class F> void iterate(F *function) const; + // Adds another owner to the lock on the specified keyrange. + // requires: the keyrange contains one treenode whose bounds are + // exactly equal to the specifed range (no sub/supersets) + void add_shared_owner(const keyrange &range, TXNID new_owner); + // inserts the given range into the tree, with an associated txnid. // requires: range does not overlap with anything in this locked_keyrange // rationale: caller is responsible for only inserting unique ranges - void insert(const keyrange &range, TXNID txnid); - - // effect: removes the given range from the tree + void insert(const keyrange &range, TXNID txnid, bool is_shared); + + // effect: removes the given range from the tree. + // - txnid=TXNID_ANY means remove the range no matter what its + // owners are + // - Other value means remove the specified txnid from + // ownership (if the range has other owners, it will remain + // in the tree) // requires: range exists exactly in this locked_keyrange // rationale: caller is responsible for only removing existing ranges - void remove(const keyrange &range); + void remove(const keyrange &range, TXNID txnid); // effect: removes all of the keys represented by this locked keyrange // rationale: we'd like a fast way to empty out a tree diff --git a/utilities/transactions/range_locking/locktree/locktree.cc b/utilities/transactions/range_locking/locktree/locktree.cc index 9b530c7b0..0e5f7c307 100644 --- a/utilities/transactions/range_locking/locktree/locktree.cc +++ b/utilities/transactions/range_locking/locktree/locktree.cc @@ -147,6 +147,8 @@ uint32_t locktree::get_reference_count(void) { struct row_lock { keyrange range; TXNID txnid; + bool is_shared; + TxnidVector *owners; }; // iterate over a locked keyrange and copy out all of the data, @@ -157,8 +159,10 @@ static void iterate_and_get_overlapping_row_locks(const concurrent_tree::locked_ GrowableArray<row_lock> *row_locks) { struct copy_fn_obj { GrowableArray<row_lock> *row_locks; - bool fn(const keyrange &range, TXNID txnid) { - row_lock lock = { .range = range, .txnid = txnid }; + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + row_lock lock = { .range = range, .txnid = txnid, + .is_shared = is_shared, .owners = owners}; row_locks->push(lock); return true; } @@ -196,9 +200,10 @@ static uint64_t row_lock_size_in_tree(const row_lock &lock) { // remove and destroy the given row lock from the locked keyrange, // then notify the memory tracker of the newly freed lock. 
static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr, - const row_lock &lock, locktree_manager *mgr) { + const row_lock &lock, TXNID txnid, + locktree_manager *mgr) { const uint64_t mem_released = row_lock_size_in_tree(lock); - lkr->remove(lock.range); + lkr->remove(lock.range, txnid); if (mgr != nullptr) { mgr->note_mem_released(mem_released); } @@ -209,7 +214,7 @@ static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr, static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr, const row_lock &lock, locktree_manager *mgr) { uint64_t mem_used = row_lock_size_in_tree(lock); - lkr->insert(lock.range, lock.txnid); + lkr->insert(lock.range, lock.txnid, lock.is_shared); if (mgr != nullptr) { mgr->note_mem_used(mem_used); } @@ -221,13 +226,17 @@ void locktree::sto_begin(TXNID txnid) { m_sto_txnid = txnid; } -void locktree::sto_append(const DBT *left_key, const DBT *right_key) { +void locktree::sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { uint64_t buffer_mem, delta; + + // psergey: the below two lines do not make any sense + // (and it's the same in upstream TokuDB) keyrange range; range.create(left_key, right_key); buffer_mem = m_sto_buffer.total_memory_size(); - m_sto_buffer.append(left_key, right_key); + m_sto_buffer.append(left_key, right_key, is_write_request); delta = m_sto_buffer.total_memory_size() - buffer_mem; if (m_mgr != nullptr) { m_mgr->note_mem_used(delta); @@ -274,8 +283,10 @@ void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) { range_buffer::iterator::record rec; while (iter.current(&rec)) { sto_lkr.prepare(&sto_rangetree); - int r = acquire_lock_consolidated(&sto_lkr, - m_sto_txnid, rec.get_left_key(), rec.get_right_key(), nullptr); + int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid, + rec.get_left_key(), + rec.get_right_key(), + rec.get_exclusive_flag(), nullptr); invariant_zero(r); sto_lkr.release(); iter.next(); @@ -285,8 +296,10 @@ void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) { // locktree's rangetree, on behalf of the old single txnid. struct migrate_fn_obj { concurrent_tree::locked_keyrange *dst_lkr; - bool fn(const keyrange &range, TXNID txnid) { - dst_lkr->insert(range, txnid); + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + assert(owners == nullptr); + dst_lkr->insert(range, txnid, is_shared); return true; } } migrate_fn; @@ -301,7 +314,8 @@ void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) { bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid, - const DBT *left_key, const DBT *right_key) { + const DBT *left_key, const DBT *right_key, + bool is_write_request) { if (m_rangetree->is_empty() && m_sto_buffer.is_empty() && toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) { // We can do the optimization because the rangetree is empty, and // we know its worth trying because the sto score is big enough. @@ -319,7 +333,7 @@ bool locktree::sto_try_acquire(void *prepared_lkr, // this txnid can append its lock to the sto buffer successfully. 
if (m_sto_txnid != TXNID_NONE) { invariant(m_sto_txnid == txnid); - sto_append(left_key, right_key); + sto_append(left_key, right_key, is_write_request); return true; } else { invariant(m_sto_buffer.is_empty()); @@ -327,12 +341,66 @@ bool locktree::sto_try_acquire(void *prepared_lkr, } } + +/* + Do the same as iterate_and_get_overlapping_row_locks does, but also check for + this: + The set of overlapping rows locks consists of just one read-only shared + lock with the same endpoints as specified (in that case, we can just add + ourselves into that list) + + @return true - One compatible shared lock + false - Otherwise +*/ +static +bool iterate_and_get_overlapping_row_locks2(const concurrent_tree::locked_keyrange *lkr, + const DBT *left_key, const DBT *right_key, + comparator *cmp, + TXNID txnid, + GrowableArray<row_lock> *row_locks) { + struct copy_fn_obj { + GrowableArray<row_lock> *row_locks; + bool first_call= true; + bool matching_lock_found = false; + const DBT *left_key, *right_key; + comparator *cmp; + + bool fn(const keyrange &range, TXNID txnid, bool is_shared, + TxnidVector *owners) { + + if (first_call) { + first_call = false; + if (is_shared && + !(*cmp)(left_key, range.get_left_key()) && + !(*cmp)(right_key, range.get_right_key())) { + matching_lock_found = true; + } + } else { + // if we see multiple matching locks, it doesn't matter whether + // the first one was matching. + matching_lock_found = false; + } + row_lock lock = { .range = range, .txnid = txnid, + .is_shared = is_shared, .owners = owners }; + row_locks->push(lock); + return true; + } + } copy_fn; + copy_fn.row_locks = row_locks; + copy_fn.left_key = left_key; + copy_fn.right_key = right_key; + copy_fn.cmp = cmp; + lkr->iterate(©_fn); + return copy_fn.matching_lock_found; +} + // try to acquire a lock and consolidate it with existing locks if possible // param: lkr, a prepared locked keyrange // return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist. int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, const DBT *left_key, const DBT *right_key, + bool is_write_request, txnid_set *conflicts) { int r = 0; concurrent_tree::locked_keyrange *lkr; @@ -345,24 +413,60 @@ int locktree::acquire_lock_consolidated(void *prepared_lkr, // copy out the set of overlapping row locks. GrowableArray<row_lock> overlapping_row_locks; overlapping_row_locks.init(); - iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks); + bool matching_shared_lock_found= false; + + if (is_write_request) + iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks); + else { + matching_shared_lock_found= + iterate_and_get_overlapping_row_locks2(lkr, left_key, right_key, &m_cmp, + txnid, &overlapping_row_locks); + // psergey-todo: what to do now? So, we have figured we have just one + // shareable lock. Need to add us into it as an owner but the lock + // pointer cannot be kept? + // A: use find_node_with_overlapping_child(key_range, nullptr); + // then, add ourselves to the owner list. + // Dont' foreget to release the subtree after that. + } + + if (matching_shared_lock_found) { + // there is just one non-confliting matching shared lock. + // we are hilding a lock on it (see acquire() call above). + // we need to modify it to indicate there is another locker... + lkr->add_shared_owner(requested_range, txnid); + + // Pretend shared lock uses as much memory. 
+ row_lock new_lock = { .range = requested_range, .txnid = txnid, + .is_shared = false, .owners = nullptr }; + uint64_t mem_used = row_lock_size_in_tree(new_lock); + if (m_mgr) { + m_mgr->note_mem_used(mem_used); + } + return 0; + } + + size_t num_overlapping_row_locks = overlapping_row_locks.get_size(); // if any overlapping row locks conflict with this request, bail out. + bool conflicts_exist = determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts); if (!conflicts_exist) { // there are no conflicts, so all of the overlaps are for the requesting txnid. // so, we must consolidate all existing overlapping ranges and the requested // range into one dominating range. then we insert the dominating range. + bool all_shared = !is_write_request; for (size_t i = 0; i < num_overlapping_row_locks; i++) { row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i); invariant(overlapping_lock.txnid == txnid); requested_range.extend(m_cmp, overlapping_lock.range); - remove_row_lock_from_tree(lkr, overlapping_lock, m_mgr); + remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr); + all_shared = all_shared && overlapping_lock.is_shared; } - row_lock new_lock = { .range = requested_range, .txnid = txnid }; + row_lock new_lock = { .range = requested_range, .txnid = txnid, + .is_shared = all_shared, .owners = nullptr }; insert_row_lock_into_tree(lkr, new_lock, m_mgr); } else { r = DB_LOCK_NOTGRANTED; @@ -383,7 +487,7 @@ int locktree::acquire_lock(bool is_write_request, int r = 0; // we are only supporting write locks for simplicity - invariant(is_write_request); + //invariant(is_write_request); // acquire and prepare a locked keyrange over the requested range. // prepare is a serialzation point, so we take the opportunity to @@ -391,9 +495,11 @@ int locktree::acquire_lock(bool is_write_request, concurrent_tree::locked_keyrange lkr; lkr.prepare(m_rangetree); - bool acquired = sto_try_acquire(&lkr, txnid, left_key, right_key); + bool acquired = sto_try_acquire(&lkr, txnid, left_key, right_key, + is_write_request); if (!acquired) { - r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key, conflicts); + r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key, + is_write_request, conflicts); } lkr.release(); @@ -418,7 +524,7 @@ int locktree::try_acquire_lock(bool is_write_request, // the locktree silently upgrades read locks to write locks for simplicity int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn) { - return acquire_write_lock(txnid, left_key, right_key, conflicts, big_txn); + return try_acquire_lock(false, txnid, left_key, right_key, conflicts, big_txn); } int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, @@ -447,7 +553,9 @@ void locktree::dump_locks(void *cdata, dump_callback cb) (*cb)(cdata, lock.range.get_left_key(), lock.range.get_right_key(), - lock.txnid); + lock.txnid, + lock.is_shared, + lock.owners); } lkr.release(); all_locks.deinit(); @@ -525,8 +633,11 @@ void locktree::remove_overlapping_locks_for_txnid(TXNID txnid, row_lock lock = overlapping_row_locks.fetch_unchecked(i); // If this isn't our lock, that's ok, just don't remove it. // See rationale above. - if (lock.txnid == txnid) { - remove_row_lock_from_tree(&lkr, lock, m_mgr); + // psergey-todo: for shared locks, just remove ourselves from the + // owners. 
+ if (lock.txnid == txnid || + (lock.owners && lock.owners->contains(txnid))) { + remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr); } } @@ -630,7 +741,8 @@ static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr, int num_extracted; int num_to_extract; row_lock *row_locks; - bool fn(const keyrange &range, TXNID txnid) { + bool fn(const keyrange &range, TXNID txnid, bool is_shared, TxnidVector *owners) { + // psergey-todo: multiple owners! if (num_extracted < num_to_extract) { row_lock lock; lock.range.create_copy(range); @@ -655,7 +767,7 @@ static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr, int num_extracted = extract_fn.num_extracted; invariant(num_extracted <= num_to_extract); for (int i = 0; i < num_extracted; i++) { - remove_row_lock_from_tree(lkr, row_locks[i], mgr); + remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr); } return num_extracted; @@ -781,7 +893,9 @@ void locktree::escalate(lt_escalate_cb after_escalate_callback, void *after_esca while (iter.current(&rec)) { keyrange range; range.create(rec.get_left_key(), rec.get_right_key()); - row_lock lock = { .range = range, .txnid = current_txnid }; + row_lock lock = { .range = range, .txnid = current_txnid, + .is_shared= false, // psergey-todo: SharedLockEscalation + .owners= nullptr }; insert_row_lock_into_tree(&lkr, lock, m_mgr); iter.next(); } diff --git a/utilities/transactions/range_locking/locktree/locktree.h b/utilities/transactions/range_locking/locktree/locktree.h index 5ff4f7449..e7c909be0 100644 --- a/utilities/transactions/range_locking/locktree/locktree.h +++ b/utilities/transactions/range_locking/locktree/locktree.h @@ -339,7 +339,10 @@ namespace toku { // since the lock_request object is opaque struct lt_lock_request_info *get_lock_request_info(void); - typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, TXNID txnid); + typedef void (*dump_callback)(void *cdata, + const DBT *left, const DBT *right, + TXNID txnid, bool is_shared, + TxnidVector *owners); void dump_locks(void *cdata, dump_callback cb); private: locktree_manager *m_mgr; @@ -360,6 +363,12 @@ namespace toku { void *m_userdata; struct lt_lock_request_info m_lock_request_info; + // psergey-todo: + // Each transaction also keeps a list of ranges it has locked. + // So, when a transaction is running in STO mode, two identical + // lists are kept: the STO lock list and transaction's owned locks + // list. Why can't we do with just one list? + // The following fields and members prefixed with "sto_" are for // the single txnid optimization, intended to speed up the case // when only one transaction is using the locktree. If we know @@ -453,7 +462,8 @@ namespace toku { // effect: append a range to the sto buffer // requires: m_sto_txnid is valid - void sto_append(const DBT *left_key, const DBT *right_key); + void sto_append(const DBT *left_key, const DBT *right_key, + bool is_write_request); // effect: ends the single txnid optimization, releaseing any memory // stored in the sto buffer, notifying the tracker, and @@ -494,7 +504,8 @@ namespace toku { // back to zero. // returns: true if the lock was acquired for this txnid bool sto_try_acquire(void *prepared_lkr, TXNID txnid, - const DBT *left_key, const DBT *right_key); + const DBT *left_key, const DBT *right_key, + bool is_write_request); // Effect: // Provides a hook for a helgrind suppression. 
@@ -513,7 +524,7 @@ namespace toku { int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid, const DBT *left_key, const DBT *right_key, - txnid_set *conflicts); + bool is_write_request, txnid_set *conflicts); int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key, const DBT *right_key, diff --git a/utilities/transactions/range_locking/locktree/range_buffer.cc b/utilities/transactions/range_locking/locktree/range_buffer.cc index d1f14fc4a..eab374945 100644 --- a/utilities/transactions/range_locking/locktree/range_buffer.cc +++ b/utilities/transactions/range_locking/locktree/range_buffer.cc @@ -66,7 +66,9 @@ namespace toku { return right_neg_inf || right_pos_inf; } - void range_buffer::record_header::init(const DBT *left_key, const DBT *right_key) { + void range_buffer::record_header::init(const DBT *left_key, const DBT *right_key, + bool is_exclusive) { + is_exclusive_lock= is_exclusive; left_neg_inf = left_key == toku_dbt_negative_infinity(); left_pos_inf = left_key == toku_dbt_positive_infinity(); left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size; @@ -186,15 +188,16 @@ namespace toku { _num_ranges = 0; } - void range_buffer::append(const DBT *left_key, const DBT *right_key) { + void range_buffer::append(const DBT *left_key, const DBT *right_key, + bool is_write_request) { // if the keys are equal, then only one copy is stored. if (toku_dbt_equals(left_key, right_key)) { invariant(left_key->size <= MAX_KEY_SIZE); - append_point(left_key); + append_point(left_key, is_write_request); } else { invariant(left_key->size <= MAX_KEY_SIZE); invariant(right_key->size <= MAX_KEY_SIZE); - append_range(left_key, right_key); + append_range(left_key, right_key, is_write_request); } _num_ranges++; } @@ -215,12 +218,13 @@ namespace toku { _arena.destroy(); } - void range_buffer::append_range(const DBT *left_key, const DBT *right_key) { + void range_buffer::append_range(const DBT *left_key, const DBT *right_key, + bool is_exclusive) { size_t record_length = sizeof(record_header) + left_key->size + right_key->size; char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length)); record_header h; - h.init(left_key, right_key); + h.init(left_key, right_key, is_exclusive); // serialize the header memcpy(buf, &h, sizeof(record_header)); @@ -238,12 +242,12 @@ namespace toku { } } - void range_buffer::append_point(const DBT *key) { + void range_buffer::append_point(const DBT *key, bool is_exclusive) { size_t record_length = sizeof(record_header) + key->size; char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length)); record_header h; - h.init(key, nullptr); + h.init(key, nullptr, is_exclusive); // serialize the header memcpy(buf, &h, sizeof(record_header)); diff --git a/utilities/transactions/range_locking/locktree/range_buffer.h b/utilities/transactions/range_locking/locktree/range_buffer.h index 9bc02dc22..e8869fae5 100644 --- a/utilities/transactions/range_locking/locktree/range_buffer.h +++ b/utilities/transactions/range_locking/locktree/range_buffer.h @@ -77,12 +77,14 @@ namespace toku { bool right_neg_inf; uint16_t left_key_size; uint16_t right_key_size; + bool is_exclusive_lock; bool left_is_infinite(void) const; bool right_is_infinite(void) const; - void init(const DBT *left_key, const DBT *right_key); + void init(const DBT *left_key, const DBT *right_key, + bool is_exclusive); }; // PORT static_assert(sizeof(record_header) == 8, "record header format is off"); @@ -109,6 +111,10 @@ namespace toku { // how big is this record? 
this tells us where the next record is size_t size(void) const; + bool get_exclusive_flag() const { + return _header.is_exclusive_lock; + } + // populate a record header and point our DBT's // buffers into ours if they are not infinite. void deserialize(const char *buf); @@ -145,7 +151,8 @@ namespace toku { // append a left/right key range to the buffer. // if the keys are equal, then only one copy is stored. - void append(const DBT *left_key, const DBT *right_key); + void append(const DBT *left_key, const DBT *right_key, + bool is_write_request=false); // is this range buffer empty? bool is_empty(void) const; @@ -162,11 +169,11 @@ namespace toku { memarena _arena; int _num_ranges; - void append_range(const DBT *left_key, const DBT *right_key); + void append_range(const DBT *left_key, const DBT *right_key, bool is_write_request); // append a point to the buffer. this is the space/time saving // optimization for key ranges where left == right. - void append_point(const DBT *key); + void append_point(const DBT *key, bool is_write_request); }; } /* namespace toku */ diff --git a/utilities/transactions/range_locking/locktree/treenode.cc b/utilities/transactions/range_locking/locktree/treenode.cc index 051ec7d1c..f44918a1b 100644 --- a/utilities/transactions/range_locking/locktree/treenode.cc +++ b/utilities/transactions/range_locking/locktree/treenode.cc @@ -64,6 +64,10 @@ void treenode::init(const comparator *cmp) { m_is_root = false; m_is_empty = true; m_cmp = cmp; + + m_is_shared= false; + m_owners= nullptr; + // use an adaptive mutex at each node since we expect the time the // lock is held to be relatively short compared to a context switch. // indeed, this improves performance at high thread counts considerably. @@ -89,10 +93,11 @@ void treenode::destroy_root(void) { m_cmp = nullptr; } -void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid) { +void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared) { // allocates a new copy of the range for this node m_range.create_copy(range); m_txnid = txnid; + m_is_shared= is_shared; m_is_empty = false; } @@ -108,10 +113,11 @@ bool treenode::range_overlaps(const keyrange &range) { return m_range.overlaps(*m_cmp, range); } -treenode *treenode::alloc(const comparator *cmp, const keyrange &range, TXNID txnid) { +treenode *treenode::alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared) { treenode *XCALLOC(node); node->init(cmp); - node->set_range_and_txnid(range, txnid); + node->set_range_and_txnid(range, txnid, is_shared); return node; } @@ -122,12 +128,21 @@ void treenode::swap_in_place(treenode *node1, treenode *node2) { node1->m_txnid = node2->m_txnid; node2->m_range = tmp_range; node2->m_txnid = tmp_txnid; + + bool tmp_is_shared= node1->m_is_shared; + node1->m_is_shared= node2->m_is_shared; + node2->m_is_shared= tmp_is_shared; } void treenode::free(treenode *node) { // destroy the range, freeing any copied keys node->m_range.destroy(); + if (node->m_owners) { + delete node->m_owners; + node->m_owners = nullptr; // need this? + } + // the root is simply marked as empty. if (node->is_root()) { // PORT toku_mutex_assert_locked(&node->m_mutex); @@ -189,7 +204,7 @@ void treenode::traverse_overlaps(const keyrange &range, F *function) { if (c == keyrange::comparison::EQUALS) { // Doesn't matter if fn wants to keep going, there // is nothing left, so return. 
- function->fn(m_range, m_txnid); + function->fn(m_range, m_txnid, m_is_shared, m_owners); return; } @@ -204,7 +219,7 @@ void treenode::traverse_overlaps(const keyrange &range, F *function) { } if (c == keyrange::comparison::OVERLAPS) { - bool keep_going = function->fn(m_range, m_txnid); + bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners); if (!keep_going) { return; } @@ -221,29 +236,35 @@ void treenode::traverse_overlaps(const keyrange &range, F *function) { } } -void treenode::insert(const keyrange &range, TXNID txnid) { +void treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) { // choose a child to check. if that child is null, then insert the new node there. // otherwise recur down that child's subtree keyrange::comparison c = range.compare(*m_cmp, m_range); if (c == keyrange::comparison::LESS_THAN) { treenode *left_child = lock_and_rebalance_left(); if (left_child == nullptr) { - left_child = treenode::alloc(m_cmp, range, txnid); + left_child = treenode::alloc(m_cmp, range, txnid, is_shared); m_left_child.set(left_child); } else { - left_child->insert(range, txnid); + left_child->insert(range, txnid, is_shared); left_child->mutex_unlock(); } - } else { - invariant(c == keyrange::comparison::GREATER_THAN); + } else if (c == keyrange::comparison::GREATER_THAN) { + //invariant(c == keyrange::comparison::GREATER_THAN); treenode *right_child = lock_and_rebalance_right(); if (right_child == nullptr) { - right_child = treenode::alloc(m_cmp, range, txnid); + right_child = treenode::alloc(m_cmp, range, txnid, is_shared); m_right_child.set(right_child); } else { - right_child->insert(range, txnid); + right_child->insert(range, txnid, is_shared); right_child->mutex_unlock(); } + } else if (c == keyrange::comparison::EQUALS) { + invariant(is_shared); + invariant(m_is_shared); + add_shared_owner(txnid); + } else { + invariant(0); } } @@ -337,19 +358,38 @@ void treenode::recursive_remove(void) { treenode::free(this); } -treenode *treenode::remove(const keyrange &range) { +void treenode::remove_shared_owner(TXNID txnid) { + m_owners->erase(txnid); + /* if there is just one owner left, move it to m_txnid */ + if (m_owners->size() == 1) + { + m_txnid = * m_owners->begin(); + delete m_owners; + m_owners = nullptr; + } +} + +treenode *treenode::remove(const keyrange &range, TXNID txnid) { treenode *child; // if the range is equal to this node's range, then just remove // the root of this subtree. otherwise search down the tree // in either the left or right children. keyrange::comparison c = range.compare(*m_cmp, m_range); switch (c) { - case keyrange::comparison::EQUALS: - return remove_root_of_subtree(); + case keyrange::comparison::EQUALS: { + // if we are the only owners, remove. Otherwise, just remove + // us from the owners list. + if (txnid != TXNID_ANY && has_multiple_owners()) { + remove_shared_owner(txnid); + return this; + } else { + return remove_root_of_subtree(); + } + } case keyrange::comparison::LESS_THAN: child = m_left_child.get_locked(); invariant_notnull(child); - child = child->remove(range); + child = child->remove(range, txnid); // unlock the child if there still is one. // regardless, set the right child pointer @@ -361,7 +401,7 @@ treenode *treenode::remove(const keyrange &range) { case keyrange::comparison::GREATER_THAN: child = m_right_child.get_locked(); invariant_notnull(child); - child = child->remove(range); + child = child->remove(range, txnid); // unlock the child if there still is one. 
// regardless, set the right child pointer diff --git a/utilities/transactions/range_locking/locktree/treenode.h b/utilities/transactions/range_locking/locktree/treenode.h index a4b01f1cc..6b082acc4 100644 --- a/utilities/transactions/range_locking/locktree/treenode.h +++ b/utilities/transactions/range_locking/locktree/treenode.h @@ -92,7 +92,7 @@ public: void destroy_root(void); // effect: sets the txnid and copies the given range for this node - void set_range_and_txnid(const keyrange &range, TXNID txnid); + void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared); // returns: true iff this node is marked as empty bool is_empty(void); @@ -127,12 +127,12 @@ public: // effect: inserts the given range and txnid into a subtree, recursively // requires: range does not overlap with any node below the subtree - void insert(const keyrange &range, TXNID txnid); + void insert(const keyrange &range, TXNID txnid, bool is_shared); // effect: removes the given range from the subtree // requires: range exists in the subtree // returns: the root of the resulting subtree - treenode *remove(const keyrange &range); + treenode *remove(const keyrange &range, TXNID txnid); // effect: removes this node and all of its children, recursively // requires: every node at and below this node is unlocked @@ -166,13 +166,41 @@ private: // destroyed, it frees the memory associated with whatever range // it has at the time of destruction. keyrange m_range; + + void add_shared_owner(TXNID txnid) + { + assert(m_is_shared); + if (m_txnid != TXNID_SHARED) + { + m_owners= new TxnidVector; + m_owners->insert(m_txnid); + m_txnid= TXNID_SHARED; + } + m_owners->insert(txnid); + } + void remove_shared_owner(TXNID txnid); + + bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); } + +private: + // Owner transaction id. + // A value of TXNID_SHARED means this node has multiple owners TXNID m_txnid; + // If true, this lock is a non-exclusive lock, and it can have either + // one or several owners. + bool m_is_shared; + + // List of the owners, or nullptr if there's just one owner. + TxnidVector *m_owners; + // two child pointers child_ptr m_left_child; child_ptr m_right_child; // comparator for ranges + // psergey-todo: Is there any sense to store the comparator in each tree + // node? const comparator *m_cmp; // marked for the root node. the root node is never free()'d @@ -185,6 +213,10 @@ private: // effect: initializes an empty node with the given comparator void init(const comparator *cmp); + // requires: this is a shared node (m_is_shared==true) + // effect: another transaction is added as an owner. + void add_shared_owner(TXNID txnid); + // requires: *parent is initialized to something meaningful. 
// requires: subtree is non-empty // returns: the leftmost child of the given subtree @@ -230,7 +262,8 @@ private: treenode *maybe_rebalance(void); // returns: allocated treenode populated with a copy of the range and txnid - static treenode *alloc(const comparator *cmp, const keyrange &range, TXNID txnid); + static treenode *alloc(const comparator *cmp, const keyrange &range, + TXNID txnid, bool is_shared); // requires: node is a locked root node, or an unlocked non-root node static void free(treenode *node); diff --git a/utilities/transactions/range_locking/portability/txn_subst.h b/utilities/transactions/range_locking/portability/txn_subst.h index 3882eb1c5..58c3fced0 100644 --- a/utilities/transactions/range_locking/portability/txn_subst.h +++ b/utilities/transactions/range_locking/portability/txn_subst.h @@ -3,8 +3,24 @@ // #pragma once +#include <set> #include "util/omt.h" typedef uint64_t TXNID; #define TXNID_NONE ((TXNID)0) +// A set of transactions +// (TODO: consider using class toku::txnid_set. The reason for using STL +// container was that its API is easier) +class TxnidVector : public std::set<TXNID> { +public: + bool contains(TXNID txnid) { return find(txnid) != end(); } +}; + +// A value for lock structures with a meaning "the lock is owned by multiple +// transactions (and one has to check the TxnidVector to get their ids) +#define TXNID_SHARED (TXNID(-1)) + +// Auxiliary value meaning "any transaction id will do". No real transaction +// may have this is as id. +#define TXNID_ANY (TXNID(-2)) diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index b6c91393d..34144c2af 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -826,7 +826,7 @@ Status RangeLockMgr::TryRangeLock(PessimisticTransaction* txn, uint32_t column_family_id, const Endpoint &start_endp, const Endpoint &end_endp, - bool /*exclusive*/) { + bool exclusive) { toku::lock_request request; request.create(mutex_factory_); DBT start_key_dbt, end_key_dbt; @@ -848,7 +848,8 @@ Status RangeLockMgr::TryRangeLock(PessimisticTransaction* txn, auto lt= get_locktree_by_cfid(column_family_id); request.set(lt, (TXNID)txn, &start_key_dbt, &end_key_dbt, - toku::lock_request::WRITE, false /* not a big txn */, + exclusive? toku::lock_request::WRITE: toku::lock_request::READ, + false /* not a big txn */, (void*)wait_txn_id); uint64_t killed_time_msec = 0; // TODO: what should this have? 
@@ -1260,13 +1261,15 @@ struct LOCK_PRINT_CONTEXT { }; static -void push_into_lock_status_data(void* param, const DBT *left, - const DBT *right, TXNID txnid_arg) { +void push_into_lock_status_data(void* param, + const DBT *left, const DBT *right, + TXNID txnid_arg, bool is_shared, + TxnidVector *owners) { struct LOCK_PRINT_CONTEXT *ctx= (LOCK_PRINT_CONTEXT*)param; struct KeyLockInfo info; info.key.append((const char*)left->data, (size_t)left->size); - info.exclusive= true; + info.exclusive= !is_shared; if (!(left->size == right->size && !memcmp(left->data, right->data, left->size))) @@ -1276,8 +1279,15 @@ void push_into_lock_status_data(void* param, const DBT *left, info.key2.append((const char*)right->data, right->size); } - TXNID txnid= ((PessimisticTransaction*)txnid_arg)->GetID(); - info.ids.push_back(txnid); + if (txnid_arg != TXNID_SHARED) { + TXNID txnid= ((PessimisticTransaction*)txnid_arg)->GetID(); + info.ids.push_back(txnid); + } else { + for (auto it : *owners) { + TXNID real_id= ((PessimisticTransaction*)it)->GetID(); + info.ids.push_back(real_id); + } + } ctx->data->insert({ctx->cfh_id, info}); }
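
To make the ownership model introduced above (treenode.{h,cc} and txn_subst.h) easier to follow, here is a minimal standalone sketch of the single-owner/shared-owner transition: a node starts with a single owner in m_txnid; adding a second shared owner switches m_txnid to TXNID_SHARED and moves the owners into a TxnidVector (a std::set<TXNID>); removing owners collapses back to the single-owner form once one owner remains. This mirrors the logic of treenode::add_shared_owner/remove_shared_owner in the patch, but lock_node and main() are illustrative stand-ins only (the range, the mutex and the tree linkage are omitted).

#include <cassert>
#include <cstdint>
#include <set>

typedef uint64_t TXNID;
static const TXNID TXNID_SHARED = TXNID(-1);  // "owned by multiple txns"

// Simplified stand-in for the ownership fields of the patch's treenode.
struct lock_node {
  TXNID m_txnid;              // single owner, or TXNID_SHARED
  bool m_is_shared;           // true for non-exclusive locks
  std::set<TXNID> *m_owners;  // owner list, used only when m_txnid == TXNID_SHARED

  lock_node(TXNID owner, bool is_shared)
      : m_txnid(owner), m_is_shared(is_shared), m_owners(nullptr) {}

  void add_shared_owner(TXNID txnid) {
    assert(m_is_shared);
    if (m_txnid != TXNID_SHARED) {
      // First extra owner: switch to the shared representation.
      m_owners = new std::set<TXNID>;
      m_owners->insert(m_txnid);
      m_txnid = TXNID_SHARED;
    }
    m_owners->insert(txnid);
  }

  void remove_shared_owner(TXNID txnid) {
    m_owners->erase(txnid);
    // With one owner left, collapse back to the single-owner form.
    if (m_owners->size() == 1) {
      m_txnid = *m_owners->begin();
      delete m_owners;
      m_owners = nullptr;
    }
  }
};

int main() {
  lock_node n(/*owner=*/10, /*is_shared=*/true);
  n.add_shared_owner(20);     // m_txnid becomes TXNID_SHARED, owners = {10, 20}
  assert(n.m_txnid == TXNID_SHARED);
  n.remove_shared_owner(10);  // collapses back to a single owner, 20
  assert(n.m_txnid == 20 && n.m_owners == nullptr);
  return 0;
}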
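The read path in locktree.cc (iterate_and_get_overlapping_row_locks2 and acquire_lock_consolidated) adds the requester as another owner only when the overlap consists of exactly one lock that is shared and whose endpoints equal the requested range; in every other case the request falls through to the usual conflict-detection and consolidation code, which is why the commit message says mismatched ranges behave like exclusive locks. A rough standalone restatement of that decision, using integer endpoints in place of DBTs and the comparator (row_lock here is a simplified stand-in, not the struct from locktree.cc):

#include <vector>

// Simplified stand-in for the row_lock struct, with integer endpoints.
struct row_lock {
  int left, right;
  bool is_shared;
};

// True only if the overlapping set is a single shared lock whose bounds are
// exactly the requested [left, right] range -- the one case where the read
// request can simply be added as another owner of the existing lock
// (via locked_keyrange::add_shared_owner in the patch).
static bool single_matching_shared_lock(const std::vector<row_lock> &overlapping,
                                        int left, int right) {
  return overlapping.size() == 1 && overlapping[0].is_shared &&
         overlapping[0].left == left && overlapping[0].right == right;
}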
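The final hunk (transaction_lock_mgr.cc) adjusts lock-status reporting for multi-owner locks: when the reported txnid is TXNID_SHARED, the real owners have to be read from the TxnidVector passed to the dump callback instead of the single id. A trimmed sketch of that branching, with raw TXNIDs and a std::vector output standing in for the PessimisticTransaction pointers and KeyLockInfo used by push_into_lock_status_data (collect_owner_ids is an illustrative name, not a function from the patch):

#include <cstdint>
#include <set>
#include <vector>

typedef uint64_t TXNID;
static const TXNID TXNID_SHARED = TXNID(-1);

// Report the owners of one lock: either the single owner in txnid_arg,
// or, when txnid_arg == TXNID_SHARED, every entry of the owners set.
static void collect_owner_ids(TXNID txnid_arg, const std::set<TXNID> *owners,
                              std::vector<TXNID> *out) {
  if (txnid_arg != TXNID_SHARED) {
    out->push_back(txnid_arg);
  } else {
    for (TXNID t : *owners) out->push_back(t);
  }
}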