revision-id: 1b423e3954d932c4624337308e3e1cd98481a495 (v5.8-1041-g1b423e395) parent(s): bcc7923688446c58a21f4f0ab287b147f406abc2 author: Sergei Petrunia committer: Sergei Petrunia timestamp: 2019-04-29 23:29:02 +0300 message: Range Locking: clean up comments --- include/rocksdb/utilities/transaction.h | 62 ++++++++++++----------- include/rocksdb/utilities/transaction_db.h | 13 +++-- utilities/transactions/pessimistic_transaction.cc | 1 - utilities/transactions/transaction_lock_mgr.cc | 21 +++----- utilities/transactions/transaction_lock_mgr.h | 14 +++-- 5 files changed, 52 insertions(+), 59 deletions(-) diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index b0f1308ed..6b84d4a44 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -31,50 +31,52 @@ using TransactionID = uint64_t; Prefix ranges are introduced below. == Basic Ranges == - - Basic ranges can be defined over rowkeys. Key Comparator defines ordering of - rowkeys, a finite closed range is the same as range over numbers: + Let's start from basic ranges. Key Comparator defines ordering of rowkeys. + Then, one can specify finite closed ranges by just providing rowkeys of their + endpoints: lower_endpoint <= X <= upper_endpoint - The endpoints here are possible rowkey values. - - == Lexicographic-like ordering == - - A lexicographic-like ordering satisfies these criteria: - - 1.The ordering is prefix-based. If there are two keys in form + However our goal is to provide a richer set of endpoints. Read on. - key_a = {prefix_a suffix_a} - key_b = {prefix_b suffix_b} + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} and prefix_a < prefix_b then key_a < key_b. - 2. An empty string is less than any other value. 
From this it follows that - for any prefix and suffix, {prefix, suffix} > {prefix}. + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form - 3. The row comparison function is able to compare key prefixes. If the data - domain includes keys A and B, then the comparison function is able to compare - equal-length prefixes: + "prefix is $PREFIX" - min_len= min(byte_length(A), byte_length(B)); - cmp(Slice(A, min_len), Slice(B, min_len)) // this call is valid + which translates to a range in form - == Prefix ranges == + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare less than any possible suffix, and +infinity + will compare as greater than any possible suffix. - With lexicographic-like ordering, one may wish to construct ranges from a - restriction in form prefix=P: - - the left endpoint would would be {P, inf_suffix=false} - - the right endpoint would be {P, inf_suffix=true}. + class Endpoint allows to define these kinds of ranges. - == Supported comparators == - BytewiseComparator and ReverseBytweiseComparator meet the lexicographic-like - ordering requirements. + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid - TODO: RangeLocking will refuse to work if any other comparator is used, - although other comparators meeting this property could be used as well. + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. */ class Endpoint { @@ -82,7 +84,7 @@ class Endpoint { Slice slice; /* - true : the key has an "infinity" suffix. 
A suffix that would compare as + true : the key has a "+infinity" suffix. A suffix that would compare as greater than any other suffix false : otherwise */ diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 8f0e018f2..7ebac3e06 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -33,7 +33,8 @@ enum TxnDBWritePolicy { const uint32_t kInitialMaxDeadlocks = 5; -// A handle to control RangeLockMgr +// A handle to control RangeLockMgr (Range-based lock manager) from outside +// RocksDB class RangeLockMgrHandle { public: virtual int set_max_lock_memory(size_t max_lock_memory) = 0; @@ -41,12 +42,15 @@ class RangeLockMgrHandle { virtual ~RangeLockMgrHandle() {}; }; -// A factory function to create a Range Lock Manager +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::range_lock_mgr to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. RangeLockMgrHandle* NewRangeLockManager( std::shared_ptr<TransactionDBMutexFactory> mutex_factory ); - struct TransactionDBOptions { // Specifies the maximum number of keys that can be locked at the same time // per column family. 
@@ -210,7 +214,6 @@ struct DeadlockPath { bool empty() { return path.empty() && !limit_exceeded; } }; - class TransactionDB : public StackableDB { public: // Optimized version of ::Write that receives more optimization request such @@ -284,7 +287,7 @@ class TransactionDB : public StackableDB { GetLockStatusData() = 0; virtual std::vector<DeadlockPath> GetDeadlockInfoBuffer() = 0; virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; - + protected: // To Create an TransactionDB, call Open() // The ownership of db is transferred to the base StackableDB diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index ff653fa0e..082934e75 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -133,7 +133,6 @@ WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db, Status PessimisticTransaction::CommitBatch(WriteBatch* batch) { TransactionKeyMap keys_to_unlock; - Status s = LockBatch(batch, &keys_to_unlock); if (!s.ok()) { diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index f837f6a5a..b6c91393d 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -1180,18 +1180,6 @@ void RangeLockMgr::AddColumnFamily(const ColumnFamilyHandle *cfh) { InstrumentedMutexLock l(&ltree_map_mutex_); if (ltree_map_.find(column_family_id) == ltree_map_.end()) { DICTIONARY_ID dict_id = { .dictid = column_family_id }; - // "RocksDB_SE_v3.10" // BytewiseComparator() ,ReverseBytewiseComparator() - /* - toku::comparator *ltree_cmp; - if (!strcmp(cfh->GetComparator()->Name(), "RocksDB_SE_v3.10")) - ltree_cmp = &fw_cmp_; - else if (!strcmp(cfh->GetComparator()->Name(),"rev:RocksDB_SE_v3.10")) - ltree_cmp = &bw_cmp_; - else { - assert(false); - ltree_cmp= nullptr; - } - */ toku::comparator cmp; cmp.create(compare_dbt_endpoints, 
(void*)cfh->GetComparator(), NULL); toku::locktree *ltree= ltm_.get_lt(dict_id, cmp, @@ -1210,6 +1198,10 @@ void RangeLockMgr::RemoveColumnFamily(const ColumnFamilyHandle *cfh) { // Remove lock_map for this column family. Since the lock map is stored // as a shared ptr, concurrent transactions can still keep using it // until they release their references to it. + + // TODO^ if one can drop a column family while a transaction is holding a + // lock in it, is column family's comparator guaranteed to survive until + // all locks are released? (we depend on this). { InstrumentedMutexLock l(&ltree_map_mutex_); @@ -1263,9 +1255,8 @@ toku::locktree *RangeLockMgr::get_locktree_by_cfid(uint32_t column_family_id) { } struct LOCK_PRINT_CONTEXT { - BaseLockMgr::LockStatusData *data; - // this will not be needed when locks are per-column-family: - uint32_t cfh_id; + BaseLockMgr::LockStatusData *data; // Save locks here + uint32_t cfh_id; // Column Family whose tree we are traversing }; static diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h index 1f5dbc634..608d6f34f 100644 --- a/utilities/transactions/transaction_lock_mgr.h +++ b/utilities/transactions/transaction_lock_mgr.h @@ -212,8 +212,8 @@ class RangeLockMgr : }; // Get a lock on a range - // (TODO: this allows to acquire exclusive range locks although they are not - // used ATM) + // @note only exclusive locks are currently supported (requesting a + // non-exclusive lock will get an exclusive one) Status TryRangeLock(PessimisticTransaction* txn, uint32_t column_family_id, const Endpoint &start_endp, @@ -222,10 +222,7 @@ class RangeLockMgr : void UnLock(const PessimisticTransaction* txn, const TransactionKeyMap* keys, Env* env) override ; - /* - Same as above, but *keys is guaranteed to hold all the locks obtained by - the transaction. 
- */ + // Release all locks the transaction is holding void UnLockAll(const PessimisticTransaction* txn, Env* env); void UnLock(PessimisticTransaction* txn, uint32_t column_family_id, const std::string& key, Env* env) override ; @@ -238,8 +235,7 @@ class RangeLockMgr : ~RangeLockMgr(); - int set_max_lock_memory(size_t max_lock_memory) override - { + int set_max_lock_memory(size_t max_lock_memory) override { return ltm_.set_max_lock_memory(max_lock_memory); } @@ -261,8 +257,10 @@ class RangeLockMgr : InstrumentedMutex ltree_map_mutex_; // Per-thread cache of ltree_map_. + // (uses the same approach as TransactionLockMgr::lock_maps_cache_) std::unique_ptr<ThreadLocalPtr> ltree_lookup_cache_; + // Get the lock tree which stores locks for Column Family with given cf_id toku::locktree *get_locktree_by_cfid(uint32_t cf_id); static int compare_dbt_endpoints(__toku_db*, void *arg, const DBT *a_key, const DBT *b_key);