revision-id: 34a0cf718aa19d5bda2d471b2c6e20dac8256989 (fb-prod201903-257-g34a0cf718aa)
parent(s): 30c4f566dca5d8a8bec2969faa7132c746afad92
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-24 20:01:53 +0300
message:
Range locking fixes
- Update RocksDB to revision that prints the locks in STO-mode, too.
- Make rocksdb.range_locking test survive --repeat=N runs.
---
.../suite/rocksdb/include/select_from_is_rowlocks.inc | 2 +-
mysql-test/suite/rocksdb/r/range_locking.result | 11 +++++++++++
mysql-test/suite/rocksdb/t/range_locking.inc | 18 +++++++++++++++++-
rocksdb | 2 +-
4 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
index da80d466012..cb95b149bae 100644
--- a/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
+++ b/mysql-test/suite/rocksdb/include/select_from_is_rowlocks.inc
@@ -17,7 +17,7 @@ set @cf_id=(select column_family from information_schema.rocksdb_ddl
where table_name='t1' and index_name='PRIMARY');
set @rtrx_id=(select transaction_id from information_schema.rocksdb_trx
where thread_id=connection_id());
-set @indexnr= (select lpad(hex(index_number),6,'0') from information_schema.rocksdb_ddl
+set @indexnr= (select lower(lpad(hex(index_number),6,'0')) from information_schema.rocksdb_ddl
where table_name='t1' and index_name='PRIMARY');
set @indexnr_next= (select lpad(hex(index_number+1),6,'0') from information_schema.rocksdb_ddl
diff --git a/mysql-test/suite/rocksdb/r/range_locking.result b/mysql-test/suite/rocksdb/r/range_locking.result
index f9f77a31476..bee37926ddf 100644
--- a/mysql-test/suite/rocksdb/r/range_locking.result
+++ b/mysql-test/suite/rocksdb/r/range_locking.result
@@ -80,6 +80,12 @@ drop table t1;
#
# Test INFORMATION_SCHEMA.lock_info in range-locking mode
#
+connect con1,localhost,root,,;
+connection con1;
+create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
create table t1 (
pk int,
a int,
@@ -111,6 +117,11 @@ $cf_id $trx_id 0000${indexnr}80000002 - 0100${indexnr}80000009 X
$cf_id $trx_id 0000${indexnr}8000000a X
rollback;
drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
#
# MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
#
diff --git a/mysql-test/suite/rocksdb/t/range_locking.inc b/mysql-test/suite/rocksdb/t/range_locking.inc
index 1c5fb92b584..ecc8e4432bb 100644
--- a/mysql-test/suite/rocksdb/t/range_locking.inc
+++ b/mysql-test/suite/rocksdb/t/range_locking.inc
@@ -90,7 +90,7 @@ connection con1;
rollback;
drop table t2;
-# Cleanup
+# cleanup
connection default;
disconnect con1;
disconnect con2;
@@ -99,6 +99,15 @@ drop table t1;
--echo #
--echo # Test INFORMATION_SCHEMA.lock_info in range-locking mode
--echo #
+
+connect (con1,localhost,root,,);
+connection con1;
+eval create table t0 (a int primary key);
+begin;
+insert into t0 values (1);
+connection default;
+
+
eval
create table t1 (
pk int,
@@ -112,6 +121,8 @@ insert into t1 values
begin;
select * from t1 where pk=10 for update;
+#let TRX1_ID=`(select transaction_id from information_schema.rocksdb_trx where thread_id=connection_id())` ;
+let $select_from_is_rowlocks_current_trx_only=1;
--source suite/rocksdb/include/select_from_is_rowlocks.inc
delete from t1 where pk between 25 and 40;
@@ -127,6 +138,11 @@ select * from t1 where pk between 2 and 9 for update;
rollback;
drop table t1;
+connection con1;
+rollback;
+drop table t0;
+connection default;
+disconnect con1;
--echo #
--echo # MDEV-18104: MyRocks-Gap-Lock: range locking bounds are incorrect for multi-part keys
diff --git a/rocksdb b/rocksdb
index c15474bf496..670bbd25c10 160000
--- a/rocksdb
+++ b/rocksdb
@@ -1 +1 @@
-Subproject commit c15474bf496a227c6247e2e56040009c55c4bbc0
+Subproject commit 670bbd25c10c47638f41ee7f635a99443d21e937
1
0

[Commits] 670bbd25c: Range Locking: make GetLockStatusData report locks in STO mode
by psergey 24 Nov '19
by psergey 24 Nov '19
24 Nov '19
revision-id: 670bbd25c10c47638f41ee7f635a99443d21e937 (v5.8-1896-g670bbd25c)
parent(s): c15474bf496a227c6247e2e56040009c55c4bbc0
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-24 18:26:26 +0300
message:
Range Locking: make GetLockStatusData report locks in STO mode
(Implement this in locktree:dump_locks)
Before this patch, GetLockStatusData would return an empty list when
the lock tree is in STO (Single Transaction Optimization) mode.
This is confusing.
Note: some MTR testcases have workarounds for this: they get a lock in
another connection to make sure we're not in STO-mode.
They will still have to do that, because STO-mode may report different
set of locks (e.g. the locks can overlap with one another).
---
.../range_locking/locktree/locktree.cc | 44 +++++++++++++++-------
1 file changed, 30 insertions(+), 14 deletions(-)
diff --git a/utilities/transactions/range_locking/locktree/locktree.cc b/utilities/transactions/range_locking/locktree/locktree.cc
index 00ce5aace..4c7ce1bfe 100644
--- a/utilities/transactions/range_locking/locktree/locktree.cc
+++ b/utilities/transactions/range_locking/locktree/locktree.cc
@@ -543,22 +543,38 @@ void locktree::dump_locks(void *cdata, dump_callback cb)
lkr.prepare(m_rangetree);
lkr.acquire(range);
- GrowableArray<row_lock> all_locks;
- all_locks.init();
- iterate_and_get_overlapping_row_locks(&lkr, &all_locks);
-
- const size_t n_locks = all_locks.get_size();
- for (size_t i = 0; i < n_locks; i++) {
- const row_lock lock = all_locks.fetch_unchecked(i);
- (*cb)(cdata,
- lock.range.get_left_key(),
- lock.range.get_right_key(),
- lock.txnid,
- lock.is_shared,
- lock.owners);
+ TXNID sto_txn;
+ if ((sto_txn = toku_unsafe_fetch(m_sto_txnid)) != TXNID_NONE) {
+ // insert all of the ranges from the single txnid buffer into a new rangtree
+ range_buffer::iterator iter(&m_sto_buffer);
+ range_buffer::iterator::record rec;
+ while (iter.current(&rec)) {
+ (*cb)(cdata,
+ rec.get_left_key(),
+ rec.get_right_key(),
+ sto_txn,
+ !rec.get_exclusive_flag(),
+ nullptr);
+ iter.next();
+ }
+ } else {
+ GrowableArray<row_lock> all_locks;
+ all_locks.init();
+ iterate_and_get_overlapping_row_locks(&lkr, &all_locks);
+
+ const size_t n_locks = all_locks.get_size();
+ for (size_t i = 0; i < n_locks; i++) {
+ const row_lock lock = all_locks.fetch_unchecked(i);
+ (*cb)(cdata,
+ lock.range.get_left_key(),
+ lock.range.get_right_key(),
+ lock.txnid,
+ lock.is_shared,
+ lock.owners);
+ }
+ all_locks.deinit();
}
lkr.release();
- all_locks.deinit();
range.destroy();
}
1
0
revision-id: 30c4f566dca5d8a8bec2969faa7132c746afad92 (fb-prod201903-256-g30c4f566dca)
parent(s): b4a950866d15a0dbcbee527c2ab24ae9f2c8f2c6
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-24 17:23:49 +0300
message:
Update to merged range-locking rocksdb
---
rocksdb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rocksdb b/rocksdb
index 156a12604b8..c15474bf496 160000
--- a/rocksdb
+++ b/rocksdb
@@ -1 +1 @@
-Subproject commit 156a12604b830df3765a12d7194ca5f9ca7d67e6
+Subproject commit c15474bf496a227c6247e2e56040009c55c4bbc0
1
0

[Commits] c15474bf4: Backport of: Initial support for shared point locks, support lock escalations
by psergey 24 Nov '19
by psergey 24 Nov '19
24 Nov '19
revision-id: c15474bf496a227c6247e2e56040009c55c4bbc0 (v5.8-1895-gc15474bf4)
parent(s): 156a12604b830df3765a12d7194ca5f9ca7d67e6
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-24 17:03:40 +0300
message:
Backport of: Initial support for shared point locks, support lock escalations
- Locks can now be shared.
- Sharing is only supported as long as the locked ranges are an exact match.
If this requirement is not met, the locks behave as exclusive locks.
- Make Lock Escalation keep shared locks. Shared locks are not collapsed
with other kinds of locks.
- Replace RangeLockMgrHandle::get_escalation_count() with GetStatus()
which also reports amount of memory used for Range Locking (and there
is more data we could report through this)
- Initialize LTM_STATUS_S::m_initialized.
---
include/rocksdb/utilities/transaction_db.h | 9 +-
utilities/transactions/range_locking/db.h | 3 +
.../transactions/range_locking/ft/ft-status.h | 2 +-
.../range_locking/locktree/concurrent_tree.cc | 17 +-
.../range_locking/locktree/concurrent_tree.h | 18 +-
.../range_locking/locktree/locktree.cc | 240 +++++++++++++++++----
.../transactions/range_locking/locktree/locktree.h | 19 +-
.../range_locking/locktree/range_buffer.cc | 20 +-
.../range_locking/locktree/range_buffer.h | 15 +-
.../range_locking/locktree/treenode.cc | 84 ++++++--
.../transactions/range_locking/locktree/treenode.h | 30 ++-
.../range_locking/portability/txn_subst.h | 16 ++
utilities/transactions/transaction_lock_mgr.cc | 42 ++--
utilities/transactions/transaction_lock_mgr.h | 6 +-
14 files changed, 410 insertions(+), 111 deletions(-)
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 19e6d2411..bb13bba07 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -56,7 +56,14 @@ class LockManagerHandle {
class RangeLockMgrHandle : public LockManagerHandle {
public:
virtual int set_max_lock_memory(size_t max_lock_memory) = 0;
- virtual uint64_t get_escalation_count() = 0;
+
+ class Counters {
+ public:
+ uint64_t escalation_count;
+ uint64_t current_lock_memory;
+ };
+
+ virtual Counters GetStatus() = 0;
virtual ~RangeLockMgrHandle() {};
};
diff --git a/utilities/transactions/range_locking/db.h b/utilities/transactions/range_locking/db.h
index f9349c6ae..64ea28345 100644
--- a/utilities/transactions/range_locking/db.h
+++ b/utilities/transactions/range_locking/db.h
@@ -6,6 +6,8 @@
typedef struct __toku_db DB;
typedef struct __toku_dbt DBT;
+
+// port: this is currently not used
struct simple_dbt {
uint32_t len;
void *data;
@@ -72,6 +74,7 @@ struct __toku_dbt {
void*data;
uint32_t size;
uint32_t ulen;
+ // One of DB_DBT_XXX flags
uint32_t flags;
};
typedef struct __toku_descriptor {
diff --git a/utilities/transactions/range_locking/ft/ft-status.h b/utilities/transactions/range_locking/ft/ft-status.h
index 25051f1ed..242964f0b 100644
--- a/utilities/transactions/range_locking/ft/ft-status.h
+++ b/utilities/transactions/range_locking/ft/ft-status.h
@@ -80,7 +80,7 @@ public:
TOKU_ENGINE_STATUS_ROW_S status[LTM_STATUS_NUM_ROWS];
private:
- bool m_initialized;
+ bool m_initialized = false;
};
typedef LTM_STATUS_S* LTM_STATUS;
extern LTM_STATUS_S ltm_status;
diff --git a/utilities/transactions/range_locking/locktree/concurrent_tree.cc b/utilities/transactions/range_locking/locktree/concurrent_tree.cc
index a35a9e40b..74d65f710 100644
--- a/utilities/transactions/range_locking/locktree/concurrent_tree.cc
+++ b/utilities/transactions/range_locking/locktree/concurrent_tree.cc
@@ -97,6 +97,12 @@ void concurrent_tree::locked_keyrange::acquire(const keyrange &range) {
m_subtree = subtree;
}
+void concurrent_tree::locked_keyrange::add_shared_owner(const keyrange &range,
+ TXNID new_owner)
+{
+ m_subtree->insert(range, new_owner, /*is_shared*/ true);
+}
+
void concurrent_tree::locked_keyrange::release(void) {
m_subtree->mutex_unlock();
}
@@ -110,18 +116,19 @@ void concurrent_tree::locked_keyrange::iterate(F *function) const {
}
}
-void concurrent_tree::locked_keyrange::insert(const keyrange &range, TXNID txnid) {
+void concurrent_tree::locked_keyrange::insert(const keyrange &range,
+ TXNID txnid, bool is_shared) {
// empty means no children, and only the root should ever be empty
if (m_subtree->is_empty()) {
- m_subtree->set_range_and_txnid(range, txnid);
+ m_subtree->set_range_and_txnid(range, txnid, is_shared);
} else {
- m_subtree->insert(range, txnid);
+ m_subtree->insert(range, txnid, is_shared);
}
}
-void concurrent_tree::locked_keyrange::remove(const keyrange &range) {
+void concurrent_tree::locked_keyrange::remove(const keyrange &range, TXNID txnid) {
invariant(!m_subtree->is_empty());
- treenode *new_subtree = m_subtree->remove(range);
+ treenode *new_subtree = m_subtree->remove(range, txnid);
// if removing range changed the root of the subtree,
// then the subtree must be the root of the entire tree.
if (new_subtree == nullptr) {
diff --git a/utilities/transactions/range_locking/locktree/concurrent_tree.h b/utilities/transactions/range_locking/locktree/concurrent_tree.h
index 66a7ff176..fabda7294 100644
--- a/utilities/transactions/range_locking/locktree/concurrent_tree.h
+++ b/utilities/transactions/range_locking/locktree/concurrent_tree.h
@@ -106,15 +106,25 @@ public:
template <class F>
void iterate(F *function) const;
+ // Adds another owner to the lock on the specified keyrange.
+ // requires: the keyrange contains one treenode whose bounds are
+ // exactly equal to the specifed range (no sub/supersets)
+ void add_shared_owner(const keyrange &range, TXNID new_owner);
+
// inserts the given range into the tree, with an associated txnid.
// requires: range does not overlap with anything in this locked_keyrange
// rationale: caller is responsible for only inserting unique ranges
- void insert(const keyrange &range, TXNID txnid);
-
- // effect: removes the given range from the tree
+ void insert(const keyrange &range, TXNID txnid, bool is_shared);
+
+ // effect: removes the given range from the tree.
+ // - txnid=TXNID_ANY means remove the range no matter what its
+ // owners are
+ // - Other value means remove the specified txnid from
+ // ownership (if the range has other owners, it will remain
+ // in the tree)
// requires: range exists exactly in this locked_keyrange
// rationale: caller is responsible for only removing existing ranges
- void remove(const keyrange &range);
+ void remove(const keyrange &range, TXNID txnid);
// effect: removes all of the keys represented by this locked keyrange
// rationale: we'd like a fast way to empty out a tree
diff --git a/utilities/transactions/range_locking/locktree/locktree.cc b/utilities/transactions/range_locking/locktree/locktree.cc
index 9b530c7b0..00ce5aace 100644
--- a/utilities/transactions/range_locking/locktree/locktree.cc
+++ b/utilities/transactions/range_locking/locktree/locktree.cc
@@ -147,6 +147,8 @@ uint32_t locktree::get_reference_count(void) {
struct row_lock {
keyrange range;
TXNID txnid;
+ bool is_shared;
+ TxnidVector *owners;
};
// iterate over a locked keyrange and copy out all of the data,
@@ -157,8 +159,10 @@ static void iterate_and_get_overlapping_row_locks(const concurrent_tree::locked_
GrowableArray<row_lock> *row_locks) {
struct copy_fn_obj {
GrowableArray<row_lock> *row_locks;
- bool fn(const keyrange &range, TXNID txnid) {
- row_lock lock = { .range = range, .txnid = txnid };
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+ row_lock lock = { .range = range, .txnid = txnid,
+ .is_shared = is_shared, .owners = owners};
row_locks->push(lock);
return true;
}
@@ -196,9 +200,10 @@ static uint64_t row_lock_size_in_tree(const row_lock &lock) {
// remove and destroy the given row lock from the locked keyrange,
// then notify the memory tracker of the newly freed lock.
static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr,
- const row_lock &lock, locktree_manager *mgr) {
+ const row_lock &lock, TXNID txnid,
+ locktree_manager *mgr) {
const uint64_t mem_released = row_lock_size_in_tree(lock);
- lkr->remove(lock.range);
+ lkr->remove(lock.range, txnid);
if (mgr != nullptr) {
mgr->note_mem_released(mem_released);
}
@@ -209,7 +214,7 @@ static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr,
static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr,
const row_lock &lock, locktree_manager *mgr) {
uint64_t mem_used = row_lock_size_in_tree(lock);
- lkr->insert(lock.range, lock.txnid);
+ lkr->insert(lock.range, lock.txnid, lock.is_shared);
if (mgr != nullptr) {
mgr->note_mem_used(mem_used);
}
@@ -221,13 +226,17 @@ void locktree::sto_begin(TXNID txnid) {
m_sto_txnid = txnid;
}
-void locktree::sto_append(const DBT *left_key, const DBT *right_key) {
+void locktree::sto_append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request) {
uint64_t buffer_mem, delta;
+
+ // psergey: the below two lines do not make any sense
+ // (and it's the same in upstream TokuDB)
keyrange range;
range.create(left_key, right_key);
buffer_mem = m_sto_buffer.total_memory_size();
- m_sto_buffer.append(left_key, right_key);
+ m_sto_buffer.append(left_key, right_key, is_write_request);
delta = m_sto_buffer.total_memory_size() - buffer_mem;
if (m_mgr != nullptr) {
m_mgr->note_mem_used(delta);
@@ -274,8 +283,10 @@ void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) {
range_buffer::iterator::record rec;
while (iter.current(&rec)) {
sto_lkr.prepare(&sto_rangetree);
- int r = acquire_lock_consolidated(&sto_lkr,
- m_sto_txnid, rec.get_left_key(), rec.get_right_key(), nullptr);
+ int r = acquire_lock_consolidated(&sto_lkr, m_sto_txnid,
+ rec.get_left_key(),
+ rec.get_right_key(),
+ rec.get_exclusive_flag(), nullptr);
invariant_zero(r);
sto_lkr.release();
iter.next();
@@ -285,8 +296,10 @@ void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) {
// locktree's rangetree, on behalf of the old single txnid.
struct migrate_fn_obj {
concurrent_tree::locked_keyrange *dst_lkr;
- bool fn(const keyrange &range, TXNID txnid) {
- dst_lkr->insert(range, txnid);
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+ assert(owners == nullptr);
+ dst_lkr->insert(range, txnid, is_shared);
return true;
}
} migrate_fn;
@@ -301,7 +314,8 @@ void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) {
bool locktree::sto_try_acquire(void *prepared_lkr,
TXNID txnid,
- const DBT *left_key, const DBT *right_key) {
+ const DBT *left_key, const DBT *right_key,
+ bool is_write_request) {
if (m_rangetree->is_empty() && m_sto_buffer.is_empty() && toku_unsafe_fetch(m_sto_score) >= STO_SCORE_THRESHOLD) {
// We can do the optimization because the rangetree is empty, and
// we know its worth trying because the sto score is big enough.
@@ -319,7 +333,7 @@ bool locktree::sto_try_acquire(void *prepared_lkr,
// this txnid can append its lock to the sto buffer successfully.
if (m_sto_txnid != TXNID_NONE) {
invariant(m_sto_txnid == txnid);
- sto_append(left_key, right_key);
+ sto_append(left_key, right_key, is_write_request);
return true;
} else {
invariant(m_sto_buffer.is_empty());
@@ -327,12 +341,66 @@ bool locktree::sto_try_acquire(void *prepared_lkr,
}
}
+
+/*
+ Do the same as iterate_and_get_overlapping_row_locks does, but also check for
+ this:
+ The set of overlapping rows locks consists of just one read-only shared
+ lock with the same endpoints as specified (in that case, we can just add
+ ourselves into that list)
+
+ @return true - One compatible shared lock
+ false - Otherwise
+*/
+static
+bool iterate_and_get_overlapping_row_locks2(const concurrent_tree::locked_keyrange *lkr,
+ const DBT *left_key, const DBT *right_key,
+ comparator *cmp,
+ TXNID txnid,
+ GrowableArray<row_lock> *row_locks) {
+ struct copy_fn_obj {
+ GrowableArray<row_lock> *row_locks;
+ bool first_call= true;
+ bool matching_lock_found = false;
+ const DBT *left_key, *right_key;
+ comparator *cmp;
+
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared,
+ TxnidVector *owners) {
+
+ if (first_call) {
+ first_call = false;
+ if (is_shared &&
+ !(*cmp)(left_key, range.get_left_key()) &&
+ !(*cmp)(right_key, range.get_right_key())) {
+ matching_lock_found = true;
+ }
+ } else {
+ // if we see multiple matching locks, it doesn't matter whether
+ // the first one was matching.
+ matching_lock_found = false;
+ }
+ row_lock lock = { .range = range, .txnid = txnid,
+ .is_shared = is_shared, .owners = owners };
+ row_locks->push(lock);
+ return true;
+ }
+ } copy_fn;
+ copy_fn.row_locks = row_locks;
+ copy_fn.left_key = left_key;
+ copy_fn.right_key = right_key;
+ copy_fn.cmp = cmp;
+ lkr->iterate(©_fn);
+ return copy_fn.matching_lock_found;
+}
+
// try to acquire a lock and consolidate it with existing locks if possible
// param: lkr, a prepared locked keyrange
// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist.
int locktree::acquire_lock_consolidated(void *prepared_lkr,
TXNID txnid,
const DBT *left_key, const DBT *right_key,
+ bool is_write_request,
txnid_set *conflicts) {
int r = 0;
concurrent_tree::locked_keyrange *lkr;
@@ -345,24 +413,60 @@ int locktree::acquire_lock_consolidated(void *prepared_lkr,
// copy out the set of overlapping row locks.
GrowableArray<row_lock> overlapping_row_locks;
overlapping_row_locks.init();
- iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
+ bool matching_shared_lock_found= false;
+
+ if (is_write_request)
+ iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
+ else {
+ matching_shared_lock_found=
+ iterate_and_get_overlapping_row_locks2(lkr, left_key, right_key, &m_cmp,
+ txnid, &overlapping_row_locks);
+ // psergey-todo: what to do now? So, we have figured we have just one
+ // shareable lock. Need to add us into it as an owner but the lock
+ // pointer cannot be kept?
+ // A: use find_node_with_overlapping_child(key_range, nullptr);
+ // then, add ourselves to the owner list.
+ // Dont' foreget to release the subtree after that.
+ }
+
+ if (matching_shared_lock_found) {
+ // there is just one non-confliting matching shared lock.
+ // we are hilding a lock on it (see acquire() call above).
+ // we need to modify it to indicate there is another locker...
+ lkr->add_shared_owner(requested_range, txnid);
+
+ // Pretend shared lock uses as much memory.
+ row_lock new_lock = { .range = requested_range, .txnid = txnid,
+ .is_shared = false, .owners = nullptr };
+ uint64_t mem_used = row_lock_size_in_tree(new_lock);
+ if (m_mgr) {
+ m_mgr->note_mem_used(mem_used);
+ }
+ return 0;
+ }
+
+
size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
// if any overlapping row locks conflict with this request, bail out.
+
bool conflicts_exist = determine_conflicting_txnids(overlapping_row_locks,
txnid, conflicts);
if (!conflicts_exist) {
// there are no conflicts, so all of the overlaps are for the requesting txnid.
// so, we must consolidate all existing overlapping ranges and the requested
// range into one dominating range. then we insert the dominating range.
+ bool all_shared = !is_write_request;
for (size_t i = 0; i < num_overlapping_row_locks; i++) {
row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i);
invariant(overlapping_lock.txnid == txnid);
requested_range.extend(m_cmp, overlapping_lock.range);
- remove_row_lock_from_tree(lkr, overlapping_lock, m_mgr);
+ remove_row_lock_from_tree(lkr, overlapping_lock, TXNID_ANY, m_mgr);
+ all_shared = all_shared && overlapping_lock.is_shared;
}
- row_lock new_lock = { .range = requested_range, .txnid = txnid };
+ row_lock new_lock = { .range = requested_range, .txnid = txnid,
+ .is_shared = all_shared, .owners = nullptr };
insert_row_lock_into_tree(lkr, new_lock, m_mgr);
} else {
r = DB_LOCK_NOTGRANTED;
@@ -383,7 +487,7 @@ int locktree::acquire_lock(bool is_write_request,
int r = 0;
// we are only supporting write locks for simplicity
- invariant(is_write_request);
+ //invariant(is_write_request);
// acquire and prepare a locked keyrange over the requested range.
// prepare is a serialzation point, so we take the opportunity to
@@ -391,9 +495,11 @@ int locktree::acquire_lock(bool is_write_request,
concurrent_tree::locked_keyrange lkr;
lkr.prepare(m_rangetree);
- bool acquired = sto_try_acquire(&lkr, txnid, left_key, right_key);
+ bool acquired = sto_try_acquire(&lkr, txnid, left_key, right_key,
+ is_write_request);
if (!acquired) {
- r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key, conflicts);
+ r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key,
+ is_write_request, conflicts);
}
lkr.release();
@@ -418,7 +524,7 @@ int locktree::try_acquire_lock(bool is_write_request,
// the locktree silently upgrades read locks to write locks for simplicity
int locktree::acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
txnid_set *conflicts, bool big_txn) {
- return acquire_write_lock(txnid, left_key, right_key, conflicts, big_txn);
+ return try_acquire_lock(false, txnid, left_key, right_key, conflicts, big_txn);
}
int locktree::acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
@@ -447,7 +553,9 @@ void locktree::dump_locks(void *cdata, dump_callback cb)
(*cb)(cdata,
lock.range.get_left_key(),
lock.range.get_right_key(),
- lock.txnid);
+ lock.txnid,
+ lock.is_shared,
+ lock.owners);
}
lkr.release();
all_locks.deinit();
@@ -525,8 +633,11 @@ void locktree::remove_overlapping_locks_for_txnid(TXNID txnid,
row_lock lock = overlapping_row_locks.fetch_unchecked(i);
// If this isn't our lock, that's ok, just don't remove it.
// See rationale above.
- if (lock.txnid == txnid) {
- remove_row_lock_from_tree(&lkr, lock, m_mgr);
+ // psergey-todo: for shared locks, just remove ourselves from the
+ // owners.
+ if (lock.txnid == txnid ||
+ (lock.owners && lock.owners->contains(txnid))) {
+ remove_row_lock_from_tree(&lkr, lock, txnid, m_mgr);
}
}
@@ -630,11 +741,17 @@ static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr,
int num_extracted;
int num_to_extract;
row_lock *row_locks;
- bool fn(const keyrange &range, TXNID txnid) {
+ bool fn(const keyrange &range, TXNID txnid, bool is_shared, TxnidVector *owners) {
if (num_extracted < num_to_extract) {
row_lock lock;
lock.range.create_copy(range);
lock.txnid = txnid;
+ lock.is_shared= is_shared;
+ // deep-copy the set of owners:
+ if (owners)
+ lock.owners = new TxnidVector(*owners);
+ else
+ lock.owners = nullptr;
row_locks[num_extracted++] = lock;
return true;
} else {
@@ -655,7 +772,7 @@ static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr,
int num_extracted = extract_fn.num_extracted;
invariant(num_extracted <= num_to_extract);
for (int i = 0; i < num_extracted; i++) {
- remove_row_lock_from_tree(lkr, row_locks[i], mgr);
+ remove_row_lock_from_tree(lkr, row_locks[i], TXNID_ANY, mgr);
}
return num_extracted;
@@ -722,38 +839,60 @@ void locktree::escalate(lt_escalate_cb after_escalate_callback, void *after_esca
// through them and merge adjacent locks with the same txnid into
// one dominating lock and save it to a set of escalated locks.
//
- // first, find the index of the next row lock with a different txnid
+ // first, find the index of the next row lock that
+ // - belongs to a different txnid, or
+ // - belongs to several txnids, or
+ // - is a shared lock (we could potentially merge those but
+ // currently we don't)
int next_txnid_index = current_index + 1;
+
while (next_txnid_index < num_extracted &&
- extracted_buf[current_index].txnid == extracted_buf[next_txnid_index].txnid) {
+ (extracted_buf[current_index].txnid ==
+ extracted_buf[next_txnid_index].txnid) &&
+ !extracted_buf[next_txnid_index].is_shared &&
+ !extracted_buf[next_txnid_index].owners) {
next_txnid_index++;
}
// Create an escalated range for the current txnid that dominates
// each range between the current indext and the next txnid's index.
- const TXNID current_txnid = extracted_buf[current_index].txnid;
+ //const TXNID current_txnid = extracted_buf[current_index].txnid;
const DBT *escalated_left_key = extracted_buf[current_index].range.get_left_key();
const DBT *escalated_right_key = extracted_buf[next_txnid_index - 1].range.get_right_key();
// Try to find a range buffer for the current txnid. Create one if it doesn't exist.
// Then, append the new escalated range to the buffer.
- uint32_t idx;
- struct txnid_range_buffer *existing_range_buffer;
- int r = range_buffers.find_zero<TXNID, txnid_range_buffer::find_by_txnid>(
- current_txnid,
- &existing_range_buffer,
- &idx
- );
- if (r == DB_NOTFOUND) {
- struct txnid_range_buffer *XMALLOC(new_range_buffer);
- new_range_buffer->txnid = current_txnid;
- new_range_buffer->buffer.create();
- new_range_buffer->buffer.append(escalated_left_key, escalated_right_key);
- range_buffers.insert_at(new_range_buffer, idx);
- } else {
- invariant_zero(r);
- invariant(existing_range_buffer->txnid == current_txnid);
- existing_range_buffer->buffer.append(escalated_left_key, escalated_right_key);
+ // (If a lock is shared by multiple txnids, append it each of txnid's lists)
+ TxnidVector *owners_ptr;
+ TxnidVector singleton_owner;
+ if (extracted_buf[current_index].owners)
+ owners_ptr = extracted_buf[current_index].owners;
+ else {
+ singleton_owner.insert(extracted_buf[current_index].txnid);
+ owners_ptr = &singleton_owner;
+ }
+
+ for (auto cur_txnid : *owners_ptr ) {
+ uint32_t idx;
+ struct txnid_range_buffer *existing_range_buffer;
+ int r = range_buffers.find_zero<TXNID, txnid_range_buffer::find_by_txnid>(
+ cur_txnid,
+ &existing_range_buffer,
+ &idx
+ );
+ if (r == DB_NOTFOUND) {
+ struct txnid_range_buffer *XMALLOC(new_range_buffer);
+ new_range_buffer->txnid = cur_txnid;
+ new_range_buffer->buffer.create();
+ new_range_buffer->buffer.append(escalated_left_key, escalated_right_key,
+ !extracted_buf[current_index].is_shared);
+ range_buffers.insert_at(new_range_buffer, idx);
+ } else {
+ invariant_zero(r);
+ invariant(existing_range_buffer->txnid == cur_txnid);
+ existing_range_buffer->buffer.append(escalated_left_key, escalated_right_key,
+ !extracted_buf[current_index].is_shared);
+ }
}
current_index = next_txnid_index;
@@ -761,6 +900,7 @@ void locktree::escalate(lt_escalate_cb after_escalate_callback, void *after_esca
// destroy the ranges copied during the extraction
for (int i = 0; i < num_extracted; i++) {
+ delete extracted_buf[i].owners;
extracted_buf[i].range.destroy();
}
}
@@ -768,6 +908,12 @@ void locktree::escalate(lt_escalate_cb after_escalate_callback, void *after_esca
// Rebuild the locktree from each range in each range buffer,
// then notify higher layers that the txnid's locks have changed.
+ //
+ // (shared locks: if a lock was initially shared between transactions TRX1,
+ // TRX2, etc, we will now try to acquire it acting on behalf on TRX1, on
+ // TRX2, etc. This will succeed and an identical shared lock will be
+ // constructed)
+
invariant(m_rangetree->is_empty());
const size_t num_range_buffers = range_buffers.size();
for (size_t i = 0; i < num_range_buffers; i++) {
@@ -781,7 +927,9 @@ void locktree::escalate(lt_escalate_cb after_escalate_callback, void *after_esca
while (iter.current(&rec)) {
keyrange range;
range.create(rec.get_left_key(), rec.get_right_key());
- row_lock lock = { .range = range, .txnid = current_txnid };
+ row_lock lock = { .range = range, .txnid = current_txnid,
+ .is_shared= !rec.get_exclusive_flag(),
+ .owners= nullptr };
insert_row_lock_into_tree(&lkr, lock, m_mgr);
iter.next();
}
diff --git a/utilities/transactions/range_locking/locktree/locktree.h b/utilities/transactions/range_locking/locktree/locktree.h
index 5ff4f7449..e7c909be0 100644
--- a/utilities/transactions/range_locking/locktree/locktree.h
+++ b/utilities/transactions/range_locking/locktree/locktree.h
@@ -339,7 +339,10 @@ namespace toku {
// since the lock_request object is opaque
struct lt_lock_request_info *get_lock_request_info(void);
- typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right, TXNID txnid);
+ typedef void (*dump_callback)(void *cdata,
+ const DBT *left, const DBT *right,
+ TXNID txnid, bool is_shared,
+ TxnidVector *owners);
void dump_locks(void *cdata, dump_callback cb);
private:
locktree_manager *m_mgr;
@@ -360,6 +363,12 @@ namespace toku {
void *m_userdata;
struct lt_lock_request_info m_lock_request_info;
+ // psergey-todo:
+ // Each transaction also keeps a list of ranges it has locked.
+ // So, when a transaction is running in STO mode, two identical
+ // lists are kept: the STO lock list and transaction's owned locks
+ // list. Why can't we do with just one list?
+
// The following fields and members prefixed with "sto_" are for
// the single txnid optimization, intended to speed up the case
// when only one transaction is using the locktree. If we know
@@ -453,7 +462,8 @@ namespace toku {
// effect: append a range to the sto buffer
// requires: m_sto_txnid is valid
- void sto_append(const DBT *left_key, const DBT *right_key);
+ void sto_append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request);
// effect: ends the single txnid optimization, releaseing any memory
// stored in the sto buffer, notifying the tracker, and
@@ -494,7 +504,8 @@ namespace toku {
// back to zero.
// returns: true if the lock was acquired for this txnid
bool sto_try_acquire(void *prepared_lkr, TXNID txnid,
- const DBT *left_key, const DBT *right_key);
+ const DBT *left_key, const DBT *right_key,
+ bool is_write_request);
// Effect:
// Provides a hook for a helgrind suppression.
@@ -513,7 +524,7 @@ namespace toku {
int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
const DBT *left_key, const DBT *right_key,
- txnid_set *conflicts);
+ bool is_write_request, txnid_set *conflicts);
int acquire_lock(bool is_write_request, TXNID txnid,
const DBT *left_key, const DBT *right_key,
diff --git a/utilities/transactions/range_locking/locktree/range_buffer.cc b/utilities/transactions/range_locking/locktree/range_buffer.cc
index d1f14fc4a..eab374945 100644
--- a/utilities/transactions/range_locking/locktree/range_buffer.cc
+++ b/utilities/transactions/range_locking/locktree/range_buffer.cc
@@ -66,7 +66,9 @@ namespace toku {
return right_neg_inf || right_pos_inf;
}
- void range_buffer::record_header::init(const DBT *left_key, const DBT *right_key) {
+ void range_buffer::record_header::init(const DBT *left_key, const DBT *right_key,
+ bool is_exclusive) {
+ is_exclusive_lock= is_exclusive;
left_neg_inf = left_key == toku_dbt_negative_infinity();
left_pos_inf = left_key == toku_dbt_positive_infinity();
left_key_size = toku_dbt_is_infinite(left_key) ? 0 : left_key->size;
@@ -186,15 +188,16 @@ namespace toku {
_num_ranges = 0;
}
- void range_buffer::append(const DBT *left_key, const DBT *right_key) {
+ void range_buffer::append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request) {
// if the keys are equal, then only one copy is stored.
if (toku_dbt_equals(left_key, right_key)) {
invariant(left_key->size <= MAX_KEY_SIZE);
- append_point(left_key);
+ append_point(left_key, is_write_request);
} else {
invariant(left_key->size <= MAX_KEY_SIZE);
invariant(right_key->size <= MAX_KEY_SIZE);
- append_range(left_key, right_key);
+ append_range(left_key, right_key, is_write_request);
}
_num_ranges++;
}
@@ -215,12 +218,13 @@ namespace toku {
_arena.destroy();
}
- void range_buffer::append_range(const DBT *left_key, const DBT *right_key) {
+ void range_buffer::append_range(const DBT *left_key, const DBT *right_key,
+ bool is_exclusive) {
size_t record_length = sizeof(record_header) + left_key->size + right_key->size;
char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
record_header h;
- h.init(left_key, right_key);
+ h.init(left_key, right_key, is_exclusive);
// serialize the header
memcpy(buf, &h, sizeof(record_header));
@@ -238,12 +242,12 @@ namespace toku {
}
}
- void range_buffer::append_point(const DBT *key) {
+ void range_buffer::append_point(const DBT *key, bool is_exclusive) {
size_t record_length = sizeof(record_header) + key->size;
char *buf = reinterpret_cast<char *>(_arena.malloc_from_arena(record_length));
record_header h;
- h.init(key, nullptr);
+ h.init(key, nullptr, is_exclusive);
// serialize the header
memcpy(buf, &h, sizeof(record_header));
diff --git a/utilities/transactions/range_locking/locktree/range_buffer.h b/utilities/transactions/range_locking/locktree/range_buffer.h
index 9bc02dc22..e8869fae5 100644
--- a/utilities/transactions/range_locking/locktree/range_buffer.h
+++ b/utilities/transactions/range_locking/locktree/range_buffer.h
@@ -77,12 +77,14 @@ namespace toku {
bool right_neg_inf;
uint16_t left_key_size;
uint16_t right_key_size;
+ bool is_exclusive_lock;
bool left_is_infinite(void) const;
bool right_is_infinite(void) const;
- void init(const DBT *left_key, const DBT *right_key);
+ void init(const DBT *left_key, const DBT *right_key,
+ bool is_exclusive);
};
// PORT static_assert(sizeof(record_header) == 8, "record header format is off");
@@ -109,6 +111,10 @@ namespace toku {
// how big is this record? this tells us where the next record is
size_t size(void) const;
+ bool get_exclusive_flag() const {
+ return _header.is_exclusive_lock;
+ }
+
// populate a record header and point our DBT's
// buffers into ours if they are not infinite.
void deserialize(const char *buf);
@@ -145,7 +151,8 @@ namespace toku {
// append a left/right key range to the buffer.
// if the keys are equal, then only one copy is stored.
- void append(const DBT *left_key, const DBT *right_key);
+ void append(const DBT *left_key, const DBT *right_key,
+ bool is_write_request=false);
// is this range buffer empty?
bool is_empty(void) const;
@@ -162,11 +169,11 @@ namespace toku {
memarena _arena;
int _num_ranges;
- void append_range(const DBT *left_key, const DBT *right_key);
+ void append_range(const DBT *left_key, const DBT *right_key, bool is_write_request);
// append a point to the buffer. this is the space/time saving
// optimization for key ranges where left == right.
- void append_point(const DBT *key);
+ void append_point(const DBT *key, bool is_write_request);
};
} /* namespace toku */
diff --git a/utilities/transactions/range_locking/locktree/treenode.cc b/utilities/transactions/range_locking/locktree/treenode.cc
index 051ec7d1c..5bf349749 100644
--- a/utilities/transactions/range_locking/locktree/treenode.cc
+++ b/utilities/transactions/range_locking/locktree/treenode.cc
@@ -64,6 +64,10 @@ void treenode::init(const comparator *cmp) {
m_is_root = false;
m_is_empty = true;
m_cmp = cmp;
+
+ m_is_shared= false;
+ m_owners= nullptr;
+
// use an adaptive mutex at each node since we expect the time the
// lock is held to be relatively short compared to a context switch.
// indeed, this improves performance at high thread counts considerably.
@@ -89,10 +93,11 @@ void treenode::destroy_root(void) {
m_cmp = nullptr;
}
-void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid) {
+void treenode::set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared) {
// allocates a new copy of the range for this node
m_range.create_copy(range);
m_txnid = txnid;
+ m_is_shared= is_shared;
m_is_empty = false;
}
@@ -108,10 +113,11 @@ bool treenode::range_overlaps(const keyrange &range) {
return m_range.overlaps(*m_cmp, range);
}
-treenode *treenode::alloc(const comparator *cmp, const keyrange &range, TXNID txnid) {
+treenode *treenode::alloc(const comparator *cmp, const keyrange &range,
+ TXNID txnid, bool is_shared) {
treenode *XCALLOC(node);
node->init(cmp);
- node->set_range_and_txnid(range, txnid);
+ node->set_range_and_txnid(range, txnid, is_shared);
return node;
}
@@ -122,12 +128,31 @@ void treenode::swap_in_place(treenode *node1, treenode *node2) {
node1->m_txnid = node2->m_txnid;
node2->m_range = tmp_range;
node2->m_txnid = tmp_txnid;
+
+ bool tmp_is_shared= node1->m_is_shared;
+ node1->m_is_shared= node2->m_is_shared;
+ node2->m_is_shared= tmp_is_shared;
+}
+
+void treenode::add_shared_owner(TXNID txnid) {
+ assert(m_is_shared);
+ if (m_txnid != TXNID_SHARED) {
+ m_owners= new TxnidVector;
+ m_owners->insert(m_txnid);
+ m_txnid= TXNID_SHARED;
+ }
+ m_owners->insert(txnid);
}
void treenode::free(treenode *node) {
// destroy the range, freeing any copied keys
node->m_range.destroy();
+ if (node->m_owners) {
+ delete node->m_owners;
+ node->m_owners = nullptr; // need this?
+ }
+
// the root is simply marked as empty.
if (node->is_root()) {
// PORT toku_mutex_assert_locked(&node->m_mutex);
@@ -189,7 +214,7 @@ void treenode::traverse_overlaps(const keyrange &range, F *function) {
if (c == keyrange::comparison::EQUALS) {
// Doesn't matter if fn wants to keep going, there
// is nothing left, so return.
- function->fn(m_range, m_txnid);
+ function->fn(m_range, m_txnid, m_is_shared, m_owners);
return;
}
@@ -204,7 +229,7 @@ void treenode::traverse_overlaps(const keyrange &range, F *function) {
}
if (c == keyrange::comparison::OVERLAPS) {
- bool keep_going = function->fn(m_range, m_txnid);
+ bool keep_going = function->fn(m_range, m_txnid, m_is_shared, m_owners);
if (!keep_going) {
return;
}
@@ -221,29 +246,35 @@ void treenode::traverse_overlaps(const keyrange &range, F *function) {
}
}
-void treenode::insert(const keyrange &range, TXNID txnid) {
+void treenode::insert(const keyrange &range, TXNID txnid, bool is_shared) {
// choose a child to check. if that child is null, then insert the new node there.
// otherwise recur down that child's subtree
keyrange::comparison c = range.compare(*m_cmp, m_range);
if (c == keyrange::comparison::LESS_THAN) {
treenode *left_child = lock_and_rebalance_left();
if (left_child == nullptr) {
- left_child = treenode::alloc(m_cmp, range, txnid);
+ left_child = treenode::alloc(m_cmp, range, txnid, is_shared);
m_left_child.set(left_child);
} else {
- left_child->insert(range, txnid);
+ left_child->insert(range, txnid, is_shared);
left_child->mutex_unlock();
}
- } else {
- invariant(c == keyrange::comparison::GREATER_THAN);
+ } else if (c == keyrange::comparison::GREATER_THAN) {
+ //invariant(c == keyrange::comparison::GREATER_THAN);
treenode *right_child = lock_and_rebalance_right();
if (right_child == nullptr) {
- right_child = treenode::alloc(m_cmp, range, txnid);
+ right_child = treenode::alloc(m_cmp, range, txnid, is_shared);
m_right_child.set(right_child);
} else {
- right_child->insert(range, txnid);
+ right_child->insert(range, txnid, is_shared);
right_child->mutex_unlock();
}
+ } else if (c == keyrange::comparison::EQUALS) {
+ invariant(is_shared);
+ invariant(m_is_shared);
+ add_shared_owner(txnid);
+ } else {
+ invariant(0);
}
}
@@ -337,19 +368,38 @@ void treenode::recursive_remove(void) {
treenode::free(this);
}
-treenode *treenode::remove(const keyrange &range) {
+void treenode::remove_shared_owner(TXNID txnid) {
+ m_owners->erase(txnid);
+ /* if there is just one owner left, move it to m_txnid */
+ if (m_owners->size() == 1)
+ {
+ m_txnid = * m_owners->begin();
+ delete m_owners;
+ m_owners = nullptr;
+ }
+}
+
+treenode *treenode::remove(const keyrange &range, TXNID txnid) {
treenode *child;
// if the range is equal to this node's range, then just remove
// the root of this subtree. otherwise search down the tree
// in either the left or right children.
keyrange::comparison c = range.compare(*m_cmp, m_range);
switch (c) {
- case keyrange::comparison::EQUALS:
- return remove_root_of_subtree();
+ case keyrange::comparison::EQUALS: {
+ // if we are the only owners, remove. Otherwise, just remove
+ // us from the owners list.
+ if (txnid != TXNID_ANY && has_multiple_owners()) {
+ remove_shared_owner(txnid);
+ return this;
+ } else {
+ return remove_root_of_subtree();
+ }
+ }
case keyrange::comparison::LESS_THAN:
child = m_left_child.get_locked();
invariant_notnull(child);
- child = child->remove(range);
+ child = child->remove(range, txnid);
// unlock the child if there still is one.
// regardless, set the right child pointer
@@ -361,7 +411,7 @@ treenode *treenode::remove(const keyrange &range) {
case keyrange::comparison::GREATER_THAN:
child = m_right_child.get_locked();
invariant_notnull(child);
- child = child->remove(range);
+ child = child->remove(range, txnid);
// unlock the child if there still is one.
// regardless, set the right child pointer
diff --git a/utilities/transactions/range_locking/locktree/treenode.h b/utilities/transactions/range_locking/locktree/treenode.h
index a4b01f1cc..f23324f03 100644
--- a/utilities/transactions/range_locking/locktree/treenode.h
+++ b/utilities/transactions/range_locking/locktree/treenode.h
@@ -92,7 +92,7 @@ public:
void destroy_root(void);
// effect: sets the txnid and copies the given range for this node
- void set_range_and_txnid(const keyrange &range, TXNID txnid);
+ void set_range_and_txnid(const keyrange &range, TXNID txnid, bool is_shared);
// returns: true iff this node is marked as empty
bool is_empty(void);
@@ -127,12 +127,12 @@ public:
// effect: inserts the given range and txnid into a subtree, recursively
// requires: range does not overlap with any node below the subtree
- void insert(const keyrange &range, TXNID txnid);
+ void insert(const keyrange &range, TXNID txnid, bool is_shared);
// effect: removes the given range from the subtree
// requires: range exists in the subtree
// returns: the root of the resulting subtree
- treenode *remove(const keyrange &range);
+ treenode *remove(const keyrange &range, TXNID txnid);
// effect: removes this node and all of its children, recursively
// requires: every node at and below this node is unlocked
@@ -166,13 +166,30 @@ private:
// destroyed, it frees the memory associated with whatever range
// it has at the time of destruction.
keyrange m_range;
+
+ void remove_shared_owner(TXNID txnid);
+
+ bool has_multiple_owners() { return (m_txnid == TXNID_SHARED); }
+
+private:
+ // Owner transaction id.
+ // A value of TXNID_SHARED means this node has multiple owners
TXNID m_txnid;
+ // If true, this lock is a non-exclusive lock, and it can have either
+ // one or several owners.
+ bool m_is_shared;
+
+ // List of the owners, or nullptr if there's just one owner.
+ TxnidVector *m_owners;
+
// two child pointers
child_ptr m_left_child;
child_ptr m_right_child;
// comparator for ranges
+ // psergey-todo: Is there any sense to store the comparator in each tree
+ // node?
const comparator *m_cmp;
// marked for the root node. the root node is never free()'d
@@ -185,6 +202,10 @@ private:
// effect: initializes an empty node with the given comparator
void init(const comparator *cmp);
+ // requires: this is a shared node (m_is_shared==true)
+ // effect: another transaction is added as an owner.
+ void add_shared_owner(TXNID txnid);
+
// requires: *parent is initialized to something meaningful.
// requires: subtree is non-empty
// returns: the leftmost child of the given subtree
@@ -230,7 +251,8 @@ private:
treenode *maybe_rebalance(void);
// returns: allocated treenode populated with a copy of the range and txnid
- static treenode *alloc(const comparator *cmp, const keyrange &range, TXNID txnid);
+ static treenode *alloc(const comparator *cmp, const keyrange &range,
+ TXNID txnid, bool is_shared);
// requires: node is a locked root node, or an unlocked non-root node
static void free(treenode *node);
diff --git a/utilities/transactions/range_locking/portability/txn_subst.h b/utilities/transactions/range_locking/portability/txn_subst.h
index 3882eb1c5..58c3fced0 100644
--- a/utilities/transactions/range_locking/portability/txn_subst.h
+++ b/utilities/transactions/range_locking/portability/txn_subst.h
@@ -3,8 +3,24 @@
//
#pragma once
+#include <set>
#include "util/omt.h"
typedef uint64_t TXNID;
#define TXNID_NONE ((TXNID)0)
+// A set of transactions
+// (TODO: consider using class toku::txnid_set. The reason for using STL
+// container was that its API is easier)
+class TxnidVector : public std::set<TXNID> {
+public:
+ bool contains(TXNID txnid) { return find(txnid) != end(); }
+};
+
+// A value for lock structures with a meaning "the lock is owned by multiple
+// transactions (and one has to check the TxnidVector to get their ids)
+#define TXNID_SHARED (TXNID(-1))
+
+// Auxiliary value meaning "any transaction id will do". No real transaction
+// may have this is as id.
+#define TXNID_ANY (TXNID(-2))
diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc
index bd81dbdfc..fa8e49476 100644
--- a/utilities/transactions/transaction_lock_mgr.cc
+++ b/utilities/transactions/transaction_lock_mgr.cc
@@ -820,7 +820,7 @@ Status RangeLockMgr::TryRangeLock(PessimisticTransaction* txn,
uint32_t column_family_id,
const Endpoint &start_endp,
const Endpoint &end_endp,
- bool /*exclusive*/) {
+ bool exclusive) {
toku::lock_request request;
request.create(mutex_factory_);
DBT start_key_dbt, end_key_dbt;
@@ -842,7 +842,8 @@ Status RangeLockMgr::TryRangeLock(PessimisticTransaction* txn,
auto lt= get_locktree_by_cfid(column_family_id);
request.set(lt, (TXNID)txn, &start_key_dbt, &end_key_dbt,
- toku::lock_request::WRITE, false /* not a big txn */,
+ exclusive? toku::lock_request::WRITE: toku::lock_request::READ,
+ false /* not a big txn */,
(void*)wait_txn_id);
uint64_t killed_time_msec = 0; // TODO: what should this have?
@@ -1147,25 +1148,25 @@ RangeLockMgr::~RangeLockMgr() {
ltm_.destroy();
}
-uint64_t RangeLockMgr::get_escalation_count() {
+RangeLockMgrHandle::Counters RangeLockMgr::GetStatus() {
LTM_STATUS_S ltm_status_test;
ltm_.get_status(<m_status_test);
+ Counters res;
// Searching status variable by its string name is how Toku's unit tests
// do it (why didn't they make LTM_ESCALATION_COUNT constant visible?)
- TOKU_ENGINE_STATUS_ROW key_status = NULL;
// lookup keyname in status
- for (int i = 0; ; i++) {
+ for (int i = 0; i < LTM_STATUS_S::LTM_STATUS_NUM_ROWS; i++) {
TOKU_ENGINE_STATUS_ROW status = <m_status_test.status[i];
- if (status->keyname == NULL)
- break;
if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
- key_status = status;
- break;
+ res.escalation_count = status->value.num;
+ continue;
+ }
+ if (strcmp(status->keyname, "LTM_SIZE_CURRENT") == 0) {
+ res.current_lock_memory = status->value.num;
}
}
- assert(key_status);
- return key_status->value.num;
+ return res;
}
void RangeLockMgr::AddColumnFamily(const ColumnFamilyHandle *cfh) {
@@ -1254,13 +1255,15 @@ struct LOCK_PRINT_CONTEXT {
};
static
-void push_into_lock_status_data(void* param, const DBT *left,
- const DBT *right, TXNID txnid_arg) {
+void push_into_lock_status_data(void* param,
+ const DBT *left, const DBT *right,
+ TXNID txnid_arg, bool is_shared,
+ TxnidVector *owners) {
struct LOCK_PRINT_CONTEXT *ctx= (LOCK_PRINT_CONTEXT*)param;
struct KeyLockInfo info;
info.key.append((const char*)left->data, (size_t)left->size);
- info.exclusive= true;
+ info.exclusive= !is_shared;
if (!(left->size == right->size &&
!memcmp(left->data, right->data, left->size)))
@@ -1270,8 +1273,15 @@ void push_into_lock_status_data(void* param, const DBT *left,
info.key2.append((const char*)right->data, right->size);
}
- TXNID txnid= ((PessimisticTransaction*)txnid_arg)->GetID();
- info.ids.push_back(txnid);
+ if (txnid_arg != TXNID_SHARED) {
+ TXNID txnid= ((PessimisticTransaction*)txnid_arg)->GetID();
+ info.ids.push_back(txnid);
+ } else {
+ for (auto it : *owners) {
+ TXNID real_id= ((PessimisticTransaction*)it)->GetID();
+ info.ids.push_back(real_id);
+ }
+ }
ctx->data->insert({ctx->cfh_id, info});
}
diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h
index 33ebbeb03..4bb66febf 100644
--- a/utilities/transactions/transaction_lock_mgr.h
+++ b/utilities/transactions/transaction_lock_mgr.h
@@ -240,7 +240,11 @@ class RangeLockMgr :
return ltm_.set_max_lock_memory(max_lock_memory);
}
- uint64_t get_escalation_count() override;
+ size_t get_max_lock_memory() {
+ return ltm_.get_max_lock_memory();
+ }
+
+ Counters GetStatus() override;
LockStatusData GetLockStatusData() override;
1
0

[Commits] be83f81: MDEV-19919 Assertion `!prebuilt->index->is_primary()' failed
by IgorBabaev 23 Nov '19
by IgorBabaev 23 Nov '19
23 Nov '19
revision-id: be83f81fbf8fea75955e047c13501443bb953452 (mariadb-10.4.10-23-gbe83f81)
parent(s): 6cedb671e99038f1a10e0d8504f835aaabed9780
author: Igor Babaev
committer: Igor Babaev
timestamp: 2019-11-22 19:11:58 -0800
message:
MDEV-19919 Assertion `!prebuilt->index->is_primary()' failed
in row_search_idx_cond_check
For a single table query with ORDER BY and several sargable range
conditions the optimizer may choose an execution plan that employs
a rowid filter. In this case it is important to build the filter before
calling the function JOIN_TAB::sort_table() that creates sort index
for the result set, because when this is index created the filter has
to be already filled. After the sort index has been created the
filter must be deactivated. If not to do this the innodb function
row_search_idx_cond_check() is getting confused when it has to read rows
from the created sort index by using ha_rnd_pos().
The order of actions mentioned above is needed also when processing a
join query if sorting is performed for the first non constant table in
the chosen execution plan.
---
mysql-test/main/rowid_filter_innodb.result | 35 ++++++++++++++++++++++++++++++
mysql-test/main/rowid_filter_innodb.test | 33 ++++++++++++++++++++++++++++
sql/sql_select.cc | 8 +++++--
3 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/mysql-test/main/rowid_filter_innodb.result b/mysql-test/main/rowid_filter_innodb.result
index 390c783..37e32f0 100644
--- a/mysql-test/main/rowid_filter_innodb.result
+++ b/mysql-test/main/rowid_filter_innodb.result
@@ -2210,3 +2210,38 @@ a b
drop table t1;
set optimizer_switch=@save_optimizer_switch;
SET SESSION STORAGE_ENGINE=DEFAULT;
+#
+# MDEV-19919: use of rowid filter for innodb table + ORDER BY
+#
+SET @stats.save= @@innodb_stats_persistent;
+SET GLOBAL innodb_stats_persistent= ON;
+CREATE TABLE t1 (
+a INT,
+b VARCHAR(10),
+c VARCHAR(1024),
+KEY (b),
+KEY (c)
+) ENGINE=InnoDB;
+INSERT INTO t1 VALUES
+(1,'w','z'), (1,'X','o'), (1,'q','c'), (5,'w','c'), (2,'j','m'),
+(2,'Q','s'), (9,'e','J'), (2,'p','W'), (9,'o','F'), (2,'g','S'),
+(1,'Y','a'), (NULL,'Y','p'), (NULL,'s','x'), (NULL,'i','S'),
+(1,'l','q'), (7,'r','e'), (4,'b','h'), (NULL,'E','c'),
+(NULL,'M','a'), (3,'e','X'), (NULL,'p','r'), (9,'e','i'),
+(3,'g','x'), (2,'h','y');
+ANALYZE TABLE t1;
+Table Op Msg_type Msg_text
+test.t1 analyze status Engine-independent statistics collected
+test.t1 analyze status OK
+EXPLAIN EXTENDED
+SELECT a FROM t1 WHERE c < 'k' AND b > 't' ORDER BY a;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 range|filter b,c b|c 13|1027 NULL 5 (42%) 41.67 Using index condition; Using where; Using filesort; Using rowid filter
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`c` < 'k' and `test`.`t1`.`b` > 't' order by `test`.`t1`.`a`
+SELECT a FROM t1 WHERE c < 'k' AND b > 't' ORDER BY a;
+a
+1
+5
+DROP TABLE t1;
+SET GLOBAL innodb_stats_persistent= @stats.save;
diff --git a/mysql-test/main/rowid_filter_innodb.test b/mysql-test/main/rowid_filter_innodb.test
index 240cd92..4a6c431 100644
--- a/mysql-test/main/rowid_filter_innodb.test
+++ b/mysql-test/main/rowid_filter_innodb.test
@@ -96,3 +96,36 @@ drop table t1;
set optimizer_switch=@save_optimizer_switch;
SET SESSION STORAGE_ENGINE=DEFAULT;
+
+--echo #
+--echo # MDEV-19919: use of rowid filter for innodb table + ORDER BY
+--echo #
+
+SET @stats.save= @@innodb_stats_persistent;
+SET GLOBAL innodb_stats_persistent= ON;
+
+CREATE TABLE t1 (
+ a INT,
+ b VARCHAR(10),
+ c VARCHAR(1024),
+ KEY (b),
+ KEY (c)
+) ENGINE=InnoDB;
+
+INSERT INTO t1 VALUES
+ (1,'w','z'), (1,'X','o'), (1,'q','c'), (5,'w','c'), (2,'j','m'),
+ (2,'Q','s'), (9,'e','J'), (2,'p','W'), (9,'o','F'), (2,'g','S'),
+ (1,'Y','a'), (NULL,'Y','p'), (NULL,'s','x'), (NULL,'i','S'),
+ (1,'l','q'), (7,'r','e'), (4,'b','h'), (NULL,'E','c'),
+ (NULL,'M','a'), (3,'e','X'), (NULL,'p','r'), (9,'e','i'),
+ (3,'g','x'), (2,'h','y');
+
+ANALYZE TABLE t1;
+
+EXPLAIN EXTENDED
+SELECT a FROM t1 WHERE c < 'k' AND b > 't' ORDER BY a;
+
+SELECT a FROM t1 WHERE c < 'k' AND b > 't' ORDER BY a;
+
+DROP TABLE t1;
+SET GLOBAL innodb_stats_persistent= @stats.save;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index a05c2f8..1ee2a17 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -21101,11 +21101,12 @@ int join_init_read_record(JOIN_TAB *tab)
*/
if (tab->distinct && tab->remove_duplicates()) // Remove duplicates.
return 1;
- if (tab->filesort && tab->sort_table()) // Sort table.
- return 1;
tab->build_range_rowid_filter_if_needed();
+ if (tab->filesort && tab->sort_table()) // Sort table.
+ return 1;
+
DBUG_EXECUTE_IF("kill_join_init_read_record",
tab->join->thd->set_killed(KILL_QUERY););
if (tab->select && tab->select->quick && tab->select->quick->reset())
@@ -21165,6 +21166,9 @@ JOIN_TAB::sort_table()
JOIN::ordered_index_order_by :
JOIN::ordered_index_group_by));
rc= create_sort_index(join->thd, join, this, NULL);
+ /* Disactivate rowid filter if it was used when creating sort index */
+ if (rowid_filter)
+ table->file->rowid_filter_is_active= false;
return (rc != 0);
}
1
0

[Commits] 7b2e5218495: Support Create_time and Update_time in MyRocks table status
by psergey 18 Nov '19
by psergey 18 Nov '19
18 Nov '19
revision-id: 7b2e521849593b46a62ca3f32097c35a4bbaab18 (fb-prod201903-144-g7b2e5218495)
parent(s): d97c0c628e5dc60abd725f6a7120a8d87b09321e
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-19 01:14:44 +0300
message:
Support Create_time and Update_time in MyRocks table status
(Variant #7 of the patch) The implementation follows InnoDB:
- Create_time is taken from the .frm file creation timestamp
- Update_time is maintained in memory only and is set NULL on server
restart
---
mysql-test/suite/rocksdb/include/bulk_load.inc | 4 +-
.../suite/rocksdb/include/bulk_load_unsorted.inc | 4 +-
mysql-test/suite/rocksdb/r/bulk_load.result | 12 +--
mysql-test/suite/rocksdb/r/bulk_load_rev_cf.result | 12 +--
.../rocksdb/r/bulk_load_rev_cf_and_data.result | 12 +--
.../suite/rocksdb/r/bulk_load_rev_data.result | 12 +--
.../suite/rocksdb/r/bulk_load_unsorted.result | 12 +--
.../suite/rocksdb/r/bulk_load_unsorted_rev.result | 12 +--
mysql-test/suite/rocksdb/r/issue255.result | 16 +--
mysql-test/suite/rocksdb/r/rocksdb.result | 6 +-
.../suite/rocksdb/r/show_table_status.result | 118 ++++++++++++++++++++-
mysql-test/suite/rocksdb/r/truncate_table.result | 8 +-
mysql-test/suite/rocksdb/t/issue255.test | 17 +--
mysql-test/suite/rocksdb/t/rocksdb.test | 4 +-
mysql-test/suite/rocksdb/t/show_table_status.test | 117 +++++++++++++++++++-
mysql-test/suite/rocksdb/t/truncate_table.test | 8 +-
storage/rocksdb/ha_rocksdb.cc | 36 +++++++
storage/rocksdb/rdb_datadic.cc | 20 ++++
storage/rocksdb/rdb_datadic.h | 20 +++-
19 files changed, 372 insertions(+), 78 deletions(-)
diff --git a/mysql-test/suite/rocksdb/include/bulk_load.inc b/mysql-test/suite/rocksdb/include/bulk_load.inc
index 1b79825e507..7e163602202 100644
--- a/mysql-test/suite/rocksdb/include/bulk_load.inc
+++ b/mysql-test/suite/rocksdb/include/bulk_load.inc
@@ -121,12 +121,12 @@ set rocksdb_bulk_load=0;
--remove_file $file
# Make sure row count index stats are correct
---replace_column 6 # 7 # 8 # 9 #
+--replace_column 6 # 7 # 8 # 9 # 12 # 13 #
SHOW TABLE STATUS WHERE name LIKE 't%';
ANALYZE TABLE t1, t2, t3;
---replace_column 6 # 7 # 8 # 9 #
+--replace_column 6 # 7 # 8 # 9 # 12 # 13 #
SHOW TABLE STATUS WHERE name LIKE 't%';
# Make sure all the data is there.
diff --git a/mysql-test/suite/rocksdb/include/bulk_load_unsorted.inc b/mysql-test/suite/rocksdb/include/bulk_load_unsorted.inc
index 5cdc76a32d4..812af0401aa 100644
--- a/mysql-test/suite/rocksdb/include/bulk_load_unsorted.inc
+++ b/mysql-test/suite/rocksdb/include/bulk_load_unsorted.inc
@@ -119,12 +119,12 @@ set rocksdb_bulk_load=0;
--remove_file $file
# Make sure row count index stats are correct
---replace_column 6 # 7 # 8 # 9 #
+--replace_column 6 # 7 # 8 # 9 # 12 # 13 #
SHOW TABLE STATUS WHERE name LIKE 't%';
ANALYZE TABLE t1, t2, t3;
---replace_column 6 # 7 # 8 # 9 #
+--replace_column 6 # 7 # 8 # 9 # 12 # 13 #
SHOW TABLE STATUS WHERE name LIKE 't%';
# Make sure all the data is there.
diff --git a/mysql-test/suite/rocksdb/r/bulk_load.result b/mysql-test/suite/rocksdb/r/bulk_load.result
index a36f99a7619..76db28e66bd 100644
--- a/mysql-test/suite/rocksdb/r/bulk_load.result
+++ b/mysql-test/suite/rocksdb/r/bulk_load.result
@@ -38,9 +38,9 @@ pk a b
set rocksdb_bulk_load=0;
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
ANALYZE TABLE t1, t2, t3;
Table Op Msg_type Msg_text
test.t1 analyze status OK
@@ -48,9 +48,9 @@ test.t2 analyze status OK
test.t3 analyze status OK
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
select count(pk) from t1;
count(pk)
5000000
diff --git a/mysql-test/suite/rocksdb/r/bulk_load_rev_cf.result b/mysql-test/suite/rocksdb/r/bulk_load_rev_cf.result
index b5d3e252c5d..ae363f7ec0c 100644
--- a/mysql-test/suite/rocksdb/r/bulk_load_rev_cf.result
+++ b/mysql-test/suite/rocksdb/r/bulk_load_rev_cf.result
@@ -38,9 +38,9 @@ pk a b
set rocksdb_bulk_load=0;
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
ANALYZE TABLE t1, t2, t3;
Table Op Msg_type Msg_text
test.t1 analyze status OK
@@ -48,9 +48,9 @@ test.t2 analyze status OK
test.t3 analyze status OK
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
select count(pk) from t1;
count(pk)
5000000
diff --git a/mysql-test/suite/rocksdb/r/bulk_load_rev_cf_and_data.result b/mysql-test/suite/rocksdb/r/bulk_load_rev_cf_and_data.result
index f46acd41080..dd8dd7e60a8 100644
--- a/mysql-test/suite/rocksdb/r/bulk_load_rev_cf_and_data.result
+++ b/mysql-test/suite/rocksdb/r/bulk_load_rev_cf_and_data.result
@@ -38,9 +38,9 @@ pk a b
set rocksdb_bulk_load=0;
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
ANALYZE TABLE t1, t2, t3;
Table Op Msg_type Msg_text
test.t1 analyze status OK
@@ -48,9 +48,9 @@ test.t2 analyze status OK
test.t3 analyze status OK
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
select count(pk) from t1;
count(pk)
5000000
diff --git a/mysql-test/suite/rocksdb/r/bulk_load_rev_data.result b/mysql-test/suite/rocksdb/r/bulk_load_rev_data.result
index 3389968ef37..96738ae62e2 100644
--- a/mysql-test/suite/rocksdb/r/bulk_load_rev_data.result
+++ b/mysql-test/suite/rocksdb/r/bulk_load_rev_data.result
@@ -38,9 +38,9 @@ pk a b
set rocksdb_bulk_load=0;
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
ANALYZE TABLE t1, t2, t3;
Table Op Msg_type Msg_text
test.t1 analyze status OK
@@ -48,9 +48,9 @@ test.t2 analyze status OK
test.t3 analyze status OK
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_bin NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_bin NULL partitioned
select count(pk) from t1;
count(pk)
5000000
diff --git a/mysql-test/suite/rocksdb/r/bulk_load_unsorted.result b/mysql-test/suite/rocksdb/r/bulk_load_unsorted.result
index 924032549ac..87fc63af2da 100644
--- a/mysql-test/suite/rocksdb/r/bulk_load_unsorted.result
+++ b/mysql-test/suite/rocksdb/r/bulk_load_unsorted.result
@@ -70,9 +70,9 @@ LOAD DATA INFILE <input_file> INTO TABLE t3;
set rocksdb_bulk_load=0;
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL partitioned
ANALYZE TABLE t1, t2, t3;
Table Op Msg_type Msg_text
test.t1 analyze status OK
@@ -80,9 +80,9 @@ test.t2 analyze status OK
test.t3 analyze status OK
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL partitioned
select count(a) from t1;
count(a)
5000000
diff --git a/mysql-test/suite/rocksdb/r/bulk_load_unsorted_rev.result b/mysql-test/suite/rocksdb/r/bulk_load_unsorted_rev.result
index 3cc9fb8e459..8e0914f0159 100644
--- a/mysql-test/suite/rocksdb/r/bulk_load_unsorted_rev.result
+++ b/mysql-test/suite/rocksdb/r/bulk_load_unsorted_rev.result
@@ -70,9 +70,9 @@ LOAD DATA INFILE <input_file> INTO TABLE t3;
set rocksdb_bulk_load=0;
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL partitioned
ANALYZE TABLE t1, t2, t3;
Table Op Msg_type Msg_text
test.t1 analyze status OK
@@ -80,9 +80,9 @@ test.t2 analyze status OK
test.t3 analyze status OK
SHOW TABLE STATUS WHERE name LIKE 't%';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL NULL NULL NULL latin1_swedish_ci NULL partitioned
+t1 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t2 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL
+t3 ROCKSDB 10 Fixed 5000000 # # # # 0 NULL # # NULL latin1_swedish_ci NULL partitioned
select count(a) from t1;
count(a)
5000000
diff --git a/mysql-test/suite/rocksdb/r/issue255.result b/mysql-test/suite/rocksdb/r/issue255.result
index c1ce3be2276..b45b3b5afc7 100644
--- a/mysql-test/suite/rocksdb/r/issue255.result
+++ b/mysql-test/suite/rocksdb/r/issue255.result
@@ -2,7 +2,7 @@ CREATE TABLE t1 (pk BIGINT NOT NULL PRIMARY KEY AUTO_INCREMENT);
INSERT INTO t1 VALUES (5);
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 6 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 6 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 VALUES ('538647864786478647864');
Warnings:
Warning 1264 Out of range value for column 'pk' at row 1
@@ -12,7 +12,7 @@ pk
9223372036854775807
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 2 22 44 0 0 0 9223372036854775807 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed 2 22 44 0 0 0 9223372036854775807 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 VALUES ();
ERROR 23000: Duplicate entry '9223372036854775807' for key 'PRIMARY'
SELECT * FROM t1;
@@ -21,7 +21,7 @@ pk
9223372036854775807
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 9223372036854775807 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 9223372036854775807 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 VALUES ();
ERROR 23000: Duplicate entry '9223372036854775807' for key 'PRIMARY'
SELECT * FROM t1;
@@ -30,13 +30,13 @@ pk
9223372036854775807
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 9223372036854775807 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 9223372036854775807 # # NULL latin1_swedish_ci NULL
DROP TABLE t1;
CREATE TABLE t1 (pk TINYINT NOT NULL PRIMARY KEY AUTO_INCREMENT);
INSERT INTO t1 VALUES (5);
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 6 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 6 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 VALUES (1000);
Warnings:
Warning 1264 Out of range value for column 'pk' at row 1
@@ -46,7 +46,7 @@ pk
127
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 127 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 127 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 VALUES ();
ERROR 23000: Duplicate entry '127' for key 'PRIMARY'
SELECT * FROM t1;
@@ -55,7 +55,7 @@ pk
127
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 127 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 127 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 VALUES ();
ERROR 23000: Duplicate entry '127' for key 'PRIMARY'
SELECT * FROM t1;
@@ -64,5 +64,5 @@ pk
127
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB # Fixed # # # # # # 127 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB # Fixed # # # # # # 127 # # NULL latin1_swedish_ci NULL
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/r/rocksdb.result b/mysql-test/suite/rocksdb/r/rocksdb.result
index 088eb050f6f..a631d58ac69 100644
--- a/mysql-test/suite/rocksdb/r/rocksdb.result
+++ b/mysql-test/suite/rocksdb/r/rocksdb.result
@@ -1417,7 +1417,7 @@ create table t1 (i int primary key auto_increment) engine=RocksDB;
insert into t1 values (null),(null);
show table status like 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 1000 0 # 0 0 0 3 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed 1000 0 # 0 0 0 3 # # NULL latin1_swedish_ci NULL
drop table t1;
#
# Fix Issue #4: Crash when using pseudo-unique keys
@@ -2612,7 +2612,7 @@ CREATE TABLE t1(a INT AUTO_INCREMENT KEY);
INSERT INTO t1 VALUES(0),(-1),(0);
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 1000 0 0 0 0 0 3 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed 1000 0 0 0 0 0 3 # # NULL latin1_swedish_ci NULL
SELECT * FROM t1;
a
-1
@@ -2623,7 +2623,7 @@ CREATE TABLE t1(a INT AUTO_INCREMENT KEY);
INSERT INTO t1 VALUES(0),(10),(0);
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 1000 0 0 0 0 0 12 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed 1000 0 0 0 0 0 12 # # NULL latin1_swedish_ci NULL
SELECT * FROM t1;
a
1
diff --git a/mysql-test/suite/rocksdb/r/show_table_status.result b/mysql-test/suite/rocksdb/r/show_table_status.result
index 29140f045e4..c3540d9de1d 100644
--- a/mysql-test/suite/rocksdb/r/show_table_status.result
+++ b/mysql-test/suite/rocksdb/r/show_table_status.result
@@ -7,12 +7,12 @@ set global rocksdb_force_flush_memtable_now = true;
CREATE TABLE t3 (a INT, b CHAR(8), pk INT PRIMARY KEY) ENGINE=rocksdb CHARACTER SET utf8;
SHOW TABLE STATUS WHERE name IN ( 't1', 't2', 't3' );
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t2 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL NULL NULL NULL latin1_swedish_ci NULL
-t3 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL NULL NULL NULL utf8_general_ci NULL
+t1 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL # # NULL latin1_swedish_ci NULL
+t2 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL # # NULL latin1_swedish_ci NULL
+t3 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL # # NULL utf8_general_ci NULL
SHOW TABLE STATUS WHERE name LIKE 't2';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t2 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL NULL NULL NULL latin1_swedish_ci NULL
+t2 ROCKSDB 10 Fixed 1000 # # 0 0 0 NULL # # NULL latin1_swedish_ci NULL
DROP TABLE t1, t2, t3;
CREATE DATABASE `db_new..............................................end`;
USE `db_new..............................................end`;
@@ -22,3 +22,113 @@ SELECT TABLE_SCHEMA, TABLE_NAME FROM information_schema.table_statistics WHERE T
TABLE_SCHEMA db_new..............................................end
TABLE_NAME t1_new..............................................end
DROP DATABASE `db_new..............................................end`;
+#
+# MDEV-17171: Bug: RocksDB Tables do not have "Creation Date"
+#
+use test;
+create table t1 (a int) engine=rocksdb;
+select create_time is not null, update_time, check_time
+from information_schema.tables where table_schema=database() and table_name='t1';
+create_time is not null update_time check_time
+1 NULL NULL
+insert into t1 values (1);
+select create_time is not null, update_time is not null, check_time
+from information_schema.tables where table_schema=database() and table_name='t1';
+create_time is not null update_time is not null check_time
+1 1 NULL
+flush tables;
+select create_time is not null, update_time is not null, check_time
+from information_schema.tables where table_schema=database() and table_name='t1';
+create_time is not null update_time is not null check_time
+1 1 NULL
+select create_time, update_time into @create_tm, @update_tm
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+select sleep(3);
+sleep(3)
+0
+insert into t1 values (2);
+select
+create_time=@create_tm /* should not change */ ,
+timestampdiff(second, @update_tm, update_time) > 2,
+check_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+create_time=@create_tm 1
+timestampdiff(second, @update_tm, update_time) > 2 1
+check_time NULL
+#
+# Check how create_time survives ALTER TABLE.
+# First, an ALTER TABLE that re-creates the table:
+alter table t1 add b int;
+select sleep(2);
+sleep(2) 0
+select
+create_time<>@create_tm /* should change */,
+create_time IS NOT NULL,
+update_time IS NULL
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+create_time<>@create_tm 1
+create_time IS NOT NULL 1
+update_time IS NULL 1
+insert into t1 values (5,5);
+select create_time, update_time into @create_tm, @update_tm
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+select sleep(2);
+sleep(2) 0
+# Then, an in-place ALTER TABLE:
+alter table t1 add key (a);
+# create_time will change as .frm file is rewritten:
+select
+create_time=@create_tm,
+update_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+create_time=@create_tm 0
+update_time NULL
+# Check TRUNCATE TABLE
+insert into t1 values (10,10);
+select create_time, update_time into @create_tm, @update_tm
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+select sleep(2);
+sleep(2) 0
+truncate table t1;
+select
+create_time=@create_tm /* should not change */,
+update_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+create_time=@create_tm 1
+update_time NULL
+#
+# Check what is left after server restart
+#
+# Save t1's creation time
+create table t2 as
+select create_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+select sleep(2);
+sleep(2) 0
+select
+create_time=(select create_time from t2) /* should not change */,
+update_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+create_time=(select create_time from t2) 1
+update_time NULL
+drop table t1, t2;
+#
+# Check how it works for partitioned tables
+#
+create table t1 (pk int primary key) partition by hash(pk) partitions 2;
+insert into t1 values (1);
+select create_time IS NOT NULL , update_time IS NOT NULL
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+create_time IS NOT NULL 1
+update_time IS NOT NULL 1
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/r/truncate_table.result b/mysql-test/suite/rocksdb/r/truncate_table.result
index 1544256f194..79b266a2453 100644
--- a/mysql-test/suite/rocksdb/r/truncate_table.result
+++ b/mysql-test/suite/rocksdb/r/truncate_table.result
@@ -9,19 +9,19 @@ DROP TABLE t1;
CREATE TABLE t1 (a INT KEY AUTO_INCREMENT, c CHAR(8)) ENGINE=rocksdb;
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed # # # 0 0 0 1 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed # # # 0 0 0 1 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 (c) VALUES ('a'),('b'),('c');
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed # # # 0 0 0 4 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed # # # 0 0 0 4 # # NULL latin1_swedish_ci NULL
TRUNCATE TABLE t1;
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed # # # 0 0 0 1 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed # # # 0 0 0 1 # # NULL latin1_swedish_ci NULL
INSERT INTO t1 (c) VALUES ('d');
SHOW TABLE STATUS LIKE 't1';
Name Engine Version Row_format Rows Avg_row_length Data_length Max_data_length Index_length Data_free Auto_increment Create_time Update_time Check_time Collation Checksum Create_options Comment
-t1 ROCKSDB 10 Fixed # # # 0 0 0 2 NULL NULL NULL latin1_swedish_ci NULL
+t1 ROCKSDB 10 Fixed # # # 0 0 0 2 # # NULL latin1_swedish_ci NULL
SELECT a,c FROM t1;
a c
1 d
diff --git a/mysql-test/suite/rocksdb/t/issue255.test b/mysql-test/suite/rocksdb/t/issue255.test
index 370dece0c6c..686f45b4056 100644
--- a/mysql-test/suite/rocksdb/t/issue255.test
+++ b/mysql-test/suite/rocksdb/t/issue255.test
@@ -3,24 +3,25 @@
CREATE TABLE t1 (pk BIGINT NOT NULL PRIMARY KEY AUTO_INCREMENT);
INSERT INTO t1 VALUES (5);
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
INSERT INTO t1 VALUES ('538647864786478647864');
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SELECT * FROM t1;
+--replace_column 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
--error ER_DUP_ENTRY
INSERT INTO t1 VALUES ();
SELECT * FROM t1;
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
--error ER_DUP_ENTRY
INSERT INTO t1 VALUES ();
SELECT * FROM t1;
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
DROP TABLE t1;
@@ -28,24 +29,24 @@ DROP TABLE t1;
CREATE TABLE t1 (pk TINYINT NOT NULL PRIMARY KEY AUTO_INCREMENT);
INSERT INTO t1 VALUES (5);
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
INSERT INTO t1 VALUES (1000);
SELECT * FROM t1;
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
--error ER_DUP_ENTRY
INSERT INTO t1 VALUES ();
SELECT * FROM t1;
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
--error ER_DUP_ENTRY
INSERT INTO t1 VALUES ();
SELECT * FROM t1;
---replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 #
+--replace_column 3 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/t/rocksdb.test b/mysql-test/suite/rocksdb/t/rocksdb.test
index 5eff0fbf38f..7dcae569c92 100644
--- a/mysql-test/suite/rocksdb/t/rocksdb.test
+++ b/mysql-test/suite/rocksdb/t/rocksdb.test
@@ -1198,7 +1198,7 @@ drop table t1;
create table t1 (i int primary key auto_increment) engine=RocksDB;
insert into t1 values (null),(null);
---replace_column 7 #
+--replace_column 7 # 12 # 13 #
show table status like 't1';
drop table t1;
@@ -1903,11 +1903,13 @@ DROP TABLE t1;
# value is 4 while MyRocks will show it as 3.
CREATE TABLE t1(a INT AUTO_INCREMENT KEY);
INSERT INTO t1 VALUES(0),(-1),(0);
+--replace_column 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
SELECT * FROM t1;
DROP TABLE t1;
CREATE TABLE t1(a INT AUTO_INCREMENT KEY);
INSERT INTO t1 VALUES(0),(10),(0);
+--replace_column 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
SELECT * FROM t1;
DROP TABLE t1;
diff --git a/mysql-test/suite/rocksdb/t/show_table_status.test b/mysql-test/suite/rocksdb/t/show_table_status.test
index 29cc2ccfb5e..793c2a65c63 100644
--- a/mysql-test/suite/rocksdb/t/show_table_status.test
+++ b/mysql-test/suite/rocksdb/t/show_table_status.test
@@ -1,5 +1,5 @@
--source include/have_rocksdb.inc
-
+--source include/have_partition.inc
#
# SHOW TABLE STATUS statement
#
@@ -24,7 +24,7 @@ set global rocksdb_force_flush_memtable_now = true;
CREATE TABLE t3 (a INT, b CHAR(8), pk INT PRIMARY KEY) ENGINE=rocksdb CHARACTER SET utf8;
---replace_column 6 # 7 #
+--replace_column 6 # 7 # 12 # 13 #
SHOW TABLE STATUS WHERE name IN ( 't1', 't2', 't3' );
# Some statistics don't get updated as quickly. The Data_length and
@@ -48,7 +48,7 @@ set global rocksdb_force_flush_memtable_now = true;
# We expect the number of rows to be 10000. Data_len and Avg_row_len
# may vary, depending on built-in compression library.
---replace_column 6 # 7 #
+--replace_column 6 # 7 # 12 # 13 #
SHOW TABLE STATUS WHERE name LIKE 't2';
DROP TABLE t1, t2, t3;
@@ -62,3 +62,114 @@ CREATE TABLE `t1_new..............................................end`(a int) en
INSERT INTO `t1_new..............................................end` VALUES (1);
--query_vertical SELECT TABLE_SCHEMA, TABLE_NAME FROM information_schema.table_statistics WHERE TABLE_NAME = 't1_new..............................................end'
DROP DATABASE `db_new..............................................end`;
+
+--echo #
+--echo # MDEV-17171: Bug: RocksDB Tables do not have "Creation Date"
+--echo #
+
+use test;
+create table t1 (a int) engine=rocksdb;
+
+select create_time is not null, update_time, check_time
+from information_schema.tables where table_schema=database() and table_name='t1';
+
+insert into t1 values (1);
+select create_time is not null, update_time is not null, check_time
+from information_schema.tables where table_schema=database() and table_name='t1';
+
+flush tables;
+select create_time is not null, update_time is not null, check_time
+from information_schema.tables where table_schema=database() and table_name='t1';
+
+select create_time, update_time into @create_tm, @update_tm
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+select sleep(3);
+insert into t1 values (2);
+
+--vertical_results
+select
+ create_time=@create_tm /* should not change */ ,
+ timestampdiff(second, @update_tm, update_time) > 2,
+ check_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+--echo #
+--echo # Check how create_time survives ALTER TABLE.
+--echo # First, an ALTER TABLE that re-creates the table:
+alter table t1 add b int;
+select sleep(2);
+
+select
+ create_time<>@create_tm /* should change */,
+ create_time IS NOT NULL,
+ update_time IS NULL
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+insert into t1 values (5,5);
+
+select create_time, update_time into @create_tm, @update_tm
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+select sleep(2);
+--echo # Then, an in-place ALTER TABLE:
+alter table t1 add key (a);
+
+--echo # create_time will change as .frm file is rewritten:
+select
+ create_time=@create_tm,
+ update_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+--echo # Check TRUNCATE TABLE
+insert into t1 values (10,10);
+select create_time, update_time into @create_tm, @update_tm
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+select sleep(2);
+truncate table t1;
+
+select
+ create_time=@create_tm /* should not change */,
+ update_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+
+--echo #
+--echo # Check what is left after server restart
+--echo #
+
+--echo # Save t1's creation time
+create table t2 as
+select create_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+--source include/restart_mysqld.inc
+
+select sleep(2);
+select
+ create_time=(select create_time from t2) /* should not change */,
+ update_time
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+
+drop table t1, t2;
+
+--echo #
+--echo # Check how it works for partitioned tables
+--echo #
+create table t1 (pk int primary key) partition by hash(pk) partitions 2;
+insert into t1 values (1);
+
+select create_time IS NOT NULL , update_time IS NOT NULL
+from information_schema.tables
+where table_schema=database() and table_name='t1';
+drop table t1;
diff --git a/mysql-test/suite/rocksdb/t/truncate_table.test b/mysql-test/suite/rocksdb/t/truncate_table.test
index a61488654a3..1001eeb6cde 100644
--- a/mysql-test/suite/rocksdb/t/truncate_table.test
+++ b/mysql-test/suite/rocksdb/t/truncate_table.test
@@ -29,22 +29,22 @@ DROP TABLE t1;
CREATE TABLE t1 (a INT KEY AUTO_INCREMENT, c CHAR(8)) ENGINE=rocksdb;
#--replace_column 2 # 3 # 4 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 # 14 # 15 # 16 # 17 # 18 #
---replace_column 5 # 6 # 7 #
+--replace_column 5 # 6 # 7 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
INSERT INTO t1 (c) VALUES ('a'),('b'),('c');
#--replace_column 2 # 3 # 4 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 # 14 # 15 # 16 # 17 # 18 #
---replace_column 5 # 6 # 7 #
+--replace_column 5 # 6 # 7 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
TRUNCATE TABLE t1;
#--replace_column 2 # 3 # 4 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 # 14 # 15 # 16 # 17 # 18 #
---replace_column 5 # 6 # 7 #
+--replace_column 5 # 6 # 7 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
INSERT INTO t1 (c) VALUES ('d');
#--replace_column 2 # 3 # 4 # 5 # 6 # 7 # 8 # 9 # 10 # 12 # 13 # 14 # 15 # 16 # 17 # 18 #
---replace_column 5 # 6 # 7 #
+--replace_column 5 # 6 # 7 # 12 # 13 #
SHOW TABLE STATUS LIKE 't1';
--sorted_result
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index d6c89087bea..34062d07cd9 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -2456,6 +2456,8 @@ class Rdb_transaction {
bool m_is_delayed_snapshot = false;
bool m_is_two_phase = false;
+ std::unordered_set<Rdb_tbl_def*> modified_tables;
+
private:
/*
Number of write operations this transaction had when we took the last
@@ -3139,6 +3141,25 @@ class Rdb_transaction {
virtual void start_tx() = 0;
virtual void start_stmt() = 0;
+ protected:
+ // Non-virtual functions with actions to be done on transaction start and
+ // commit.
+ void on_commit() {
+ time_t tm;
+ tm = time(nullptr);
+ for (auto &it : modified_tables) {
+ it->m_update_time = tm;
+ }
+ modified_tables.clear();
+ }
+ void on_rollback() {
+ modified_tables.clear();
+ }
+ public:
+ void log_table_write_op(Rdb_tbl_def *tbl) {
+ modified_tables.insert(tbl);
+ }
+
void set_initial_savepoint() {
/*
Set the initial savepoint. If the first statement in the transaction
@@ -3322,7 +3343,9 @@ class Rdb_transaction_impl : public Rdb_transaction {
goto error;
}
+ on_commit();
error:
+ on_rollback();
/* Save the transaction object to be reused */
release_tx();
@@ -3338,6 +3361,7 @@ class Rdb_transaction_impl : public Rdb_transaction {
public:
void rollback() override {
+ on_rollback();
m_write_count = 0;
m_insert_count = 0;
m_update_count = 0;
@@ -3659,7 +3683,9 @@ class Rdb_writebatch_impl : public Rdb_transaction {
res = true;
goto error;
}
+ on_commit();
error:
+ on_rollback();
reset();
m_write_count = 0;
@@ -3691,6 +3717,7 @@ class Rdb_writebatch_impl : public Rdb_transaction {
}
void rollback() override {
+ on_rollback();
m_write_count = 0;
m_insert_count = 0;
m_update_count = 0;
@@ -10177,6 +10204,8 @@ int ha_rocksdb::update_write_row(const uchar *const old_data,
row_info.tx->incr_insert_count();
}
+ row_info.tx->log_table_write_op(m_tbl_def);
+
if (do_bulk_commit(row_info.tx)) {
DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
}
@@ -10648,6 +10677,7 @@ int ha_rocksdb::delete_row(const uchar *const buf) {
}
tx->incr_delete_count();
+ tx->log_table_write_op(m_tbl_def);
if (do_bulk_commit(tx)) {
DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
@@ -10802,6 +10832,12 @@ int ha_rocksdb::info(uint flag) {
k->rec_per_key[j] = x;
}
}
+
+ stats.create_time = m_tbl_def->get_create_time();
+ }
+
+ if (flag & HA_STATUS_TIME) {
+ stats.update_time = m_tbl_def->m_update_time;
}
if (flag & HA_STATUS_ERRKEY) {
diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc
index c0741a1ce9b..166b22c62ff 100644
--- a/storage/rocksdb/rdb_datadic.cc
+++ b/storage/rocksdb/rdb_datadic.cc
@@ -3563,6 +3563,26 @@ bool Rdb_tbl_def::put_dict(Rdb_dict_manager *const dict,
return false;
}
+time_t Rdb_tbl_def::get_create_time() {
+ time_t create_time = m_create_time;
+
+ if (create_time == CREATE_TIME_UNKNOWN) {
+ // Read it from the .frm file. It's not a problem if several threads do this
+ // concurrently
+ char path[FN_REFLEN];
+ snprintf(path, sizeof(path), "%s/%s/%s%s", mysql_data_home,
+ m_dbname.c_str(), m_tablename.c_str(), reg_ext);
+ unpack_filename(path,path);
+ MY_STAT f_stat;
+ if (my_stat(path, &f_stat, MYF(0)))
+ create_time = f_stat.st_ctime;
+ else
+ create_time = 0; // will be shown as SQL NULL
+ m_create_time = create_time;
+ }
+ return create_time;
+}
+
// Length that each index flag takes inside the record.
// Each index in the array maps to the enum INDEX_FLAG
static const std::array<uint, 1> index_flag_lengths = {
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 416857cad38..0bf1372410f 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -1094,7 +1094,9 @@ class Rdb_tbl_def {
: m_key_descr_arr(nullptr),
m_hidden_pk_val(0),
m_auto_incr_val(0),
- m_tbl_stats() {
+ m_tbl_stats(),
+ m_update_time(0),
+ m_create_time(CREATE_TIME_UNKNOWN) {
set_name(name);
}
@@ -1102,7 +1104,9 @@ class Rdb_tbl_def {
: m_key_descr_arr(nullptr),
m_hidden_pk_val(0),
m_auto_incr_val(0),
- m_tbl_stats() {
+ m_tbl_stats(),
+ m_update_time(0),
+ m_create_time(CREATE_TIME_UNKNOWN) {
set_name(std::string(name, len));
}
@@ -1110,7 +1114,9 @@ class Rdb_tbl_def {
: m_key_descr_arr(nullptr),
m_hidden_pk_val(0),
m_auto_incr_val(0),
- m_tbl_stats() {
+ m_tbl_stats(),
+ m_update_time(0),
+ m_create_time(CREATE_TIME_UNKNOWN) {
set_name(std::string(slice.data() + pos, slice.size() - pos));
}
@@ -1161,6 +1167,14 @@ class Rdb_tbl_def {
const std::string &base_tablename() const { return m_tablename; }
const std::string &base_partition() const { return m_partition; }
GL_INDEX_ID get_autoincr_gl_index_id();
+
+ time_t get_create_time();
+ std::atomic<time_t> m_update_time; // in-memory only value
+ private:
+ const time_t CREATE_TIME_UNKNOWN = 1;
+ // CREATE_TIME_UNKNOWN means "didn't try to read, yet"
+ // 0 means "no data available"
+ std::atomic<time_t> m_create_time;
};
/*
1
0

[Commits] b4a950866d1: Post-rebase fixes: update to re-based RocksDB with range locking
by psergey 18 Nov '19
by psergey 18 Nov '19
18 Nov '19
revision-id: b4a950866d15a0dbcbee527c2ab24ae9f2c8f2c6 (fb-prod201903-255-gb4a950866d1)
parent(s): 9b215dc1969cce0bd6bc8716bc0fb44689fad523
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-18 17:49:34 +0300
message:
Post-rebase fixes: update to re-based RocksDB with range locking
---
rocksdb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rocksdb b/rocksdb
index 125d429de0b..156a12604b8 160000
--- a/rocksdb
+++ b/rocksdb
@@ -1 +1 @@
-Subproject commit 125d429de0b50e62cee15767b0b5073a88c209d6
+Subproject commit 156a12604b830df3765a12d7194ca5f9ca7d67e6
1
0

[Commits] 409ed60bb84: Fix compile failure on Windows: use explicit type casts
by psergey 16 Nov '19
by psergey 16 Nov '19
16 Nov '19
revision-id: 409ed60bb844252d46127c5df37d23fecf52159f (mariadb-10.3.20-10-g409ed60bb84)
parent(s): 86167e908fe5de6f6e9f5076b4ea8041514d0820
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-16 13:18:24 +0300
message:
Fix compile failure on Windows: use explicit type casts
---
sql/multi_range_read.cc | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 6d62ea07dfa..5e0afd6edb2 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1714,9 +1714,9 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
Request memory to finish the scan in one pass but do not request
more than @@mrr_buff_size.
*/
- *buffer_size = MY_MIN(extra_mem_overhead + rows*elem_size,
- MY_MAX(table->in_use->variables.mrr_buff_size,
- extra_mem_overhead));
+ *buffer_size= (uint) MY_MIN(extra_mem_overhead + elem_size*(ulong)rows,
+ MY_MAX(table->in_use->variables.mrr_buff_size,
+ extra_mem_overhead));
}
if (elem_size + extra_mem_overhead > *buffer_size)
1
0

[Commits] 86167e908fe: MDEV-20611: MRR scan over partitioned InnoDB table produces "Out of memory" error
by psergey 15 Nov '19
by psergey 15 Nov '19
15 Nov '19
revision-id: 86167e908fe5de6f6e9f5076b4ea8041514d0820 (mariadb-10.3.20-9-g86167e908fe)
parent(s): 3d4a80153345209bad736235d4f66dcaa51a9d51
author: Sergei Petrunia
committer: Sergei Petrunia
timestamp: 2019-11-15 23:37:28 +0300
message:
MDEV-20611: MRR scan over partitioned InnoDB table produces "Out of memory" error
Fix partitioning and DS-MRR to work together
- In ha_partition::index_end(): take into account that ha_innobase (and
other engines using DS-MRR) will have inited=RND when initialized for
DS-MRR scan.
- In ha_partition::multi_range_read_next(): if the MRR scan is using
HA_MRR_NO_ASSOCIATION mode, it is not guaranteed that the partition's
handler will store anything into *range_info.
- In DsMrr_impl::choose_mrr_impl(): ha_partition will inquire partitions
about how much memory their MRR implementation needs by passing
*buffer_size=0. DS-MRR code didn't know about this (actually it used
uint for buffer size calculation and would have an under-flow).
Returning *buffer_size=0 made ha_partition assume that partitions do
not need MRR memory and pass the same buffer to each of them.
Now, this is fixed. If DS-MRR gets *buffer_size=0, it will return
the amount of buffer space needed, but not more than about
@@mrr_buffer_size.
* Fix ha_{innobase,maria,myisam}::clone. If ha_partition uses MRR on its
partitions, and partition use DS-MRR, the code will call handler->clone
with TABLE (*NOT partition*) name as an argument.
DS-MRR has no way of knowing the partition name, so the solution was
to have the ::clone() function for the affected storage engine to ignore
the name argument and get it elsewhere.
---
mysql-test/include/partition_mrr.inc | 46 +++++++++++++++++
mysql-test/main/partition_mrr_aria.result | 79 +++++++++++++++++++++++++++++
mysql-test/main/partition_mrr_aria.test | 2 +
mysql-test/main/partition_mrr_innodb.result | 79 +++++++++++++++++++++++++++++
mysql-test/main/partition_mrr_innodb.test | 4 ++
mysql-test/main/partition_mrr_myisam.result | 79 +++++++++++++++++++++++++++++
mysql-test/main/partition_mrr_myisam.test | 3 ++
sql/ha_partition.cc | 14 ++++-
sql/multi_range_read.cc | 33 +++++++++---
sql/multi_range_read.h | 5 +-
storage/innobase/handler/ha_innodb.cc | 2 +-
storage/maria/ha_maria.cc | 8 +--
storage/myisam/ha_myisam.cc | 7 +--
13 files changed, 343 insertions(+), 18 deletions(-)
diff --git a/mysql-test/include/partition_mrr.inc b/mysql-test/include/partition_mrr.inc
new file mode 100644
index 00000000000..4c285791ec7
--- /dev/null
+++ b/mysql-test/include/partition_mrr.inc
@@ -0,0 +1,46 @@
+--source include/have_partition.inc
+
+--disable_warnings
+drop table if exists t1,t3;
+--enable_warnings
+
+--echo #
+--echo # MDEV-20611: MRR scan over partitioned InnoDB table produces "Out of memory" error
+--echo #
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+set @tmp=@@storage_engine;
+eval set storage_engine=$engine_type;
+
+create table t3 (
+ ID bigint(20) NOT NULL AUTO_INCREMENT,
+ part_id int,
+ key_col int,
+ col2 int,
+ key(key_col),
+ PRIMARY KEY (ID,part_id)
+) PARTITION BY RANGE (part_id)
+(PARTITION p1 VALUES LESS THAN (3),
+ PARTITION p2 VALUES LESS THAN (7),
+ PARTITION p3 VALUES LESS THAN (10)
+);
+
+show create table t3;
+set storage_engine= @tmp;
+
+insert into t3 select
+ A.a+10*B.a,
+ A.a,
+ B.a,
+ 123456
+from t1 A, t1 B;
+
+set optimizer_switch='mrr=on';
+--replace_column 9 #
+explain
+select * from t3 force index (key_col) where key_col < 3;
+select * from t3 force index (key_col) where key_col < 3;
+
+drop table t1,t3;
+
diff --git a/mysql-test/main/partition_mrr_aria.result b/mysql-test/main/partition_mrr_aria.result
new file mode 100644
index 00000000000..7a0c35a309e
--- /dev/null
+++ b/mysql-test/main/partition_mrr_aria.result
@@ -0,0 +1,79 @@
+drop table if exists t1,t3;
+#
+# MDEV-20611: MRR scan over partitioned InnoDB table produces "Out of memory" error
+#
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+set @tmp=@@storage_engine;
+set storage_engine=Aria;
+create table t3 (
+ID bigint(20) NOT NULL AUTO_INCREMENT,
+part_id int,
+key_col int,
+col2 int,
+key(key_col),
+PRIMARY KEY (ID,part_id)
+) PARTITION BY RANGE (part_id)
+(PARTITION p1 VALUES LESS THAN (3),
+PARTITION p2 VALUES LESS THAN (7),
+PARTITION p3 VALUES LESS THAN (10)
+);
+show create table t3;
+Table Create Table
+t3 CREATE TABLE `t3` (
+ `ID` bigint(20) NOT NULL AUTO_INCREMENT,
+ `part_id` int(11) NOT NULL,
+ `key_col` int(11) DEFAULT NULL,
+ `col2` int(11) DEFAULT NULL,
+ PRIMARY KEY (`ID`,`part_id`),
+ KEY `key_col` (`key_col`)
+) ENGINE=Aria DEFAULT CHARSET=latin1
+ PARTITION BY RANGE (`part_id`)
+(PARTITION `p1` VALUES LESS THAN (3) ENGINE = Aria,
+ PARTITION `p2` VALUES LESS THAN (7) ENGINE = Aria,
+ PARTITION `p3` VALUES LESS THAN (10) ENGINE = Aria)
+set storage_engine= @tmp;
+insert into t3 select
+A.a+10*B.a,
+A.a,
+B.a,
+123456
+from t1 A, t1 B;
+set optimizer_switch='mrr=on';
+explain
+select * from t3 force index (key_col) where key_col < 3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t3 range key_col key_col 5 NULL # Using where; Rowid-ordered scan
+select * from t3 force index (key_col) where key_col < 3;
+ID part_id key_col col2
+1 0 0 123456
+1 1 0 123456
+2 2 0 123456
+10 0 1 123456
+11 1 1 123456
+12 2 1 123456
+20 0 2 123456
+21 1 2 123456
+22 2 2 123456
+3 3 0 123456
+4 4 0 123456
+5 5 0 123456
+6 6 0 123456
+13 3 1 123456
+14 4 1 123456
+15 5 1 123456
+16 6 1 123456
+23 3 2 123456
+24 4 2 123456
+25 5 2 123456
+26 6 2 123456
+7 7 0 123456
+8 8 0 123456
+9 9 0 123456
+17 7 1 123456
+18 8 1 123456
+19 9 1 123456
+27 7 2 123456
+28 8 2 123456
+29 9 2 123456
+drop table t1,t3;
diff --git a/mysql-test/main/partition_mrr_aria.test b/mysql-test/main/partition_mrr_aria.test
new file mode 100644
index 00000000000..e3dfe8cd9b5
--- /dev/null
+++ b/mysql-test/main/partition_mrr_aria.test
@@ -0,0 +1,2 @@
+let $engine_type= Aria;
+--source include/partition_mrr.inc
diff --git a/mysql-test/main/partition_mrr_innodb.result b/mysql-test/main/partition_mrr_innodb.result
new file mode 100644
index 00000000000..c188f7e9929
--- /dev/null
+++ b/mysql-test/main/partition_mrr_innodb.result
@@ -0,0 +1,79 @@
+drop table if exists t1,t3;
+#
+# MDEV-20611: MRR scan over partitioned InnoDB table produces "Out of memory" error
+#
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+set @tmp=@@storage_engine;
+set storage_engine=InnoDB;
+create table t3 (
+ID bigint(20) NOT NULL AUTO_INCREMENT,
+part_id int,
+key_col int,
+col2 int,
+key(key_col),
+PRIMARY KEY (ID,part_id)
+) PARTITION BY RANGE (part_id)
+(PARTITION p1 VALUES LESS THAN (3),
+PARTITION p2 VALUES LESS THAN (7),
+PARTITION p3 VALUES LESS THAN (10)
+);
+show create table t3;
+Table Create Table
+t3 CREATE TABLE `t3` (
+ `ID` bigint(20) NOT NULL AUTO_INCREMENT,
+ `part_id` int(11) NOT NULL,
+ `key_col` int(11) DEFAULT NULL,
+ `col2` int(11) DEFAULT NULL,
+ PRIMARY KEY (`ID`,`part_id`),
+ KEY `key_col` (`key_col`)
+) ENGINE=InnoDB DEFAULT CHARSET=latin1
+ PARTITION BY RANGE (`part_id`)
+(PARTITION `p1` VALUES LESS THAN (3) ENGINE = InnoDB,
+ PARTITION `p2` VALUES LESS THAN (7) ENGINE = InnoDB,
+ PARTITION `p3` VALUES LESS THAN (10) ENGINE = InnoDB)
+set storage_engine= @tmp;
+insert into t3 select
+A.a+10*B.a,
+A.a,
+B.a,
+123456
+from t1 A, t1 B;
+set optimizer_switch='mrr=on';
+explain
+select * from t3 force index (key_col) where key_col < 3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t3 range key_col key_col 5 NULL # Using where; Rowid-ordered scan
+select * from t3 force index (key_col) where key_col < 3;
+ID part_id key_col col2
+1 0 0 123456
+1 1 0 123456
+2 2 0 123456
+10 0 1 123456
+11 1 1 123456
+12 2 1 123456
+20 0 2 123456
+21 1 2 123456
+22 2 2 123456
+3 3 0 123456
+4 4 0 123456
+5 5 0 123456
+6 6 0 123456
+13 3 1 123456
+14 4 1 123456
+15 5 1 123456
+16 6 1 123456
+23 3 2 123456
+24 4 2 123456
+25 5 2 123456
+26 6 2 123456
+7 7 0 123456
+8 8 0 123456
+9 9 0 123456
+17 7 1 123456
+18 8 1 123456
+19 9 1 123456
+27 7 2 123456
+28 8 2 123456
+29 9 2 123456
+drop table t1,t3;
diff --git a/mysql-test/main/partition_mrr_innodb.test b/mysql-test/main/partition_mrr_innodb.test
new file mode 100644
index 00000000000..1eccf070e5c
--- /dev/null
+++ b/mysql-test/main/partition_mrr_innodb.test
@@ -0,0 +1,4 @@
+--source include/have_innodb.inc
+let $engine_type= InnoDB;
+
+--source include/partition_mrr.inc
diff --git a/mysql-test/main/partition_mrr_myisam.result b/mysql-test/main/partition_mrr_myisam.result
new file mode 100644
index 00000000000..1f1cea8e9d6
--- /dev/null
+++ b/mysql-test/main/partition_mrr_myisam.result
@@ -0,0 +1,79 @@
+drop table if exists t1,t3;
+#
+# MDEV-20611: MRR scan over partitioned InnoDB table produces "Out of memory" error
+#
+create table t1(a int);
+insert into t1 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+set @tmp=@@storage_engine;
+set storage_engine=myisam;
+create table t3 (
+ID bigint(20) NOT NULL AUTO_INCREMENT,
+part_id int,
+key_col int,
+col2 int,
+key(key_col),
+PRIMARY KEY (ID,part_id)
+) PARTITION BY RANGE (part_id)
+(PARTITION p1 VALUES LESS THAN (3),
+PARTITION p2 VALUES LESS THAN (7),
+PARTITION p3 VALUES LESS THAN (10)
+);
+show create table t3;
+Table Create Table
+t3 CREATE TABLE `t3` (
+ `ID` bigint(20) NOT NULL AUTO_INCREMENT,
+ `part_id` int(11) NOT NULL,
+ `key_col` int(11) DEFAULT NULL,
+ `col2` int(11) DEFAULT NULL,
+ PRIMARY KEY (`ID`,`part_id`),
+ KEY `key_col` (`key_col`)
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+ PARTITION BY RANGE (`part_id`)
+(PARTITION `p1` VALUES LESS THAN (3) ENGINE = MyISAM,
+ PARTITION `p2` VALUES LESS THAN (7) ENGINE = MyISAM,
+ PARTITION `p3` VALUES LESS THAN (10) ENGINE = MyISAM)
+set storage_engine= @tmp;
+insert into t3 select
+A.a+10*B.a,
+A.a,
+B.a,
+123456
+from t1 A, t1 B;
+set optimizer_switch='mrr=on';
+explain
+select * from t3 force index (key_col) where key_col < 3;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t3 range key_col key_col 5 NULL # Using where; Rowid-ordered scan
+select * from t3 force index (key_col) where key_col < 3;
+ID part_id key_col col2
+1 0 0 123456
+1 1 0 123456
+2 2 0 123456
+10 0 1 123456
+11 1 1 123456
+12 2 1 123456
+20 0 2 123456
+21 1 2 123456
+22 2 2 123456
+3 3 0 123456
+4 4 0 123456
+5 5 0 123456
+6 6 0 123456
+13 3 1 123456
+14 4 1 123456
+15 5 1 123456
+16 6 1 123456
+23 3 2 123456
+24 4 2 123456
+25 5 2 123456
+26 6 2 123456
+7 7 0 123456
+8 8 0 123456
+9 9 0 123456
+17 7 1 123456
+18 8 1 123456
+19 9 1 123456
+27 7 2 123456
+28 8 2 123456
+29 9 2 123456
+drop table t1,t3;
diff --git a/mysql-test/main/partition_mrr_myisam.test b/mysql-test/main/partition_mrr_myisam.test
new file mode 100644
index 00000000000..d67a37ab3d2
--- /dev/null
+++ b/mysql-test/main/partition_mrr_myisam.test
@@ -0,0 +1,3 @@
+let $engine_type= myisam;
+
+--source include/partition_mrr.inc
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index ccda01de6b7..09664deb458 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -5478,6 +5478,13 @@ int ha_partition::index_end()
if ((tmp= (*file)->ha_index_end()))
error= tmp;
}
+ else if ((*file)->inited == RND)
+ {
+ // Possible due to MRR
+ int tmp;
+ if ((tmp= (*file)->ha_rnd_end()))
+ error= tmp;
+ }
} while (*(++file));
destroy_record_priority_queue();
DBUG_RETURN(error);
@@ -6519,8 +6526,11 @@ int ha_partition::multi_range_read_next(range_id_t *range_info)
else if (unlikely((error= handle_unordered_next(table->record[0], FALSE))))
DBUG_RETURN(error);
- *range_info=
- ((PARTITION_KEY_MULTI_RANGE *) m_range_info[m_last_part])->ptr;
+ if (!(m_mrr_mode & HA_MRR_NO_ASSOCIATION))
+ {
+ *range_info=
+ ((PARTITION_KEY_MULTI_RANGE *) m_range_info[m_last_part])->ptr;
+ }
}
DBUG_RETURN(0);
}
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index f40c8d0fbd8..6d62ea07dfa 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -1589,11 +1589,10 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
}
uint add_len= share->key_info[keyno].key_length + primary_file->ref_length;
- *bufsz -= add_len;
- if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, &dsmrr_cost))
+ if (get_disk_sweep_mrr_cost(keyno, rows, *flags, bufsz, add_len,
+ &dsmrr_cost))
return TRUE;
- *bufsz += add_len;
-
+
bool force_dsmrr;
/*
If mrr_cost_based flag is not set, then set cost of DS-MRR to be minimum of
@@ -1682,6 +1681,11 @@ static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *
@param rows E(Number of rows to be scanned)
@param flags Scan parameters (HA_MRR_* flags)
@param buffer_size INOUT Buffer size
+ IN: Buffer of size 0 means the function
+ will determine the best size and return it.
+ @param extra_mem_overhead Extra memory overhead of the MRR implementation
+ (the function assumes this many bytes of buffer
+ space will not be usable by DS-MRR)
@param cost OUT The cost
@retval FALSE OK
@@ -1690,7 +1694,9 @@ static void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *
*/
bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
- uint *buffer_size, Cost_estimate *cost)
+ uint *buffer_size,
+ uint extra_mem_overhead,
+ Cost_estimate *cost)
{
ulong max_buff_entries, elem_size;
ha_rows rows_in_full_step;
@@ -1700,11 +1706,24 @@ bool DsMrr_impl::get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
elem_size= primary_file->ref_length +
sizeof(void*) * (!MY_TEST(flags & HA_MRR_NO_ASSOCIATION));
- max_buff_entries = *buffer_size / elem_size;
- if (!max_buff_entries)
+ if (!*buffer_size)
+ {
+ /*
+ We are requested to determine how much memory we need.
+ Request memory to finish the scan in one pass but do not request
+ more than @@mrr_buff_size.
+ */
+ *buffer_size = MY_MIN(extra_mem_overhead + rows*elem_size,
+ MY_MAX(table->in_use->variables.mrr_buff_size,
+ extra_mem_overhead));
+ }
+
+ if (elem_size + extra_mem_overhead > *buffer_size)
return TRUE; /* Buffer has not enough space for even 1 rowid */
+ max_buff_entries = (*buffer_size - extra_mem_overhead) / elem_size;
+
/* Number of iterations we'll make with full buffer */
n_full_steps= (uint)floor(rows2double(rows) / max_buff_entries);
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h
index 85578aa312c..0473fef04ae 100644
--- a/sql/multi_range_read.h
+++ b/sql/multi_range_read.h
@@ -631,8 +631,9 @@ class DsMrr_impl
bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz,
Cost_estimate *cost);
- bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
- uint *buffer_size, Cost_estimate *cost);
+ bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags,
+ uint *buffer_size, uint extra_mem_overhead,
+ Cost_estimate *cost);
bool check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags);
bool setup_buffer_sharing(uint key_size_in_keybuf, key_part_map key_tuple_map);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 29f29544f29..eecc72ad1f6 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -6466,7 +6466,7 @@ ha_innobase::clone(
DBUG_ENTER("ha_innobase::clone");
ha_innobase* new_handler = static_cast<ha_innobase*>(
- handler::clone(name, mem_root));
+ handler::clone(m_prebuilt->table->name.m_name, mem_root));
if (new_handler != NULL) {
DBUG_ASSERT(new_handler->m_prebuilt != NULL);
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 57537a69082..71456666fbe 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -1000,10 +1000,12 @@ can_enable_indexes(1), bulk_insert_single_undo(BULK_INSERT_NONE)
{}
-handler *ha_maria::clone(const char *name, MEM_ROOT *mem_root)
+handler *ha_maria::clone(const char *name __attribute__((unused)),
+ MEM_ROOT *mem_root)
{
- ha_maria *new_handler= static_cast <ha_maria *>(handler::clone(name,
- mem_root));
+ ha_maria *new_handler=
+ static_cast <ha_maria *>(handler::clone(file->s->open_file_name.str,
+ mem_root));
if (new_handler)
{
new_handler->file->state= file->state;
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index 40c8ea61ddc..9b4dff68683 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -702,10 +702,11 @@ ha_myisam::ha_myisam(handlerton *hton, TABLE_SHARE *table_arg)
can_enable_indexes(1)
{}
-handler *ha_myisam::clone(const char *name, MEM_ROOT *mem_root)
+handler *ha_myisam::clone(const char *name __attribute__((unused)),
+ MEM_ROOT *mem_root)
{
- ha_myisam *new_handler= static_cast <ha_myisam *>(handler::clone(name,
- mem_root));
+ ha_myisam *new_handler=
+ static_cast <ha_myisam *>(handler::clone(file->filename, mem_root));
if (new_handler)
new_handler->file->state= file->state;
return new_handler;
1
0

[Commits] b46e58ffb82: MDEV-21044: Wrong result when using a smaller size for sort buffer
by Varun 15 Nov '19
by Varun 15 Nov '19
15 Nov '19
revision-id: b46e58ffb828e90c83e3b1ee536b0e6d345570c6 (mariadb-10.1.41-92-gb46e58ffb82)
parent(s): 214023aa0e6ec00dcac386167a2b2cf9394b6c7e
author: Varun Gupta
committer: Varun Gupta
timestamp: 2019-11-15 12:33:32 +0530
message:
MDEV-21044: Wrong result when using a smaller size for sort buffer
Make sure that the sort buffers can store at least one sort key.
This is needed to ensure that all merge buffers are read; otherwise,
when a buffer holds no sort keys, some merge buffers are skipped
because the code concludes there is no data to be read.
---
mysql-test/r/order_by.result | 30 ++++++++++++++++++++++++++++++
mysql-test/t/order_by.test | 16 ++++++++++++++++
sql/filesort.cc | 1 +
3 files changed, 47 insertions(+)
diff --git a/mysql-test/r/order_by.result b/mysql-test/r/order_by.result
index 4cd9aebdf49..380687554d7 100644
--- a/mysql-test/r/order_by.result
+++ b/mysql-test/r/order_by.result
@@ -3207,3 +3207,33 @@ pk
2
3
DROP TABLE t1;
+#
+# MDEV-21044: Wrong result when using a smaller size for sort buffer
+#
+create table t1(a varchar(765),b int);
+insert into t1 values ("a",1),("b",2),("c",3),("e",4);
+insert into t1 values ("d",5),("f",6),("g",7),("h",8);
+insert into t1 values ("k",11),("l",12),("i",9),("j",10);
+insert into t1 values ("m",13),("n",14),("o",15),("p",16);
+set @save_sort_buffer_size= @@sort_buffer_size;
+set sort_buffer_size=1024;
+select * from t1 order by b;
+a b
+a 1
+b 2
+c 3
+e 4
+d 5
+f 6
+g 7
+h 8
+i 9
+j 10
+k 11
+l 12
+m 13
+n 14
+o 15
+p 16
+set @@sort_buffer_size= @save_sort_buffer_size;
+drop table t1;
diff --git a/mysql-test/t/order_by.test b/mysql-test/t/order_by.test
index 1ca258d1d48..999c7314139 100644
--- a/mysql-test/t/order_by.test
+++ b/mysql-test/t/order_by.test
@@ -2141,3 +2141,19 @@ INSERT INTO t1 VALUES (1),(2),(3);
SELECT DISTINCT pk FROM t1 GROUP BY 'foo';
SELECT DISTINCT pk FROM t1;
DROP TABLE t1;
+
+--echo #
+--echo # MDEV-21044: Wrong result when using a smaller size for sort buffer
+--echo #
+
+create table t1(a varchar(765),b int);
+insert into t1 values ("a",1),("b",2),("c",3),("e",4);
+insert into t1 values ("d",5),("f",6),("g",7),("h",8);
+insert into t1 values ("k",11),("l",12),("i",9),("j",10);
+insert into t1 values ("m",13),("n",14),("o",15),("p",16);
+set @save_sort_buffer_size= @@sort_buffer_size;
+set sort_buffer_size=1024;
+select * from t1 order by b;
+set @@sort_buffer_size= @save_sort_buffer_size;
+drop table t1;
+
diff --git a/sql/filesort.cc b/sql/filesort.cc
index 4f195f68059..bb3e73343ad 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -343,6 +343,7 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
param.max_keys_per_buffer=((param.max_keys_per_buffer *
(param.rec_length + sizeof(char*))) /
param.rec_length - 1);
+ set_if_bigger(param.max_keys_per_buffer, 1);
maxbuffer--; // Offset from 0
if (merge_many_buff(¶m,
(uchar*) table_sort.get_sort_keys(),
1
0