Hi Sergei, On Sun, Apr 4, 2021 at 2:48 PM Sergei Golubchik <serg@mariadb.org> wrote:
Hi, Aleksey!
Note, it's a review of the `git diff 82e44d60d1e 8175ce8a5f1` so it's not only commit c4de76aeff8
On Apr 04, Aleksey Midenkov wrote:
revision-id: c4de76aeff8 (mariadb-10.5.2-540-gc4de76aeff8) parent(s): 82e44d60d1e author: Aleksey Midenkov <midenok@gmail.com> committer: Aleksey Midenkov <midenok@gmail.com> timestamp: 2021-03-31 21:17:55 +0300 message:
MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
Overall it's very good! I have few minor questions/comments, see below. But it's almost done, I think.
diff --git a/mysql-test/suite/versioning/r/delete_history.result b/mysql-test/suite/versioning/r/delete_history.result index cb865a835b3..90c9e4777bb 100644 --- a/mysql-test/suite/versioning/r/delete_history.result +++ b/mysql-test/suite/versioning/r/delete_history.result @@ -154,3 +152,18 @@ select * from t1; a 1 drop table t1; +# +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT +# +# Don't auto-create new partition on DELETE HISTORY: +create or replace table t (a int) with system versioning +partition by system_time limit 1000 auto; +delete history from t; +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `a` int(11) DEFAULT NULL +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING + PARTITION BY SYSTEM_TIME LIMIT 1000 AUTO +PARTITIONS 2
Hmm, and if DELETE HISTORY would auto-create new partitions, what output would one expect here?
I mean, how one can see whether the test result is correct or wrong?
By PARTITIONS value. Stale test case, fixed.
+drop table t; diff --git a/mysql-test/suite/versioning/t/partition.test b/mysql-test/suite/versioning/t/partition.test index 445f5844630..57b80ca0b47 100644 --- a/mysql-test/suite/versioning/t/partition.test +++ b/mysql-test/suite/versioning/t/partition.test @@ -1068,11 +1078,412 @@ alter table t1 add partition partitions 2; --replace_result $default_engine DEFAULT_ENGINE show create table t1; alter table t1 add partition partitions 8; +drop table t1; + +--echo # +--echo # MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT +--echo # +create or replace table t1 (x int) with system versioning +partition by system_time limit 1 auto; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +--echo # INSERT, INSERT .. SELECT don't increment partitions
it's not really "increment", better say "don't auto-create"
Actually I like "increment" more. "Auto-create" overcomplicates phrases: --echo # Increment from 3 to 5 --echo # Increment from 3 to 6, manual names, LOCK TABLES --echo # Multiple increments in single command Besides "increment" is correct because PARTITIONS number is incremented.
+insert into t1 values (1); + +create or replace table t2 (y int); +insert into t2 values (2); + +insert into t1 select * from t2; +insert into t2 select * from t1;
why do you need a t2 table in this test?
t1 is not incremented in case of insert into t2 select * from t1;
+--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +--echo # Too many partitions error +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning +partition by system_time interval 1 hour auto; +set timestamp= unix_timestamp('2001-01-01 00:01:00'); +--error ER_TOO_MANY_PARTITIONS_ERROR +update t1 set x= x + 1; + +--enable_info +--echo # Increment from 3 to 5 +create or replace table t1 (x int) with system versioning +partition by system_time interval 3600 second +starts '2000-01-01 00:00:00' auto partitions 3; + +insert into t1 values (1); +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 02:00:00'); +update t1 set x= x + 1; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 03:00:00'); +update t1 set x= x + 2; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +--echo # Increment from 3 to 6, manual names, LOCK TABLES +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning +partition by system_time interval 1 hour auto ( + partition p1 history, + partition p3 history, + partition pn current); + +insert into t1 values (1); --replace_result $default_engine DEFAULT_ENGINE show create table t1;
+set timestamp= unix_timestamp('2000-01-01 02:00:00'); +update t1 set x= x + 3; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 03:00:00'); +update t1 set x= x + 4; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 04:00:00'); +lock tables t1 write; +update t1 set x= x + 5; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +unlock tables; +set timestamp= default; + +--echo # Couple of more LOCK TABLES cases +create or replace table t1 (x int) with system versioning +partition by system_time limit 1000 auto; +lock tables t1 write; +insert into t1 values (1); +update t1 set x= x + 1; +unlock tables; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1;
what does this test case demonstrate?
Fixed stale test case.
+ +--echo # Overflow prevention under LOCK TABLES +create or replace table t1 (x int) +with system versioning partition by system_time +limit 10 auto; + +insert into t1 values (1), (2), (3), (4), (5), (6), (7), (8), (9); +update t1 set x= x + 10; + +lock tables t1 write; +update t1 set x= 1 where x = 11; +update t1 set x= 2 where x = 12; +update t1 set x= 3 where x = 13; +unlock tables; + +select count(x) from t1 partition (p0); +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +drop tables t1; + +--echo # Test VIEW, LOCK TABLES +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning +partition by system_time interval 1 hour auto partitions 3; +create or replace view v1 as select * from t1; + +insert into v1 values (1);
why would a view matter in this test case?
Made it matter.
+--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 01:00:00'); +lock tables t1 write; +update t1 set x= x + 3; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +unlock tables; + +drop view v1; drop tables t1;
+--echo # Multiple increments in single command +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning +partition by system_time interval 1 hour auto partitions 3; + +create or replace table t2 (y int) with system versioning +partition by system_time interval 1 hour auto; + +insert into t1 values (1); +insert into t2 values (2); + +set timestamp= unix_timestamp('2000-01-01 01:00:00'); +update t1, t2 set x= x + 1, y= y + 1; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +--replace_result $default_engine DEFAULT_ENGINE +show create table t2; + +set timestamp= unix_timestamp('2000-01-01 02:00:00'); +update t1, t2 set x= x + 1, y= y + 1; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +--replace_result $default_engine DEFAULT_ENGINE +show create table t2; + +--echo # Here t2 is incremented too, but not updated +set timestamp= unix_timestamp('2000-01-01 03:00:00'); +update t1, t2 set t1.x= 0 where t1.x< t2.y; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +# Multiupdate_prelocking_strategy::handle_end() is processed after table open. +# For PS it is possible to skip unneeded auto-creation because the above happens at +# prepare stage and auto-creation is done at execute stage. +--replace_result $default_engine DEFAULT_ENGINE 'PARTITIONS 4' 'PARTITIONS ok' 'PARTITIONS 5' 'PARTITIONS ok'
eh... I don't think this is really "ok". As far as I remember, Multiupdate_prelocking_strategy knows what tables should be opened for writing and what for reading. Why would a new partition be created for t2?
It knows this after tables are opened. Look at handle_end(), specifically mysql_handle_derived(), handle_derived(), setup_fields_with_no_wrap() and check_fields(). I believe all these calls are required to get proper get_table_map(). To get this working properly there must be 2-staged open tables, something like PS does.
+show create table t2; + +drop tables t1, t2; + +--echo # PS, SP, LOCK TABLES +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning +partition by system_time interval 1 hour auto; + +insert into t1 values (1); + +set timestamp= unix_timestamp('2000-01-01 01:00:00'); +execute immediate 'update t1 set x= x + 5'; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +prepare s from 'update t1 set x= x + 6'; +set timestamp= unix_timestamp('2000-01-01 02:00:00'); +execute s; execute s; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 03:00:00'); +lock tables t1 write; +execute s; execute s; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +unlock tables; +drop prepare s; + +create procedure sp() update t1 set x= x + 7; +set timestamp= unix_timestamp('2000-01-01 04:00:00'); +call sp; call sp; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +set timestamp= unix_timestamp('2000-01-01 05:00:00'); +lock tables t1 write; +call sp; call sp; +--replace_result $default_engine DEFAULT_ENGINE +show create table t1; +unlock tables; +drop procedure sp; + +set timestamp= unix_timestamp('2001-01-01 00:00:00'); +create or replace table t1 (i int) with system versioning +partition by system_time interval 1 day starts '2000-01-01 00:00:00'; +insert into t1 values (0); +set timestamp= unix_timestamp('2001-01-01 00:00:01'); +prepare s from 'update t1 set i= i + 1'; +execute s; +set timestamp= unix_timestamp('2001-01-02 00:00:01'); +execute s; +drop prepare s; + +if (!$MTR_COMBINATION_HEAP)
because of blobs?
Because of blobs.
+{ +--echo # Complex table +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 ( + x int primary key auto_increment, ... +--echo # Transaction +create or replace table t1 (x int) with system versioning engine innodb +partition by system_time interval 1 hour auto; +start transaction; +insert into t1 values (1); +select * from t1; + +--connect con1, localhost, root +set lock_wait_timeout= 1; +set innodb_lock_wait_timeout= 1; +--error ER_LOCK_WAIT_TIMEOUT +update t1 set x= x + 111; +select * from t1;
what do you test here? (there is no show create table in the test)
This was MENT-675 Assertion `thd->transaction.stmt.is_empty() || thd->in_sub_stmt' failed I moved it to MDEV-25339 and added a comment.
+ +# cleanup +--disconnect con1 +--connection default +drop table t1; + +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning engine innodb +partition by system_time interval 1 hour auto; + +insert into t1 values (1); +set timestamp= unix_timestamp('2000-01-01 01:00:00'); +start transaction; +update t1 set x= 0; +--connect con1, localhost, root +select * from t1;
if you add a show create table here, what would it show?
Added. +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `x` int(11) DEFAULT NULL +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO +PARTITIONS 3
+--connection default +commit; +show create table t1; + +set timestamp= unix_timestamp('2000-01-01 02:00:00'); +start transaction; +update t1 set x= 1; +--connection con1 +select * from t1; +--connection default +rollback; +show create table t1; +--disconnect con1 +--connection default +drop table t1; + +--echo # +--echo # MENT-724 Locking timeout caused by auto-creation affects original DML
I'd better avoid MENT references here. Let's only mention Jira issues that users can actually see
Fixed.
+--echo # +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int primary key) with system versioning engine innodb +partition by system_time interval 1 hour auto; + +insert into t1 values (1); +start transaction; +insert into t1 values (2); + +--connect con1, localhost, root +set lock_wait_timeout= 1; +set innodb_lock_wait_timeout= 1; +set timestamp= unix_timestamp('2000-01-01 01:00:01'); +update t1 set x= x + 122 where x = 1;
isn't it a test that you already have above? with x = x + 111
Not fully. That was the original bug posted by Elena (MDEV-25339). And this modification is posted by me (MDEV-23642).
+--disconnect con1 +--connection default +select * from t1; + +# cleanup +drop table t1; +set timestamp= default; + --echo # --echo # End of 10.5 tests --echo # diff --git a/mysql-test/suite/versioning/r/partition.result b/mysql-test/suite/versioning/r/partition.result index 2a277b1c2ea..8369b40d98c 100644 --- a/mysql-test/suite/versioning/r/partition.result +++ b/mysql-test/suite/versioning/r/partition.result @@ -1236,27 +1270,752 @@ t1 CREATE TABLE `t1` ( ... +set timestamp= unix_timestamp('2000-01-01 03:00:00'); +affected rows: 0 +lock tables t1 write; +affected rows: 0 +execute s; +affected rows: 1 +info: Rows matched: 1 Changed: 1 Inserted: 1 Warnings: 0 +execute s; +affected rows: 1 +info: Rows matched: 1 Changed: 1 Inserted: 1 Warnings: 0 +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `x` int(11) DEFAULT NULL +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO +PARTITIONS 6
why did it add two partitions here?
Because of this condition /* When hist_part is almost full LOCK TABLES my overflow the partition as we can't add new partitions under LOCK TABLES. Reserve one more for this case. */ if (auto_hist && create_count < 2 && thd->lex->sql_command == SQLCOM_LOCK_TABLES && vers_info->hist_part->id + 1 == vers_info->now_part->id) create_count++;
+affected rows: 1 +unlock tables; +affected rows: 0 +drop prepare s; +affected rows: 0 +create procedure sp() update t1 set x= x + 7; +affected rows: 0 +set timestamp= unix_timestamp('2000-01-01 04:00:00'); +affected rows: 0 +call sp; +affected rows: 1 +call sp; +affected rows: 1 +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `x` int(11) DEFAULT NULL +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO +PARTITIONS 6 +affected rows: 1 +set timestamp= unix_timestamp('2000-01-01 05:00:00'); +affected rows: 0 +lock tables t1 write; +affected rows: 0 +call sp; +affected rows: 1 +call sp; +affected rows: 1 +show create table t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `x` int(11) DEFAULT NULL +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO +PARTITIONS 8
and two here
Same here.
+affected rows: 1 +unlock tables; +affected rows: 0 +drop procedure sp; +affected rows: 0 +set timestamp= unix_timestamp('2001-01-01 00:00:00'); +affected rows: 0 +create or replace table t1 (i int) with system versioning +partition by system_time interval 1 day starts '2000-01-01 00:00:00'; +affected rows: 0 +insert into t1 values (0); +affected rows: 1 +set timestamp= unix_timestamp('2001-01-01 00:00:01'); +affected rows: 0 +prepare s from 'update t1 set i= i + 1'; +affected rows: 0 +info: Statement prepared +execute s; +affected rows: 1 +info: Rows matched: 1 Changed: 1 Inserted: 1 Warnings: 1 +Warnings: +Warning 4114 Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
why?
Actually this is a typo in the test case: year 2000 vs 2001. Fixed.
+set timestamp= unix_timestamp('2001-01-02 00:00:01'); +affected rows: 0 +execute s; +affected rows: 1 +info: Rows matched: 1 Changed: 1 Inserted: 1 Warnings: 1 +Warnings: +Warning 4114 Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions +drop prepare s; +affected rows: 0 +# Complex table ... diff --git a/mysql-test/suite/versioning/t/rpl.test b/mysql-test/suite/versioning/t/rpl.test index b5be68feece..54258a8bdf1 100644 --- a/mysql-test/suite/versioning/t/rpl.test +++ b/mysql-test/suite/versioning/t/rpl.test @@ -133,4 +133,44 @@ sync_slave_with_master; connection master; drop table t1;
+--echo # +--echo # MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT +--echo # +set timestamp= unix_timestamp('2000-01-01 00:00:00'); +create or replace table t1 (x int) with system versioning +partition by system_time interval 1 hour auto; +insert t1 values (); +set timestamp= unix_timestamp('2000-01-01 01:00:00'); +delete from t1; +--replace_result InnoDB ENGINE MyISAM ENGINE MEMORY ENGINE +show create table t1; +--sync_slave_with_master +--replace_result InnoDB ENGINE MyISAM ENGINE MEMORY ENGINE +show create table t1; +--connection master +drop table t1; +set timestamp= default; + +--echo # +--echo # MENT-685 DML events for auto-partitioned tables are written into binary log twice
same comment about MENT
Fixed.
+--echo # +create table t1 (a int) with system versioning +partition by system_time limit 1 auto; + +insert into t1 values (1); +update t1 set a= a + 1; +update t1 set a= a + 1; +--replace_result InnoDB ENGINE MyISAM ENGINE MEMORY ENGINE +show create table t1; +select * from t1; + +--sync_slave_with_master +--replace_result InnoDB ENGINE MyISAM ENGINE MEMORY ENGINE +show create table t1; + +select * from t1; +--connection master +# cleanup +drop table t1;
may be show binlog events? you know, to verify that DML events aren't written into binary log twice
Added.
+ --source include/rpl_end.inc diff --git a/sql/ha_partition.h b/sql/ha_partition.h index 60a2d7f6762..7ade4419c22 100644 --- a/sql/ha_partition.h +++ b/sql/ha_partition.h @@ -1610,7 +1610,7 @@ class ha_partition :public handler for (; part_id < part_id_end; ++part_id) { handler *file= m_file[part_id]; - DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id)); + bitmap_set_bit(&(m_part_info->read_partitions), part_id);
Where was it set before your patch?
#1 0x0000000000a99d04 in partition_info::set_partition_bitmaps (this=0x7fc214b7af80, partition_names=0x0 ) at ../src/sql/partition_info.cc:274 #2 0x0000000001087da4 in ha_partition::change_partitions_to_open (this=0x7fc214b787a0, partition_names=0 x0) at ../src/sql/ha_partition.cc:8644 #3 0x0000000000822c01 in set_partitions_as_used (tl=0x7fc214015988, t=0x7fc214a61fb8) at ../src/sql/sql_ base.cc:1562 #4 0x00000000008222a7 in open_table (thd=0x7fc214000d48, table_list=0x7fc214015988, ot_ctx=0x7fc23c0a19c 8) at ../src/sql/sql_base.cc:1990 #5 0x0000000000827497 in open_and_process_table (thd=0x7fc214000d48, tables=0x7fc214015988, counter=0x7f c23c0a1acc, flags=0, prelocking_strategy=0x7fc23c0a1b40, has_prelocking_list=false, ot_ctx=0x7fc23c0a19c8 ) at ../src/sql/sql_base.cc:3801 #6 0x0000000000825de0 in open_tables (thd=0x7fc214000d48, options=..., start=0x7fc23c0a1ae0, counter=0x7 fc23c0a1acc, flags=0, prelocking_strategy=0x7fc23c0a1b40) at ../src/sql/sql_base.cc:4275
file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK | HA_STATUS_OPEN); part_recs+= file->stats.records; } diff --git a/sql/handler.cc b/sql/handler.cc index b312635c8ee..6bb6c279193 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -1572,7 +1572,7 @@ int ha_commit_trans(THD *thd, bool all) DBUG_ASSERT(thd->transaction->stmt.ha_list == NULL || trans == &thd->transaction->stmt);
- if (thd->in_sub_stmt) + if (thd->in_sub_stmt & ~SUB_STMT_AUTO_HIST) {
where is this ha_commit_trans(thd, false) called from? mysql_alter_table that adds a new partition?
Looks like garbage. Removed SUB_STMT_AUTO_HIST.
DBUG_ASSERT(0); /* diff --git a/sql/partition_info.cc b/sql/partition_info.cc index 871411cf6c4..cf8dd140553 100644 --- a/sql/partition_info.cc +++ b/sql/partition_info.cc @@ -816,10 +816,17 @@ bool partition_info::has_unique_name(partition_element *element) vers_info->interval Limit by fixed time interval vers_info->hist_part (out) Working history partition */ -void partition_info::vers_set_hist_part(THD *thd) +uint partition_info::vers_set_hist_part(THD *thd, bool auto_hist) { + DBUG_ASSERT(!thd->lex->last_table() || + !thd->lex->last_table()->vers_conditions.delete_history);
is that right? Conceptually you need to test vers_conditions.delete_history for the table that owns this partition_info. Is it always last_table() ?
It is always last_table() because DELETE HISTORY works with one table.
I'd say it'd be more logical to do, like
part_field_array[0]->table->pos_in_table_list
Fixed.
+ + uint create_count= 0; + auto_hist= auto_hist && vers_info->auto_hist; + if (vers_info->limit) { + DBUG_ASSERT(!vers_info->interval.is_set()); ha_partition *hp= (ha_partition*)(table->file); partition_element *next= NULL; List_iterator<partition_element> it(partitions); @@ -862,9 +869,183 @@ void partition_info::vers_set_hist_part(THD *thd) { vers_info->hist_part= next; if (next->range_value > thd->query_start()) - return; + { + error= false; + break; + } + } + if (error) + { + if (auto_hist) + { + DBUG_ASSERT(thd->query_start() >= vers_info->hist_part->range_value); + my_time_t diff= thd->query_start() - (my_time_t) vers_info->hist_part->range_value; + if (diff > 0) + { + size_t delta= vers_info->interval.seconds(); + create_count= (uint) (diff / delta + 1); + if (diff % delta) + create_count++; + } + else + create_count= 1; + } + else + { + my_error(WARN_VERS_PART_FULL, MYF(ME_WARNING|ME_ERROR_LOG),
after such an overflow a table becomes essentially corrupted, rows are in a wrong partition. So new history partitions cannot be added anymore without reorganizing the last partition.
How is it handled now? (it's just a question, not part of the review, as it's not something you were changing in your patch)
+ table->s->db.str, table->s->table_name.str, + vers_info->hist_part->partition_name, "INTERVAL"); + } } } + + /* + When hist_part is almost full LOCK TABLES my overflow the partition as we
s/my/may/
Fixed.
+ can't add new partitions under LOCK TABLES. Reserve one for this case. + */ + if (auto_hist && create_count == 0 && + thd->lex->sql_command == SQLCOM_LOCK_TABLES && + vers_info->hist_part->id + 1 == vers_info->now_part->id) + create_count= 1; + + return create_count; +} + + +/** + @brief Run fast_alter_partition_table() to add new history partitions + for tables requiring them. +*/ +bool vers_add_auto_hist_parts(THD *thd, TABLE_LIST* tl, uint num_parts) +{ + bool result= true; + HA_CREATE_INFO create_info; + Alter_info alter_info; + partition_info *save_part_info= thd->work_part_info; + Query_tables_list save_query_tables; + Reprepare_observer *save_reprepare_observer= thd->m_reprepare_observer; + bool save_no_write_to_binlog= thd->lex->no_write_to_binlog; + thd->m_reprepare_observer= NULL; + thd->lex->reset_n_backup_query_tables_list(&save_query_tables); + thd->in_sub_stmt|= SUB_STMT_AUTO_HIST; + thd->lex->no_write_to_binlog= true; + TABLE *table= tl->table; + + DBUG_ASSERT(!thd->is_error()); + + { + DBUG_ASSERT(table->s->get_table_ref_type() == TABLE_REF_BASE_TABLE); + DBUG_ASSERT(table->versioned()); + DBUG_ASSERT(table->part_info); + DBUG_ASSERT(table->part_info->vers_info); + alter_info.reset(); + alter_info.partition_flags= ALTER_PARTITION_ADD|ALTER_PARTITION_AUTO_HIST; + create_info.init(); + create_info.alter_info= &alter_info; + Alter_table_ctx alter_ctx(thd, tl, 1, &table->s->db, &table->s->table_name); + + MDL_REQUEST_INIT(&tl->mdl_request, MDL_key::TABLE, tl->db.str, + tl->table_name.str, MDL_SHARED_NO_WRITE, MDL_TRANSACTION); + if (thd->mdl_context.acquire_lock(&tl->mdl_request, + thd->variables.lock_wait_timeout)) + goto exit; + table->mdl_ticket= tl->mdl_request.ticket; + + create_info.db_type= table->s->db_type(); + create_info.options|= HA_VERSIONED_TABLE; + DBUG_ASSERT(create_info.db_type); + + create_info.vers_info.set_start(table->s->vers_start_field()->field_name); + create_info.vers_info.set_end(table->s->vers_end_field()->field_name); + + partition_info *part_info= new partition_info(); + if (unlikely(!part_info)) + { + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + goto exit; + } + part_info->use_default_num_partitions= false; + part_info->use_default_num_subpartitions= false; + part_info->num_parts= num_parts; + part_info->num_subparts= table->part_info->num_subparts; + part_info->subpart_type= table->part_info->subpart_type; + if (unlikely(part_info->vers_init_info(thd))) + { + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + goto exit; + } + + // NB: set_ok_status() requires DA_EMPTY + thd->get_stmt_da()->reset_diagnostics_area();
would it not be DA_EMPTY without a reset? this is at the beginning of a statement, I'd expect it normally be DA_EMPTY here. What other DA states can you get here?
You are right. Removed.
+ + thd->work_part_info= part_info; + if (part_info->set_up_defaults_for_partitioning(thd, table->file, NULL, + table->part_info->next_part_no(num_parts))) + { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + ER_VERS_HIST_PART_FAILED, + "Auto-increment history partition: " + "setting up defaults failed"); + my_error(ER_VERS_HIST_PART_FAILED, MYF(ME_WARNING), + tl->db.str, tl->table_name.str); + goto exit; + } + bool partition_changed= false; + bool fast_alter_partition= false; + if (prep_alter_part_table(thd, table, &alter_info, &create_info, + &partition_changed, &fast_alter_partition)) + { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_VERS_HIST_PART_FAILED, + "Auto-increment history partition: " + "alter partitition prepare failed"); + my_error(ER_VERS_HIST_PART_FAILED, MYF(ME_WARNING), + tl->db.str, tl->table_name.str); + goto exit; + } + if (!fast_alter_partition) + { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_VERS_HIST_PART_FAILED, + "Auto-increment history partition: " + "fast alter partitition is not possible"); + my_error(ER_VERS_HIST_PART_FAILED, MYF(ME_WARNING), + tl->db.str, tl->table_name.str); + goto exit; + } + DBUG_ASSERT(partition_changed); + if (mysql_prepare_alter_table(thd, table, &create_info, &alter_info, + &alter_ctx)) + { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_VERS_HIST_PART_FAILED, + "Auto-increment history partition: " + "alter prepare failed"); + my_error(ER_VERS_HIST_PART_FAILED, MYF(ME_WARNING), + tl->db.str, tl->table_name.str); + goto exit; + } + + if (fast_alter_partition_table(thd, table, &alter_info, &create_info, + tl, &table->s->db, &table->s->table_name)) + { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_VERS_HIST_PART_FAILED, + "Auto-increment history partition: " + "alter partition table failed"); + my_error(ER_VERS_HIST_PART_FAILED, MYF(ME_WARNING), + tl->db.str, tl->table_name.str); + goto exit; + } + } + + result= false; + // NB: we have to return DA_EMPTY for new command
may be DBUG_ASSERT(thd->get_stmt_da()->is_ok());
Added.
+ thd->get_stmt_da()->reset_diagnostics_area(); + +exit: + thd->work_part_info= save_part_info; + thd->m_reprepare_observer= save_reprepare_observer; + thd->lex->restore_backup_query_tables_list(&save_query_tables); + thd->in_sub_stmt&= ~SUB_STMT_AUTO_HIST; + thd->lex->no_write_to_binlog= save_no_write_to_binlog; + return result; }
diff --git a/sql/partition_info.h b/sql/partition_info.h index 0656238ec07..94093353d60 100644 --- a/sql/partition_info.h +++ b/sql/partition_info.h @@ -72,9 +73,26 @@ struct Vers_part_info : public Sql_alloc my_time_t start; INTERVAL step; enum interval_type type; - bool is_set() { return type < INTERVAL_LAST; } + bool is_set() const { return type < INTERVAL_LAST; } + size_t seconds() const + { + if (step.second) + return step.second; + if (step.minute) + return step.minute * 60; + if (step.hour) + return step.hour * 3600; + if (step.day) + return step.day * 3600 * 24; + // comparison is used in rough estimates, it doesn't need to be calendar-correct + if (step.month) + return step.month * 3600 * 24 * 30;
shouldn't it be `* 28` ? you need a most pessimistic estimate to make sure you never miss to create a partition when one is needed. You can sometimes create one when it's not needed yet, but it's less of a problem.
Good catch. Made it calendar-correct.
+ DBUG_ASSERT(step.year); + return step.year * 86400 * 30 * 365;
that's one `* 30` too many :)
+ } } interval; ulonglong limit; + bool auto_hist; partition_element *now_part; partition_element *hist_part; }; diff --git a/sql/sql_base.cc b/sql/sql_base.cc index 1a1186aca73..d0e255186da 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -1625,6 +1625,52 @@ bool is_locked_view(THD *thd, TABLE_LIST *t) }
+bool TABLE::vers_need_hist_part(const THD *thd, const TABLE_LIST *table_list) const +{ +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (part_info && part_info->part_type == VERSIONING_PARTITION && + !table_list->vers_conditions.delete_history && + !thd->stmt_arena->is_stmt_prepare() && + table_list->lock_type >= TL_WRITE_ALLOW_WRITE && + table_list->mdl_request.type >= MDL_SHARED_WRITE && + table_list->mdl_request.type < MDL_EXCLUSIVE) + { + switch (thd->lex->sql_command) + {
SQLCOM_LOAD with DUP_REPLACE?
Added.
+ case SQLCOM_INSERT: + if (thd->lex->duplicates != DUP_UPDATE) + break; + /* fall-through: */ + case SQLCOM_LOCK_TABLES: + case SQLCOM_DELETE: + case SQLCOM_UPDATE: + case SQLCOM_REPLACE: + case SQLCOM_REPLACE_SELECT: + case SQLCOM_DELETE_MULTI: + case SQLCOM_UPDATE_MULTI: + return true; + default:; + break; + } + if (thd->rgi_slave && thd->rgi_slave->current_event && thd->lex->sql_command == SQLCOM_END)
I wonder, would it be possible, instead of introducing rgi_slave->current_event, just make row events set thd->lex->sql_command appropriately? For example, currently row events increment thd->status_var.com_stat[] each event for its own SQLCOM_xxx, it won't be needed if they'll just set thd->lex->sql_command.
Do you think this is a quick refactoring? I had the same idea when I did this. And probably I tried to do that quickly already. Added TODO comment.
+ { + switch (thd->rgi_slave->current_event->get_type_code()) + { + case UPDATE_ROWS_EVENT: + case UPDATE_ROWS_EVENT_V1: + case DELETE_ROWS_EVENT: + case DELETE_ROWS_EVENT_V1: + return true; + default:; + break; + } + } + } +#endif + return false; +} + + /** Open a base table.
@@ -1777,6 +1823,11 @@ bool open_table(THD *thd, TABLE_LIST *table_list, Open_table_context *ot_ctx) DBUG_PRINT("info",("Using locked table")); #ifdef WITH_PARTITION_STORAGE_ENGINE part_names_error= set_partitions_as_used(table_list, table); + if (table->vers_need_hist_part(thd, table_list)) + { + /* Rotation is still needed under LOCK TABLES */
a bit confusing comment, you left out stuff that are obvious to you but not to others. I'd suggest something like
/* New partitions are not auto-created under LOCK TABLES (TODO: fix it) but rotation can still happen. */
Fixed.
+ table->part_info->vers_set_hist_part(thd, false); + } #endif goto reset; } @@ -2032,6 +2083,20 @@ bool open_table(THD *thd, TABLE_LIST *table_list, Open_table_context *ot_ctx) tc_add_table(thd, table); }
+ if (!ot_ctx->vers_create_count &&
what does this condition mean? When is ot_ctx->vers_create_count > 0 ?
When we already requested backoff action.
+ table->vers_need_hist_part(thd, table_list)) + { + ot_ctx->vers_create_count= table->part_info->vers_set_hist_part(thd, true); + if (ot_ctx->vers_create_count) + { + MYSQL_UNBIND_TABLE(table->file); + tc_release_table(table); + ot_ctx->request_backoff_action(Open_table_context::OT_ADD_HISTORY_PARTITION, + table_list); + DBUG_RETURN(TRUE); + } + } + if (!(flags & MYSQL_OPEN_HAS_MDL_LOCK) && table->s->table_category < TABLE_CATEGORY_INFORMATION) { @@ -3172,10 +3239,18 @@ Open_table_context::recover_from_failed_open() break; case OT_DISCOVER: case OT_REPAIR: - if ((result= lock_table_names(m_thd, m_thd->lex->create_info, - m_failed_table, NULL, - get_timeout(), 0))) + case OT_ADD_HISTORY_PARTITION: + result= lock_table_names(m_thd, m_thd->lex->create_info, m_failed_table, + NULL, get_timeout(), 0); + if (result) + { + if (m_action == OT_ADD_HISTORY_PARTITION) + { + m_thd->clear_error();
why?
Because of MDEV-23642.
+ result= false; + } break; + }
tdc_remove_table(m_thd, m_failed_table->db.str, m_failed_table->table_name.str); diff --git a/sql/sql_error.h b/sql/sql_error.h index 318d5076534..92076616adb 100644 --- a/sql/sql_error.h +++ b/sql/sql_error.h @@ -1195,7 +1195,6 @@ class Diagnostics_area: public Sql_state_errno,
void copy_non_errors_from_wi(THD *thd, const Warning_info *src_wi);
-private:
It doesn't seem you're using get_warning_info() anywhere
Reverted.
Warning_info *get_warning_info() { return m_wi_stack.front(); }
const Warning_info *get_warning_info() const { return m_wi_stack.front(); } diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc index 86ba393fb7b..f4a960afe47 100644 --- a/sql/sql_partition.cc +++ b/sql/sql_partition.cc @@ -7237,7 +7246,8 @@ uint fast_alter_partition_table(THD *thd, TABLE *table, } else if ((alter_info->partition_flags & ALTER_PARTITION_ADD) && (part_info->part_type == RANGE_PARTITION || - part_info->part_type == LIST_PARTITION)) + part_info->part_type == LIST_PARTITION || + alter_info->partition_flags & ALTER_PARTITION_AUTO_HIST))
it'd be good it adding new empty VERSIONING partitions would always go this way, auto or not. but it's a separate task.
Added comment.
{ /* ADD RANGE/LIST PARTITIONS @@ -7281,7 +7291,7 @@ uint fast_alter_partition_table(THD *thd, TABLE *table, write_log_add_change_partition(lpt) || ERROR_INJECT_CRASH("crash_add_partition_4") || ERROR_INJECT_ERROR("fail_add_partition_4") || - mysql_change_partitions(lpt) || + mysql_change_partitions(lpt, false) ||
this way you skip trans_commit_stmt/trans_commit_implicit for ALTER TABLE ... ADD RANGE/LIST partition. is it always ok?
A safer alternative would've been
mysql_change_partitions(lpt, !(alter_info->partition_flags & ALTER_PARTITION_AUTO_HIST))
but it's more complex, so I'd prefer your variant, if we can be sure that it doesn't break anything.
The condition is if (copy_data && mysql_trans_commit_alter_copy_data(thd)) Data is not needed to be copied in ADD RANGE/LIST partition, is it? We can be sure only from testing period until GA.
ERROR_INJECT_CRASH("crash_add_partition_5") || ERROR_INJECT_ERROR("fail_add_partition_5") || alter_close_table(lpt) ||
Regards, Sergei VP of MariaDB Server Engineering and security@mariadb.org
-- All the best, Aleksey Midenkov @midenok