Hi, Aleksey!
Like last time, it's a review of the diff bc04ded2353 22158b3c05a,
so not only commit 38a888da0f1.
The main thing below - I didn't understand a couple of changes and asked
for clarifications.
On May 30, Aleksey Midenkov wrote:
> revision-id: 38a888da0f1 (mariadb-10.5.2-653-g38a888da0f1)
> parent(s): bc04ded2353
> author: Aleksey Midenkov <midenok(a)gmail.com>
> committer: Aleksey Midenkov <midenok(a)gmail.com>
> timestamp: 2021-04-22 23:35:06 +0300
> message:
>
> MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> diff --git a/mysql-test/suite/versioning/r/delete_history.result b/mysql-test/suite/versioning/r/delete_history.result
> --- a/mysql-test/suite/versioning/r/delete_history.result
> +++ b/mysql-test/suite/versioning/r/delete_history.result
> @@ -154,3 +152,21 @@ select * from t1;
> a
> 1
> drop table t1;
> +#
> +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> +#
> +# Don't auto-create new partition on DELETE HISTORY:
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t (a int) with system versioning
> +partition by system_time interval 1 hour auto;
> +set timestamp= unix_timestamp('2000-01-01 10:00:00');
perhaps, make it only 02:00:00 to avoid any chance that
a partition won't be created because of max-auto-create limit.
it's possible we put the limit at 10, but 2 is very unlikely.
or does max-auto-create limit always result in a warning?
> +delete history from t;
> +set timestamp= default;
> +show create table t;
> +Table Create Table
> +t CREATE TABLE `t` (
> + `a` int(11) DEFAULT NULL
> +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME INTERVAL 1 HOUR STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO
> +PARTITIONS 2
> +drop table t;
> diff --git a/mysql-test/suite/versioning/r/partition.result b/mysql-test/suite/versioning/r/partition.result
> --- a/mysql-test/suite/versioning/r/partition.result
> +++ b/mysql-test/suite/versioning/r/partition.result
> @@ -1236,27 +1270,826 @@ t1 CREATE TABLE `t1` (
> PARTITION `p5` HISTORY ENGINE = DEFAULT_ENGINE,
> PARTITION `pn` CURRENT ENGINE = DEFAULT_ENGINE)
> alter table t1 add partition partitions 8;
> +drop table t1;
> +#
> +# End of 10.5 tests
> +#
> +#
> +# MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> +#
> +create or replace table t1 (x int) with system versioning
> +partition by system_time limit 1 auto;
> show create table t1;
> Table Create Table
> t1 CREATE TABLE `t1` (
> `x` int(11) DEFAULT NULL
> ) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> - PARTITION BY SYSTEM_TIME LIMIT 1
> -(PARTITION `p2` HISTORY ENGINE = DEFAULT_ENGINE,
> - PARTITION `p8` HISTORY ENGINE = DEFAULT_ENGINE,
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 2
> +insert into t1 values (1);
> +create or replace table t2 (y int);
> +insert into t2 values (2);
> +insert into t1 select * from t2;
> +insert into t2 select * from t1;
> +show create table t1;
> +Table Create Table
> +t1 CREATE TABLE `t1` (
> + `x` int(11) DEFAULT NULL
> +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 2
> +# Too many partitions error
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 1 hour auto;
> +set timestamp= unix_timestamp('2001-01-01 00:01:00');
> +update t1 set x= x + 1;
> +ERROR HY000: Too many partitions (including subpartitions) were defined
> +# Partition overflow error and manual fix
seems to be a wrong comment? no manual fix here.
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 1 hour;
> +insert into t1 values (333);
> +set timestamp= unix_timestamp('2000-01-01 01:00:00');
> +update t1 set x= x + 1;
> +Warnings:
> +Warning 4114 Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
> +drop table t1;
> +# Partition overflow error and manual fix
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 1 hour;
> +insert into t1 values (440);
> +set timestamp= unix_timestamp('2000-01-01 00:10:00');
> +update t1 set x= x + 1;
> +# Check how pruning boundaries work
> +explain partitions select * from t1 for system_time as of '2000-01-01 00:59:58';
> +id select_type table partitions type possible_keys key key_len ref rows Extra
> +1 SIMPLE t1 p0,pn # NULL NULL NULL NULL # #
> +explain partitions select * from t1 for system_time as of '2000-01-01 00:59:59';
> +id select_type table partitions type possible_keys key key_len ref rows Extra
> +1 SIMPLE t1 pn # NULL NULL NULL NULL # #
Why the first select looks in both partitions and the second one - only in pn?
> +explain partitions select * from t1 for system_time as of '2000-01-01 01:00:00';
> +id select_type table partitions type possible_keys key key_len ref rows Extra
> +1 SIMPLE t1 pn # NULL NULL NULL NULL # #
> +select * from t1 for system_time as of '2000-01-01 00:09:59';
> +x
> +440
> +set timestamp= unix_timestamp('2000-01-01 02:00:00');
> +update t1 set x= x + 1;
> +Warnings:
> +Warning 4114 Versioned table `test`.`t1`: last HISTORY partition (`p0`) is out of INTERVAL, need more HISTORY partitions
> +select * from t1 for system_time as of '2000-01-01 01:00:00';
> +x
> +select * from t1 partition (p0) order by x;
> +x
> +440
> +441
> +alter table t1 add partition partitions 3;
> +select * from t1 for system_time as of '2000-01-01 01:00:00';
> +x
> +441
> +select * from t1 partition (p0) order by x;
> +x
> +440
would be good to have EXPLAINs after the overflow and after the manual fix.
to show how ALTER changes what partition will be pruned
> +drop table t1;
> +create or replace table t1 (x int) with system versioning
> +partition by system_time interval 3600 second
> +starts '2000-01-01 00:00:00' auto partitions 3;
> +affected rows: 0
> +insert into t1 values (1);
> +affected rows: 1
> +show create table t1;
> +Table Create Table
> +t1 CREATE TABLE `t1` (
> + `x` int(11) DEFAULT NULL
> +) ENGINE=DEFAULT_ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME INTERVAL 3600 SECOND STARTS TIMESTAMP'2000-01-01 00:00:00' AUTO
> +PARTITIONS 3
> ...
> diff --git a/mysql-test/suite/versioning/r/rpl_mix.result b/mysql-test/suite/versioning/r/rpl_mix.result
> --- a/mysql-test/suite/versioning/r/rpl_mix.result
> +++ b/mysql-test/suite/versioning/r/rpl_mix.result
> @@ -8,4 +8,59 @@ DELETE HISTORY FROM t1;
> connection slave;
> connection master;
> drop table t1;
> +#
> +# MDEV-25347 DML events for auto-partitioned tables are written into binary log twice
> +#
> +flush binary logs;
> +create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto;
> +insert into t1 values (1);
> +update t1 set a= a + 1;
> +update t1 set a= a + 1;
better do a+2 the second time. otherwise it's a bit confusing, the bug says
"written twice" and you indeed have update a=a+1 written twice :)
with clearly different statements the test will prove that every event is
written once, and not one is skipped and the other is duplicated
> +show create table t1;
> +Table Create Table
> +t1 CREATE TABLE `t1` (
> + `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +include/show_binlog_events.inc
> +Log_name Pos Event_type Server_id End_log_pos Info
> +master-bin.000002 # Binlog_checkpoint # # master-bin.000002
> +master-bin.000002 # Gtid # # GTID #-#-#
> +master-bin.000002 # Query # # use `test`; create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto
> +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> +master-bin.000002 # Annotate_rows # # insert into t1 values (1)
> +master-bin.000002 # Table_map # # table_id: # (test.t1)
> +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> +master-bin.000002 # Query # # COMMIT
> +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> +master-bin.000002 # Table_map # # table_id: # (test.t1)
> +master-bin.000002 # Update_rows_v1 # # table_id: #
> +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> +master-bin.000002 # Query # # COMMIT
> +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> +master-bin.000002 # Table_map # # table_id: # (test.t1)
> +master-bin.000002 # Update_rows_v1 # # table_id: #
> +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> +master-bin.000002 # Query # # COMMIT
> +connection slave;
> +show create table t1;
> +Table Create Table
> +t1 CREATE TABLE `t1` (
> + `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +connection master;
> +drop table t1;
> include/rpl_end.inc
> diff --git a/mysql-test/suite/versioning/r/rpl_row.result b/mysql-test/suite/versioning/r/rpl_row.result
> --- a/mysql-test/suite/versioning/r/rpl_row.result
> +++ b/mysql-test/suite/versioning/r/rpl_row.result
> @@ -11,4 +11,59 @@ connection slave;
> connection master;
> drop table t1;
> set binlog_row_image= @old_row_image;
> +#
> +# MDEV-25347 DML events for auto-partitioned tables are written into binary log twice
> +#
> +flush binary logs;
> +create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto;
> +insert into t1 values (1);
> +update t1 set a= a + 1;
> +update t1 set a= a + 1;
same. as a general rule it's helpful to use slightly different statement,
to be able to map them to binlog events uniquely. a+2, or 1+a, or whatever.
> +show create table t1;
> +Table Create Table
> +t1 CREATE TABLE `t1` (
> + `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +include/show_binlog_events.inc
> +Log_name Pos Event_type Server_id End_log_pos Info
> +master-bin.000002 # Binlog_checkpoint # # master-bin.000002
> +master-bin.000002 # Gtid # # GTID #-#-#
> +master-bin.000002 # Query # # use `test`; create table t1 (a int) with system versioning
> +partition by system_time limit 1 auto
> +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> +master-bin.000002 # Annotate_rows # # insert into t1 values (1)
> +master-bin.000002 # Table_map # # table_id: # (test.t1)
> +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> +master-bin.000002 # Query # # COMMIT
> +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> +master-bin.000002 # Table_map # # table_id: # (test.t1)
> +master-bin.000002 # Update_rows_v1 # # table_id: #
> +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> +master-bin.000002 # Query # # COMMIT
> +master-bin.000002 # Gtid # # BEGIN GTID #-#-#
> +master-bin.000002 # Annotate_rows # # update t1 set a= a + 1
> +master-bin.000002 # Table_map # # table_id: # (test.t1)
> +master-bin.000002 # Update_rows_v1 # # table_id: #
> +master-bin.000002 # Write_rows_v1 # # table_id: # flags: STMT_END_F
> +master-bin.000002 # Query # # COMMIT
> +connection slave;
> +show create table t1;
> +Table Create Table
> +t1 CREATE TABLE `t1` (
> + `a` int(11) DEFAULT NULL
> +) ENGINE=ENGINE DEFAULT CHARSET=latin1 WITH SYSTEM VERSIONING
> + PARTITION BY SYSTEM_TIME LIMIT 1 AUTO
> +PARTITIONS 3
> +select * from t1;
> +a
> +3
> +connection master;
> +drop table t1;
> include/rpl_end.inc
> diff --git a/mysql-test/suite/versioning/t/partition.test b/mysql-test/suite/versioning/t/partition.test
> --- a/mysql-test/suite/versioning/t/partition.test
> +++ b/mysql-test/suite/versioning/t/partition.test
> @@ -1068,13 +1078,456 @@ alter table t1 add partition partitions 2;
> --replace_result $default_engine DEFAULT_ENGINE
> show create table t1;
> alter table t1 add partition partitions 8;
> +drop table t1;
> +
> +--echo #
> +--echo # End of 10.5 tests
> +--echo #
> +
> +--echo #
> +--echo # MDEV-17554 Auto-create new partition for system versioned tables with history partitioned by INTERVAL/LIMIT
> +--echo #
> +create or replace table t1 (x int) with system versioning
> +partition by system_time limit 1 auto;
> +--replace_result $default_engine DEFAULT_ENGINE
> +show create table t1;
> +
in the previous version of this patch you had clarifying comments here, like
--echo # INSERT, INSERT .. SELECT don't auto-create partitions
or
--echo # Number of partitions goes from 3 to 5
I kind of miss them now, they were helpful
> +insert into t1 values (1);
> +
> +create or replace table t2 (y int);
> +insert into t2 values (2);
> +
> +insert into t1 select * from t2;
> +insert into t2 select * from t1;
> +--replace_result $default_engine DEFAULT_ENGINE
> +show create table t1;
> +
> ...
> diff --git a/sql/ha_partition.h b/sql/ha_partition.h
> --- a/sql/ha_partition.h
> +++ b/sql/ha_partition.h
> @@ -1607,7 +1607,7 @@ class ha_partition :public handler
> for (; part_id < part_id_end; ++part_id)
> {
> handler *file= m_file[part_id];
> - DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), part_id));
> + bitmap_set_bit(&(m_part_info->read_partitions), part_id);
would be good to have a comment before the method that it's
only used in vers_set_hist_part(), if (vers_info->limit).
because if it's a generic method to return the number of rows in a partition
(incl. subpartitions) then it's totally not clear why it modifies
read_partitions bitmap.
or may be you'd want to rename the method to make it look more
specific for LIMIT partitions in vers_set_hist_part() ?
> file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK | HA_STATUS_OPEN);
> part_recs+= file->stats.records;
> }
> diff --git a/sql/handler.cc b/sql/handler.cc
> --- a/sql/handler.cc
> +++ b/sql/handler.cc
> @@ -1574,6 +1574,8 @@ int ha_commit_trans(THD *thd, bool all)
> DBUG_ASSERT(thd->transaction->stmt.ha_list == NULL ||
> trans == &thd->transaction->stmt);
>
> + DBUG_ASSERT(!thd->in_sub_stmt);
> if (thd->in_sub_stmt)
> {
> DBUG_ASSERT(0);
This is a bit redundant, don't you think?
One DBUG_ASSERT would be enough.
(personally, I'd prefer the one you added, not DBUG_ASSERT(0))
> diff --git a/sql/lock.cc b/sql/lock.cc
> --- a/sql/lock.cc
> +++ b/sql/lock.cc
> @@ -662,16 +662,28 @@ MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b)
> DBUG_PRINT("enter", ("a->lock_count: %u b->lock_count: %u",
> a->lock_count, b->lock_count));
>
> - if (!(sql_lock= (MYSQL_LOCK*)
> - my_malloc(key_memory_MYSQL_LOCK, sizeof(*sql_lock) +
> - sizeof(THR_LOCK_DATA*)*((a->lock_count+b->lock_count)*2) +
> - sizeof(TABLE*)*(a->table_count+b->table_count),MYF(MY_WME))))
> - DBUG_RETURN(0); // Fatal error
> + const size_t lock_size= sizeof(*sql_lock) +
> + sizeof(THR_LOCK_DATA *) * ((a->lock_count + b->lock_count) * 2) +
> + sizeof(TABLE *) * (a->table_count + b->table_count);
> + if (thd)
> + {
> + sql_lock= (MYSQL_LOCK *) thd->alloc(lock_size);
> + if (!sql_lock)
> + DBUG_RETURN(0);
> + sql_lock->flags= GET_LOCK_ON_THD;
why? the commit comment for MDEV-23639 doesn't mention it.
generally, I think it's a good idea and, likely, should be used more than
just in recover_from_failed_open(). But why right now and in this commit?
Does this commit (22158b3c) rely on locks being on THD memroot? How so?
> + }
> + else
> + {
> + sql_lock= (MYSQL_LOCK *)
> + my_malloc(key_memory_MYSQL_LOCK, lock_size, MYF(MY_WME));
> + if (!sql_lock)
> + DBUG_RETURN(0);
> + sql_lock->flags= 0;
> + }
> sql_lock->lock_count=a->lock_count+b->lock_count;
> sql_lock->table_count=a->table_count+b->table_count;
> sql_lock->locks=(THR_LOCK_DATA**) (sql_lock+1);
> sql_lock->table=(TABLE**) (sql_lock->locks+sql_lock->lock_count*2);
> - sql_lock->flags= 0;
> memcpy(sql_lock->locks,a->locks,a->lock_count*sizeof(*a->locks));
> memcpy(sql_lock->locks+a->lock_count,b->locks,
> b->lock_count*sizeof(*b->locks));
> @@ -705,8 +717,10 @@ MYSQL_LOCK *mysql_lock_merge(MYSQL_LOCK *a,MYSQL_LOCK *b)
> a->lock_count, b->lock_count);
>
> /* Delete old, not needed locks */
> - my_free(a);
> - my_free(b);
> + if (!(a->flags & GET_LOCK_ON_THD))
> + my_free(a);
> + if (!(b->flags & GET_LOCK_ON_THD))
> + my_free(b);
this even looks like a bug, no check that old locks weren't on THD memroot.
(probably impossible in this particular case, but still rather fragile)
thanks!
> DBUG_RETURN(sql_lock);
> }
>
> diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
> --- a/sql/share/errmsg-utf8.txt
> +++ b/sql/share/errmsg-utf8.txt
> @@ -7985,3 +7985,5 @@ ER_JSON_TABLE_MULTIPLE_MATCHES
> eng "Can't store multiple matches of the path in the column '%s' of JSON_TABLE '%s'."
> ER_WITH_TIES_NEEDS_ORDER
> eng "FETCH ... WITH TIES requires ORDER BY clause to be present"
> +ER_VERS_HIST_PART_FAILED
> + eng "Versioned table %`s.%`s: adding HISTORY partition failed"
Can you come up with a test for this error?
> diff --git a/sql/sql_base.cc b/sql/sql_base.cc
> --- a/sql/sql_base.cc
> +++ b/sql/sql_base.cc
> @@ -931,7 +934,7 @@ void close_thread_table(THD *thd, TABLE **table_ptr)
> DBUG_PRINT("tcache", ("table: '%s'.'%s' %p", table->s->db.str,
> table->s->table_name.str, table));
> DBUG_ASSERT(!table->file->keyread_enabled());
> - DBUG_ASSERT(!table->file || table->file->inited == handler::NONE);
> + DBUG_ASSERT(table->file->inited == handler::NONE);
indeed
>
> /*
> The metadata lock must be released after giving back
> @@ -1625,6 +1625,125 @@ bool is_locked_view(THD *thd, TABLE_LIST *t)
> }
>
>
> +#ifdef WITH_PARTITION_STORAGE_ENGINE
> +/**
> + Switch part_info->hist_part and request partition creation if needed.
> +
> + @retval true Error or partition creation was requested.
> + @retval false No error
> +*/
> +bool TABLE::vers_switch_partition(THD *thd, TABLE_LIST *table_list,
> + Open_table_context *ot_ctx)
> +{
> + if (!part_info || part_info->part_type != VERSIONING_PARTITION ||
> + table_list->vers_conditions.delete_history ||
> + thd->stmt_arena->is_stmt_prepare() ||
> + table_list->lock_type < TL_WRITE_ALLOW_WRITE ||
> + table_list->mdl_request.type < MDL_SHARED_WRITE ||
> + table_list->mdl_request.type == MDL_EXCLUSIVE)
> + {
> + return false;
> + }
> +
> + switch (thd->lex->sql_command)
> + {
> + case SQLCOM_INSERT:
> + if (thd->lex->duplicates != DUP_UPDATE)
> + return false;
> + break;
> + case SQLCOM_LOAD:
> + if (thd->lex->duplicates != DUP_REPLACE)
> + return false;
> + break;
> + case SQLCOM_LOCK_TABLES:
> + case SQLCOM_DELETE:
> + case SQLCOM_UPDATE:
> + case SQLCOM_REPLACE:
> + case SQLCOM_REPLACE_SELECT:
> + case SQLCOM_DELETE_MULTI:
> + case SQLCOM_UPDATE_MULTI:
> + break;
> + default:
> + /*
> + TODO: make row events set thd->lex->sql_command appropriately.
> +
> + Sergei Golubchik: f.ex. currently row events increment
> + thd->status_var.com_stat[] each event for its own SQLCOM_xxx, it won't be
> + needed if they'll just set thd->lex->sql_command.
> + */
> + if (thd->rgi_slave && thd->rgi_slave->current_event &&
> + thd->lex->sql_command == SQLCOM_END)
> + {
> + switch (thd->rgi_slave->current_event->get_type_code())
> + {
> + case UPDATE_ROWS_EVENT:
> + case UPDATE_ROWS_EVENT_V1:
> + case DELETE_ROWS_EVENT:
> + case DELETE_ROWS_EVENT_V1:
> + break;
> + default:;
> + return false;
> + }
> + }
> + break;
> + }
> +
> + TABLE *table= this;
> +
> + /*
> + NOTE: The semantics of vers_set_hist_part() is double: even when we
twofold
> + don't need auto-create, we need to update part_info->hist_part.
> + */
> + uint *create_count= table_list->vers_skip_create ?
> + NULL : &ot_ctx->vers_create_count;
> + table_list->vers_skip_create= true;
> + if (table->part_info->vers_set_hist_part(thd, create_count))
> + {
> + MYSQL_UNBIND_TABLE(table->file);
> + tc_release_table(table);
> + return true;
> + }
> + if (ot_ctx->vers_create_count)
> + {
> + Open_table_context::enum_open_table_action action;
> + TABLE_LIST *table_arg;
> + mysql_mutex_lock(&table->s->LOCK_share);
> + if (!table->s->vers_skip_auto_create)
> + {
> + table->s->vers_skip_auto_create= true;
> + action= Open_table_context::OT_ADD_HISTORY_PARTITION;
> + table_arg= table_list;
> + }
> + else
> + {
> + /*
> + NOTE: this may repeat multiple times until creating thread acquires
> + MDL_EXCLUSIVE. Since auto-creation is rare operation this is acceptable.
> + We could suspend this thread on cond-var but we must first exit
> + MDL_SHARED_WRITE first and we cannot store cond-var into TABLE_SHARE
> + because it is already released and there is no guarantee that it will
> + be same instance if we acquire it again.
> + */
> + table_list->vers_skip_create= false;
> + ot_ctx->vers_create_count= 0;
> + action= Open_table_context::OT_REOPEN_TABLES;
> + table_arg= NULL;
> + }
I'm afraid I don't understand. All this business with vers_skip_create and
vers_skip_auto_create, it wasn't in the previous version of the patch. So, I
believe it was a fix for one of the MDEV bugs reported and fixed meanwhile.
What MDEV was it?
Could you explain what was the issue and what was the fix, please?
> + mysql_mutex_unlock(&table->s->LOCK_share);
> + if (!thd->locked_tables_mode)
> + {
> + MYSQL_UNBIND_TABLE(table->file);
> + tc_release_table(table);
> + }
> + ot_ctx->request_backoff_action(action, table_arg);
> + return true;
> + }
> +
> + return false;
> +}
> +#endif /* WITH_PARTITION_STORAGE_ENGINE */
> +
> +
> /**
> Open a base table.
>
> @@ -2572,11 +2699,13 @@ void Locked_tables_list::mark_table_for_reopen(THD *thd, TABLE *table)
> table_list; table_list= table_list->next_global)
> {
> if (table_list->table->s == share)
> + {
> table_list->table->internal_set_needs_reopen(true);
> + some_table_marked_for_reopen= 1;
> + }
> }
> /* This is needed in the case where lock tables where not used */
> table->internal_set_needs_reopen(true);
why ^^^ this doesn't count as "some_table_marked_for_reopen" ?
> - some_table_marked_for_reopen= 1;
> }
>
>
> @@ -3172,13 +3304,49 @@ Open_table_context::recover_from_failed_open()
> break;
> case OT_DISCOVER:
> case OT_REPAIR:
> - if ((result= lock_table_names(m_thd, m_thd->lex->create_info,
> - m_failed_table, NULL,
> - get_timeout(), 0)))
> + case OT_ADD_HISTORY_PARTITION:
> + if (!m_thd->locked_tables_mode)
> + result= lock_table_names(m_thd, m_thd->lex->create_info, m_failed_table,
> + NULL, get_timeout(), 0);
> + else
> + {
> + DBUG_ASSERT(!result);
> + DBUG_ASSERT(m_action == OT_ADD_HISTORY_PARTITION);
It seems you've solved recover_from_failed_open under LOCK TABLES,
so it could be possible to make OT_DISCOVER and OT_REPAIR to work there too.
Is that right? It's just a question, I don't imply it should be part of this MDEV.
> + }
> + /*
> + We are now under MDL_EXCLUSIVE mode. Other threads have no table share
> + acquired: they are blocked either at open_table_get_mdl_lock() in
> + open_table() or at lock_table_names() here.
> + */
> + if (result)
> + {
> + if (m_action == OT_ADD_HISTORY_PARTITION)
> + {
> + TABLE_SHARE *share= tdc_acquire_share(m_thd, m_failed_table,
> + GTS_TABLE, NULL);
> + if (share)
> + {
> + share->vers_skip_auto_create= false;
> + tdc_release_share(share);
> + }
> + if (m_thd->get_stmt_da()->sql_errno() == ER_LOCK_WAIT_TIMEOUT)
> + {
> + // MDEV-23642 Locking timeout caused by auto-creation affects original DML
> + m_thd->clear_error();
> + vers_create_count= 0;
> + result= false;
> + }
> + }
> break;
> + }
>
> - tdc_remove_table(m_thd, m_failed_table->db.str,
> - m_failed_table->table_name.str);
> + /*
> + We don't need to remove share under OT_ADD_HISTORY_PARTITION.
> + Moreover fast_alter_partition_table() works with TABLE instance.
> + */
> + if (m_action != OT_ADD_HISTORY_PARTITION)
> + tdc_remove_table(m_thd, m_failed_table->db.str,
> + m_failed_table->table_name.str);
>
> switch (m_action)
> {
> @@ -3206,6 +3374,70 @@ Open_table_context::recover_from_failed_open()
> case OT_REPAIR:
> result= auto_repair_table(m_thd, m_failed_table);
> break;
> + case OT_ADD_HISTORY_PARTITION:
> + {
> + result= false;
> + TABLE *table= open_ltable(m_thd, m_failed_table, TL_WRITE,
> + MYSQL_OPEN_HAS_MDL_LOCK | MYSQL_OPEN_IGNORE_LOGGING_FORMAT);
> + if (table == NULL)
> + {
> + m_thd->clear_error();
> + break;
> + }
> +
> + DBUG_ASSERT(vers_create_count);
> + result= vers_create_partitions(m_thd, m_failed_table, vers_create_count);
> + vers_create_count= 0;
> + if (!m_thd->transaction->stmt.is_empty())
> + trans_commit_stmt(m_thd);
> + DBUG_ASSERT(!result ||
> + !m_thd->locked_tables_mode ||
> + m_thd->lock->lock_count);
> + if (result)
> + break;
> + if (!m_thd->locked_tables_mode)
> + {
> + /*
> + alter_partition_lock_handling() does mysql_lock_remove() but
> + does not clear thd->lock completely.
> + */
> + DBUG_ASSERT(m_thd->lock->lock_count == 0);
> + if (!(m_thd->lock->flags & GET_LOCK_ON_THD))
> + my_free(m_thd->lock);
> + m_thd->lock= NULL;
> + }
> + else if (m_thd->locked_tables_mode == LTM_PRELOCKED)
> + {
> + MYSQL_LOCK *lock;
> + MYSQL_LOCK *merged_lock;
> +
> + /*
> + In LTM_LOCK_TABLES table was reopened via locked_tables_list,
> + but not in prelocked environment where we have to reopen
> + the table manually.
> + */
> + Open_table_context ot_ctx(m_thd, MYSQL_OPEN_REOPEN);
> + if (open_table(m_thd, m_failed_table, &ot_ctx))
> + {
> + result= true;
> + break;
> + }
> + TABLE *table= m_failed_table->table;
> + table->reginfo.lock_type= m_thd->update_lock_default;
> + m_thd->in_lock_tables= 1;
> + lock= mysql_lock_tables(m_thd, &table, 1,
> + MYSQL_OPEN_REOPEN | MYSQL_LOCK_USE_MALLOC);
> + m_thd->in_lock_tables= 0;
> + if (lock == NULL ||
> + !(merged_lock= mysql_lock_merge(m_thd->lock, lock, m_thd)))
> + {
> + result= true;
> + break;
> + }
> + m_thd->lock= merged_lock;
What MDEV does it fix?
I wonder why other cases in recover_from_failed_open doesn't do that
> + }
> + break;
> + }
> case OT_BACKOFF_AND_RETRY:
> case OT_REOPEN_TABLES:
> case OT_NO_ACTION:
> @@ -4370,6 +4611,8 @@ bool open_tables(THD *thd, const DDL_options_st &options,
> {
> if (ot_ctx.can_recover_from_failed_open())
> {
> + // FIXME: is this really used?
> + DBUG_ASSERT(0);
I take it, you weren't able to get this assert to fire?
> close_tables_for_reopen(thd, start,
> ot_ctx.start_of_statement_svp());
> if (ot_ctx.recover_from_failed_open())
Regards,
Sergei
VP of MariaDB Server Engineering
and security(a)mariadb.org