Re: [Maria-developers] eebebe8ef75: MDEV-23842 Atomic RENAME TABLE
Hi, Monty! On May 10, Michael Widenius wrote:
revision-id: eebebe8ef75 (mariadb-10.6.0-77-geebebe8ef75) parent(s): 5a602e2ce7f author: Michael Widenius <michael.widenius@gmail.com> committer: Michael Widenius <michael.widenius@gmail.com> timestamp: 2021-05-04 21:09:10 +0300 message:
MDEV-23842 Atomic RENAME TABLE
diff --git a/client/mysqltest.cc b/client/mysqltest.cc index 350c55edee2..133d1f76839 100644 --- a/client/mysqltest.cc +++ b/client/mysqltest.cc @@ -8095,9 +8095,10 @@ void handle_error(struct st_command *command, const char *err_sqlstate, DYNAMIC_STRING *ds) { int i; - DBUG_ENTER("handle_error");
+ var_set_int("$errno", err_errno);
there was $mysql_errno already, was is not something you could've used?
+ command->used_replace= 1; if (command->require_file) { diff --git a/mysql-test/suite/atomic/rename_case.result b/mysql-test/suite/atomic/rename_case.result new file mode 100644 index 00000000000..4b58c555cf2 --- /dev/null +++ b/mysql-test/suite/atomic/rename_case.result @@ -0,0 +1,52 @@ +create database test2; +# +# Testing rename error in different places +# +create table t1 (a int); +create table T2 (b int); +create table t3 (c int); +create table T4 (d int); +insert into t1 values(1); +insert into T2 values(2); +insert into t3 values(3); +insert into T4 values(4); +create temporary table tmp1 (a int); +create temporary table tmp2 (b int); +create temporary table tmp3 (c int); +create temporary table tmp4 (d int); +insert into tmp1 values(11); +insert into tmp2 values(22); +insert into tmp3 values(33); +insert into tmp4 values(44); +rename table t3 to T4, t1 to t5, T2 to t1, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, t3 to T4, T2 to t1, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, T2 to t1, t3 to T4, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, T2 to t1, t5 to T2, t3 to T4; +ERROR 42S01: Table 'T4' already exists +# Try failed rename using two databases +rename table test.t1 to test2.t5, test.T2 to test.t1, t5 to test.T2; +ERROR 42S02: Table 'test.t5' doesn't exist +select t1.a+T2.b+t3.c+T4.d from t1,T2,t3,T4;
better do select t1.a,T2.b,t3.c,T4.d from t1,T2,t3,T4; same one row, but it can detect when tables are swapped. or if you need one scalar value (you do below) it could ne select CONCAT(t1.a,T2.b,t3.c,T4.d) from t1,T2,t3,T4;
+t1.a+T2.b+t3.c+T4.d +10 +select * from t5; +ERROR 42S02: Table 'test.t5' doesn't exist +T2.MYD +T2.MYI +T2.frm +T4.MYD +T4.MYI +T4.frm +db.opt +t1.MYD +t1.MYI +t1.frm +t3.MYD +t3.MYI +t3.frm +# Cleanup +drop table t1,T2,t3,T4; +drop database test2; diff --git a/sql/sql_class.h b/sql/sql_class.h index 0dcb2e04ba4..5ae0866672d 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2835,6 +2835,7 @@ class THD: public THD_count, /* this must be first */
#ifndef MYSQL_CLIENT binlog_cache_mngr * binlog_setup_trx_data(); + ulonglong binlog_xid; /* binlog request storing of xid */
"Tell binlog to store thd->query_id in the next Query_log_event"
/* Public interface to write RBR events to the binlog diff --git a/sql/log_event_client.cc b/sql/log_event_client.cc index 6b5d71348e1..733fc14c697 100644 --- a/sql/log_event_client.cc +++ b/sql/log_event_client.cc @@ -1822,7 +1822,7 @@ bool Query_log_event::print_query_header(IO_CACHE* file, my_b_printf(file, "\t%s\tthread_id=%lu\texec_time=%lu\terror_code=%d\n", get_type_str(), (ulong) thread_id, (ulong) exec_time, - error_code)) + error_code, (ulong) xid))
huh? you didn't add "\nxid=%lu" to the format string
goto err; }
diff --git a/sql/debug_sync.h b/sql/debug_sync.h index 3b8aa8815e1..4e3e10fcc51 100644 --- a/sql/debug_sync.h +++ b/sql/debug_sync.h @@ -53,4 +53,12 @@ static inline bool debug_sync_set_action(THD *, const char *, size_t) { return false; } #endif /* defined(ENABLED_DEBUG_SYNC) */
+#ifndef DBUG_OFF +void debug_crash_here(const char *keyword); +bool debug_simulate_error(const char *keyword, uint error); +#else +#define debug_crash_here(A) do { } while(0) +#define debug_simulate_error(A, B) 0 +#endif
this doesn't belong to debug_sync.h or debug_sync.cc put it, hmm, may be in sql_priv.h ? or sql_class.h/sql_class.cc
+ #endif /* DEBUG_SYNC_INCLUDED */ diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc index 1fbb15592a4..f523e22f6ce 100644 --- a/sql/debug_sync.cc +++ b/sql/debug_sync.cc @@ -1628,3 +1628,74 @@ bool debug_sync_set_action(THD *thd, const char *action_str, size_t len) /* prevent linker/lib warning about file without public symbols */ int debug_sync_dummy; #endif /* defined(ENABLED_DEBUG_SYNC) */ + + +/** + Debug utility to do crash after a set number of executions + + The user variable, either @debug_crash_counter or @debug_error_counter,
this is very hackish. better to use @@debug_dbug_counter, something like that. also it could be global for an extra benefit if you'd need to crash a non-user thread.
+ is decremented each time debug_crash() or debug_simulate_error is called + if the keyword is set with @@debug_push, like + @@debug_push="d+frm_data_type_info_emulate" + + If the variable is not set or is not an integer it will be ignored. +*/ + +#ifndef DBUG_OFF + +static const LEX_CSTRING debug_crash_counter= +{ STRING_WITH_LEN("debug_crash_counter") }; +static const LEX_CSTRING debug_error_counter= +{ STRING_WITH_LEN("debug_error_counter") }; + +static bool debug_decrement_counter(const LEX_CSTRING *name) +{ + THD *thd= current_thd; + user_var_entry *entry= (user_var_entry*) + my_hash_search(&thd->user_vars, (uchar*) name->str, name->length); + if (!entry || entry->type != INT_RESULT || ! entry->value) + return 0; + (*(ulonglong*) entry->value)= (*(ulonglong*) entry->value)-1; + return !*(ulonglong*) entry->value; +} + +void debug_crash_here(const char *keyword) +{ + DBUG_ENTER("debug_crash_here"); + DBUG_PRINT("enter", ("keyword: %s", keyword)); + + DBUG_EXECUTE_IF(keyword, + if (debug_decrement_counter(&debug_crash_counter)) + { + my_printf_error(ER_INTERNAL_ERROR, + "Crashing at %s", + MYF(ME_ERROR_LOG | ME_NOTE), keyword); + DBUG_SUICIDE(); + }); + DBUG_VOID_RETURN; +} + +/* + This can be used as debug_counter to simulate an error at a specific + position. + + Typical usage would be + if (debug_simualte_error("keyword")) + error= 1; +*/ + +bool debug_simulate_error(const char *keyword, uint error) +{ + DBUG_ENTER("debug_crash_here"); + DBUG_PRINT("enter", ("keyword: %s", keyword)); + DBUG_EXECUTE_IF(keyword, + if (debug_decrement_counter(&debug_error_counter)) + { + my_printf_error(error, + "Simulating error for '%s'", + MYF(ME_ERROR_LOG), keyword); + DBUG_RETURN(1); + }); + DBUG_RETURN(0); +} +#endif /* DBUG_OFF */ diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index cca05f1fcca..0cf037e3a6f 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -1294,6 +1294,15 @@ bool Query_log_event::write() int3store(start, when_sec_part); start+= 3; } + + /* xid's is used with ddl_log handling */
it's not really an XID, it's a query_id or the DDL statetent. XID is a combination of a 64-byte global transaction identifier and a 64-byte branch qualifier. It would be much less confusing not to pretend that the query_id is an "xid" is some shape or form. Better to rename it to, say ddl_query_id. Or even ddl_id, if you like it short.
+ if (thd && thd->binlog_xid) + { + *start++= Q_XID; + int8store(start, thd->query_id); + start+= 8; + } + /* NOTE: When adding new status vars, please don't forget to update the MAX_SIZE_LOG_EVENT_STATUS in log_event.h and update the function diff --git a/sql/log.cc b/sql/log.cc index 17c599627cb..6e0a9706ec8 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -10654,6 +10669,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, { if (ha_recover(&xids)) goto err2; + if (ddl_log_close_binlogged_events(&xids))
your xids and normal Xid_log_event xids are from two different non-intersecting sets and have even different semantics. It'd be safer not to have them in the same hash.
+ goto err2;
free_root(&mem_root, MYF(0)); my_hash_free(&xids); diff --git a/sql/sql_view.cc b/sql/sql_view.cc index 5df5ecb76a9..e6d726b30d7 100644 --- a/sql/sql_view.cc +++ b/sql/sql_view.cc @@ -2231,16 +2234,25 @@ mysql_rename_view(THD *thd, (uchar*)&view_def, view_parameters)) { /* restore renamed view in case of error */ - rename_in_schema_file(thd, new_db->str, new_name->str, view->db.str, - view->table_name.str); + rename_in_schema_file(thd, new_db->str, new_name->str, old_db->str, + old_name->str); goto err; } - } else + } + else DBUG_RETURN(1);
/* remove cache entries */ - query_cache_invalidate3(thd, view, 0); - sp_cache_invalidate(); + { + char key[NAME_LEN*2+1], *ptr; + memcpy(key, old_db->str, old_db->length); + ptr= key+ old_db->length; + *ptr++= 0; + memcpy(key, old_name->str, old_name->length); + ptr= key+ old_db->length; + *ptr++= 0;
there's tdc_create_key for that.
+ query_cache.invalidate(thd, key, (size_t) (ptr-key), 0); + } error= FALSE;
err: diff --git a/sql/ddl_log.h b/sql/ddl_log.h index 0747699cd85..316e6708f22 100644 --- a/sql/ddl_log.h +++ b/sql/ddl_log.h @@ -24,6 +24,9 @@ enum ddl_log_entry_code { /* + DDL_LOG_UNKOWN
s/UNKOWN/UNKNOWN/
+ Here mainly to detect blocks that are all zero + DDL_LOG_EXECUTE_CODE: This is a code that indicates that this is a log entry to be executed, from this entry a linked list of log entries @@ -34,48 +37,95 @@ enum ddl_log_entry_code DDL_IGNORE_LOG_ENTRY_CODE: An entry that is to be ignored */ - DDL_LOG_EXECUTE_CODE = 'e', - DDL_LOG_ENTRY_CODE = 'l', - DDL_IGNORE_LOG_ENTRY_CODE = 'i' + DDL_LOG_UNKNOWN= 0, + DDL_LOG_EXECUTE_CODE= 1, + DDL_LOG_ENTRY_CODE= 2, + DDL_IGNORE_LOG_ENTRY_CODE= 3, + DDL_LOG_ENTRY_CODE_LAST= 4 };
+ +/* + When adding things below, also add an entry to ddl_log_entry_phases in + ddl_log.cc +*/ + enum ddl_log_action_code { /* The type of action that a DDL_LOG_ENTRY_CODE entry is to perform. - DDL_LOG_DELETE_ACTION: - Delete an entity - DDL_LOG_RENAME_ACTION: - Rename an entity - DDL_LOG_REPLACE_ACTION: - Rename an entity after removing the previous entry with the - new name, that is replace this entry. - DDL_LOG_EXCHANGE_ACTION: - Exchange two entities by renaming them a -> tmp, b -> a, tmp -> b. */ - DDL_LOG_DELETE_ACTION = 'd', - DDL_LOG_RENAME_ACTION = 'r', - DDL_LOG_REPLACE_ACTION = 's', - DDL_LOG_EXCHANGE_ACTION = 'e' + DDL_LOG_UNKNOWN_ACTION= 0, + + /* Delete a .frm file or a table in the partition engine */ + DDL_LOG_DELETE_ACTION= 1, + + /* Rename a .frm fire a table in the partition engine */ + DDL_LOG_RENAME_ACTION= 2, + + /* + Rename an entity after removing the previous entry with the + new name, that is replace this entry. + */ + DDL_LOG_REPLACE_ACTION= 3, + + /* Exchange two entities by renaming them a -> tmp, b -> a, tmp -> b */ + DDL_LOG_EXCHANGE_ACTION= 4, + /* + log do_rename(): Rename of .frm file, table, stat_tables and triggers + */ + DDL_LOG_RENAME_TABLE_ACTION= 5, + DDL_LOG_RENAME_VIEW_ACTION= 6, + DDL_LOG_LAST_ACTION /* End marker */ };
+ +/* Number of phases for each ddl_log_action_code */ +extern const uchar ddl_log_entry_phases[DDL_LOG_LAST_ACTION]; + + enum enum_ddl_log_exchange_phase { EXCH_PHASE_NAME_TO_TEMP= 0, EXCH_PHASE_FROM_TO_NAME= 1, EXCH_PHASE_TEMP_TO_FROM= 2 };
+enum enum_ddl_log_rename_table_phase { + DDL_RENAME_PHASE_TRIGGER= 0, + DDL_RENAME_PHASE_STAT, + DDL_RENAME_PHASE_TABLE, +}; + +/* + Setting ddl_log_entry.phase to this has the same effect as setting + action_type to DDL_IGNORE_LOG_ENTRY_CODE
this doesn't make much sense. May be you can add a line or two about why it's needed and one cannot simply set action_type to DDL_IGNORE_LOG_ENTRY_CODE instead?
+*/ + +#define DDL_LOG_FINAL_PHASE ((uchar) 0xff)
typedef struct st_ddl_log_entry { - const char *name; - const char *from_name; - const char *handler_name; - const char *tmp_name; + LEX_CSTRING name; + LEX_CSTRING from_name; + LEX_CSTRING handler_name; + LEX_CSTRING tmp_name; + LEX_CSTRING db; + LEX_CSTRING from_db; + LEX_CSTRING from_handler_name; + uchar uuid[MY_UUID_SIZE]; // UUID for new frm file + + ulonglong xid; // Xid stored in the binary log uint next_entry; - uint entry_pos; - enum ddl_log_entry_code entry_type; + uint entry_pos; // Set by write_dll_log_entry() + /* + unique_id can be used to store a unique number to check current state. + Currently it is used to store new size of frm file or link to a ddl log + entry.
is that so? I don't see it storing a file version, a new size of the frm or a link, I see it stores some kind of a counter to abort recovery when it was repeated more than DDL_LOG_MAX_RETRY times
+ */ + uint32 unique_id; // Unique id for file version + uint16 flags; // Flags unique for each command + enum ddl_log_entry_code entry_type; // Set automatically enum ddl_log_action_code action_type; /* Most actions have only one phase. REPLACE does however have two @@ -95,17 +145,52 @@ typedef struct st_ddl_log_memory_entry } DDL_LOG_MEMORY_ENTRY;
+typedef struct st_ddl_log_state +{ + /* List of ddl log entries */ + DDL_LOG_MEMORY_ENTRY *list; + /* One execute entry per list */ + DDL_LOG_MEMORY_ENTRY *execute_entry; +} DDL_LOG_STATE;
would be nice to have a small comment, explaining what is a "state" and how two lists can represent it. is execute_entry a pointer to a some element from the list?
+ + +/* These functions are for recovery */ +bool ddl_log_initialize(); +void ddl_log_release(); +bool ddl_log_close_binlogged_events(HASH *xids); +int ddl_log_execute_recovery(); + +/* functions for updating the ddl log */ bool ddl_log_write_entry(DDL_LOG_ENTRY *ddl_log_entry, DDL_LOG_MEMORY_ENTRY **active_entry); + bool ddl_log_write_execute_entry(uint first_entry, - bool complete, - DDL_LOG_MEMORY_ENTRY **active_entry); -bool ddl_log_increment_phase(uint entry_no); + DDL_LOG_MEMORY_ENTRY **active_entry); +bool ddl_log_disable_execute_entry(DDL_LOG_MEMORY_ENTRY **active_entry); + +void ddl_log_complete(DDL_LOG_STATE *ddl_log_state); +void ddl_log_revert(THD *thd, DDL_LOG_STATE *ddl_log_state); + +bool ddl_log_update_phase(DDL_LOG_STATE *entry, uchar phase); +bool ddl_log_update_xid(DDL_LOG_STATE *state, ulonglong xid); +bool ddl_log_disable_entry(DDL_LOG_STATE *state); +bool ddl_log_increment_phase(uint entry_pos); void ddl_log_release_memory_entry(DDL_LOG_MEMORY_ENTRY *log_entry); bool ddl_log_sync(); -void ddl_log_release(); -void ddl_log_execute_recovery(); bool ddl_log_execute_entry(THD *thd, uint first_entry);
+void ddl_log_release_entries(DDL_LOG_STATE *ddl_log_state); +bool ddl_log_rename_table(THD *thd, DDL_LOG_STATE *ddl_state, + handlerton *hton, + const LEX_CSTRING *org_db, + const LEX_CSTRING *org_alias, + const LEX_CSTRING *new_db, + const LEX_CSTRING *new_alias); +bool ddl_log_rename_view(THD *thd, DDL_LOG_STATE *ddl_state, + const LEX_CSTRING *org_db, + const LEX_CSTRING *org_alias, + const LEX_CSTRING *new_db, + const LEX_CSTRING *new_alias); + extern mysql_mutex_t LOCK_gdl; #endif /* DDL_LOG_INCLUDED */ diff --git a/sql/sql_rename.cc b/sql/sql_rename.cc index b7aed97a8a2..2b2f32cf126 100644 --- a/sql/sql_rename.cc +++ b/sql/sql_rename.cc @@ -186,49 +172,43 @@ bool mysql_rename_tables(THD *thd, TABLE_LIST *table_list, bool silent, /* Add IF EXISTS to binary log */ thd->variables.option_bits|= OPTION_IF_EXISTS; } + + debug_crash_here("ddl_log_rename_before_binlog"); + /* + Store xid in ddl log and binary log so that we can check on ddl recovery + if the item is in the binary log (and thus the operation was complete + */ + thd->binlog_xid= thd->query_id; + ddl_log_update_xid(&ddl_log_state, thd->binlog_xid); binlog_error= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
I'd prefer to avoid blowing up THD with arbitrary flags that are only needed rarely in one specific place in the code to pass some argument few stack frames down. Perhaps you can do instead something like bool Query_log_event::write() { ... /* xid's is used with ddl_log handling */ if (ddl_log_entry_incomplete()) { *start++= Q_XID; int8store(start, thd->query_id); start+= 8; } or at least you could replace ulonglong binlog_xid with a one bit state, like thd->server_status & SERVER_STATUS_IN_DDL or with some other bitmap of flags.
+ if (binlog_error) + error= 1; + thd->binlog_xid= 0; thd->variables.option_bits= save_option_bits; + debug_crash_here("ddl_log_rename_after_binlog");
if (likely(!binlog_error)) my_ok(thd); }
if (likely(!error)) + { query_cache_invalidate3(thd, table_list, 0); + ddl_log_complete(&ddl_log_state); + } + else + { + /* Revert the renames of normal tables with the help of the ddl log */ + ddl_log_revert(thd, &ddl_log_state); + }
err: DBUG_RETURN(error || binlog_error); }
-/* - reverse table list - - SYNOPSIS - reverse_table_list() - table_list pointer to table _list - - RETURN - pointer to new (reversed) list -*/ -static TABLE_LIST *reverse_table_list(TABLE_LIST *table_list) -{ - TABLE_LIST *prev= 0; - - while (table_list) - { - TABLE_LIST *next= table_list->next_local; - table_list->next_local= prev; - prev= table_list; - table_list= next; - } - return (prev); -} - - static bool -do_rename_temporary(THD *thd, TABLE_LIST *ren_table, TABLE_LIST *new_table, - bool skip_error) +do_rename_temporary(THD *thd, TABLE_LIST *ren_table, TABLE_LIST *new_table)
I wonder why we never got any warnings about it
{ LEX_CSTRING *new_alias; DBUG_ENTER("do_rename_temporary"); diff --git a/sql/ddl_log.cc b/sql/ddl_log.cc index bad786b3799..777dfdddbc7 100644 --- a/sql/ddl_log.cc +++ b/sql/ddl_log.cc @@ -22,176 +22,333 @@ #include "log.h" // sql_print_error() #include "ddl_log.h" #include "ha_partition.h" // PAR_EXT +#include "sql_table.h" // build_table_filename +#include "sql_statistics.h" // rename_table_in_stats_tables +#include "sql_view.h" // mysql_rename_view() +#include "strfunc.h" // strconvert +#include <mysys_err.h> // EE_LINK
/*--------------------------------------------------------------------------
MODULE: DDL log -----------------
- This module is used to ensure that we can recover from crashes that occur - in the middle of a meta-data operation in MySQL. E.g. DROP TABLE t1, t2; - We need to ensure that both t1 and t2 are dropped and not only t1 and - also that each table drop is entirely done and not "half-baked". + This module is used to ensure that we can recover from crashes that + occur in the middle of a meta-data operation in MySQL. E.g. DROP + TABLE t1, t2; We need to ensure that both t1 and t2 are dropped and + not only t1 and also that each table drop is entirely done and not + "half-baked".
- To support this we create log entries for each meta-data statement in the - ddl log while we are executing. These entries are dropped when the - operation is completed. + To support this we create log entries for each meta-data statement + in the ddl log while we are executing. These entries are dropped + when the operation is completed.
At recovery those entries that were not completed will be executed.
There is only one ddl log in the system and it is protected by a mutex and there is a global struct that contains information about its current state.
+ DDl recovery after a crash works the following way:
DDL or ddl
+ + - ddl_log_initialize() initializes the global global_ddl_log variable + and opens the binary log if it exists. If it doesn't exists a new one + is created.
it doesn't seem to be opening a binary log. Did you mean ddl log?
+ - ddl_log_close_binlogged_events() loops over all log events and checks if + their xid (stored in the EXECUTE_CODE event) is in the binary log. If xid + exists in the binary log the entry is marked as finished in the ddl log. + - After a new binary log is created and is open for new entries, + ddl_log_execute_recovery() is executed on remaining open events: + - Loop over all events + - For each entry with DDL_LOG_ENTRY_CODE execute the remaining phases + in ddl_log_execute_entry_no_lock() + + The ddl_log.log file is created at startup and deleted when server goes down. + After the final recovery phase is done, the file is truncated. + History: First version written in 2006 by Mikael Ronstrom - Second version written in 2020 by Monty + Second version in 2020 by Monty --------------------------------------------------------------------------*/
+#define DDL_LOG_MAGIC_LENGTH 4 +/* How many times to try to execute a ddl log entry that causes crashes */ +#define DDL_LOG_MAX_RETRY 3 + +uchar ddl_log_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 11, (uchar) 2 }; + +/* Action names for ddl_log_action_code */ + +const char *ddl_log_action_name[DDL_LOG_LAST_ACTION]= +{ + "Unknown", "partitioning delete", "partitioning rename", + "partitioning replace", "partitioning exchange", + "rename table", "rename view" +}; + +/* Number of phases per entry */ +const uchar ddl_log_entry_phases[DDL_LOG_LAST_ACTION]= +{ + 1, 1, 2, 3, 4, 1 +}; + + struct st_global_ddl_log { - /* - We need to adjust buffer size to be able to handle downgrades/upgrades - where IO_SIZE has changed. We'll set the buffer size such that we can - handle that the buffer size was upto 4 times bigger in the version - that wrote the DDL log. - */ - char file_entry_buf[4*IO_SIZE]; - char file_name_str[FN_REFLEN]; - char *file_name; + uchar *file_entry_buf; DDL_LOG_MEMORY_ENTRY *first_free; DDL_LOG_MEMORY_ENTRY *first_used; File file_id; - uint name_len; uint num_entries; + uint name_pos; uint io_size; - bool inited; - bool do_release; - bool recovery_phase; - st_global_ddl_log() : inited(false), do_release(false) {} + bool initialized; + bool open; };
st_global_ddl_log global_ddl_log;
static?
mysql_mutex_t LOCK_gdl;
+/* Positions to different data in a ddl log block */ #define DDL_LOG_ENTRY_TYPE_POS 0 +/* + Note that ACTION_TYPE and PHASE_POS must be after each other. + See update_phase() +*/ #define DDL_LOG_ACTION_TYPE_POS 1 #define DDL_LOG_PHASE_POS 2 #define DDL_LOG_NEXT_ENTRY_POS 4 -#define DDL_LOG_NAME_POS 8 +/* Flags to remember something unique about the query, like if .frm was used */ +#define DDL_LOG_FLAG_POS 8 +/* Used to store XID entry that was written to binary log */ +#define DDL_LOG_XID_POS 10 +/* Used to store unique uuid from the .frm file */
"unique uuid" is a weird choice of words
+#define DDL_LOG_UUID_POS 18 +/* ID_POS can be used to store something unique, like file size (4 bytes) */ +#define DDL_LOG_ID_POS DDL_LOG_UUID_POS + MY_UUID_SIZE +#define DDL_LOG_END_POS DDL_LOG_ID_POS + 8
-#define DDL_LOG_NUM_ENTRY_POS 0 -#define DDL_LOG_NAME_LEN_POS 4 -#define DDL_LOG_IO_SIZE_POS 8 +/* + Position to where names are stored in the ddl log blocks. The current + value is stored in the header and can thus be changed if we need more + space for constants in the header than what is between DDL_LOG_ID_POS and + DDL_LOG_TMP_NAME_POS. +*/ +#define DDL_LOG_TMP_NAME_POS 56
+/* Definitions for the ddl log header, the first block in the file */ +/* IO_SIZE is stored in the header and can thus be changed */ +#define DDL_LOG_IO_SIZE IO_SIZE + +/* Header is stored in positions 0-3 */ +#define DDL_LOG_IO_SIZE_POS 4 +#define DDL_LOG_NAME_OFFSET_POS 6 +/* Sum of the above variables */ +#define DDL_LOG_HEADER_SIZE 4+2+2 + /** + Sync the ddl log file. + + @return Operation status + @retval FALSE Success + @retval TRUE Error +*/ + +static bool ddl_log_sync_file() +{ + DBUG_ENTER("ddl_log_sync_file"); + DBUG_RETURN(mysql_file_sync(global_ddl_log.file_id, MYF(MY_WME))); +}
I don't quite understand that, are there cases when you want to sync the ddl log and you *don't know* whether LOCK_gdl is locked or not? because if you know there should be no lock, you'd better use mysql_mutex_assert_not_owner here.
+ +/* Same as above, but ensure we have the LOCK_gdb locked */
that's an interesting Freudian slip
+ +static bool ddl_log_sync_no_lock() +{ + DBUG_ENTER("ddl_log_sync_no_lock"); + + mysql_mutex_assert_owner(&LOCK_gdl); + DBUG_RETURN(ddl_log_sync_file()); +} ... +/* + Store and read strings in ddl log buffers + + Format is: + 2 byte: length (not counting end \0) + X byte: string value of length 'length' + 1 byte: \0
a bit redundant, don't you think? Why did you do it?
+*/ + +static uchar *store_string(uchar *pos, uchar *end, const LEX_CSTRING *str) { + uint32 length= (uint32) str->length; + if (unlikely(pos + 2 + length + 1 > end)) + { + DBUG_ASSERT(0); + return end; // Overflow } + + int2store(pos, length); + if (likely(length)) + memcpy(pos+2, str->str, length); + pos[2+length]= 0; // Store end \0 + return pos + 2 + length +1; +} + + +static LEX_CSTRING get_string(uchar **pos, const uchar *end) { + LEX_CSTRING tmp; + uint32 length; + if (likely(*pos + 3 <= end)) + { + length= uint2korr(*pos); + if (likely(*pos + 2 + length + 1 <= end)) + { + char *str= (char*) *pos+2; + *pos= *pos + 2 + length + 1; + tmp.str= str; + tmp.length= length; + return tmp; } + } + /* + Overflow on read, should never happen + Set *pos to end to ensure any future calls also returns empty string + */ + DBUG_ASSERT(0); + *pos= (uchar*) end; + tmp.str= ""; + tmp.length= 0; + return tmp; }
@@ -256,38 +484,32 @@ static uint read_ddl_log_header()
static void set_global_from_ddl_log_entry(const DDL_LOG_ENTRY *ddl_log_entry)
it'd be good to have some comment explaining why you need this function. Couldn't you write directly from the ddl_log_entry? or use a local buffer? why to serialize into a buffer inside global_ddl_log?
{ + uchar *file_entry_buf= global_ddl_log.file_entry_buf, *pos, *end; + mysql_mutex_assert_owner(&LOCK_gdl);
it would be logical to have LOCK_gdl inside the global_ddl_log struct
- global_ddl_log.file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= - (char)DDL_LOG_ENTRY_CODE; - global_ddl_log.file_entry_buf[DDL_LOG_ACTION_TYPE_POS]= - (char)ddl_log_entry->action_type; - global_ddl_log.file_entry_buf[DDL_LOG_PHASE_POS]= 0; - int4store(&global_ddl_log.file_entry_buf[DDL_LOG_NEXT_ENTRY_POS], - ddl_log_entry->next_entry); - DBUG_ASSERT(strlen(ddl_log_entry->name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS], - ddl_log_entry->name, FN_REFLEN - 1); - if (ddl_log_entry->action_type == DDL_LOG_RENAME_ACTION || - ddl_log_entry->action_type == DDL_LOG_REPLACE_ACTION || - ddl_log_entry->action_type == DDL_LOG_EXCHANGE_ACTION) - { - DBUG_ASSERT(strlen(ddl_log_entry->from_name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + FN_REFLEN], - ddl_log_entry->from_name, FN_REFLEN - 1); - } - else - global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + FN_REFLEN]= 0; - DBUG_ASSERT(strlen(ddl_log_entry->handler_name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + (2*FN_REFLEN)], - ddl_log_entry->handler_name, FN_REFLEN - 1); - if (ddl_log_entry->action_type == DDL_LOG_EXCHANGE_ACTION) - { - DBUG_ASSERT(strlen(ddl_log_entry->tmp_name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + (3*FN_REFLEN)], - ddl_log_entry->tmp_name, FN_REFLEN - 1); - } - else - global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + (3*FN_REFLEN)]= 0; + + file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (uchar) ddl_log_entry->entry_type; + file_entry_buf[DDL_LOG_ACTION_TYPE_POS]= (uchar) ddl_log_entry->action_type; + file_entry_buf[DDL_LOG_PHASE_POS]= (uchar) ddl_log_entry->phase; + int4store(file_entry_buf+DDL_LOG_NEXT_ENTRY_POS, ddl_log_entry->next_entry); + int2store(file_entry_buf+DDL_LOG_FLAG_POS, ddl_log_entry->flags); + int8store(file_entry_buf+DDL_LOG_XID_POS, ddl_log_entry->xid); + memcpy(file_entry_buf+DDL_LOG_UUID_POS, ddl_log_entry->uuid, MY_UUID_SIZE); + int8store(file_entry_buf+DDL_LOG_ID_POS, ddl_log_entry->unique_id); + bzero(file_entry_buf+DDL_LOG_END_POS, + global_ddl_log.name_pos - DDL_LOG_END_POS); + + pos= file_entry_buf + global_ddl_log.name_pos; + end= file_entry_buf + global_ddl_log.io_size; + + pos= store_string(pos, end, &ddl_log_entry->handler_name); + pos= store_string(pos, end, &ddl_log_entry->db); + pos= store_string(pos, end, &ddl_log_entry->name); + pos= store_string(pos, end, &ddl_log_entry->from_handler_name); + pos= store_string(pos, end, &ddl_log_entry->from_db); + pos= store_string(pos, end, &ddl_log_entry->from_name); + pos= store_string(pos, end, &ddl_log_entry->tmp_name);
you have DBUG_ASSERT(0) that all those strings won't overflow io_size. How can you guarantee that?
+ bzero(pos, global_ddl_log.io_size - (pos - file_entry_buf)); }
@@ -442,63 +691,120 @@ static bool ddl_log_sync_no_lock() ... + + +/* + Build a filename for a table, trigger file or .frm + Delete also any temporary file suffixed with ~
Where do you get these *~ files from? What creates them?
+*/ + +static void build_filename_and_delete_tmp_file(char *path, size_t path_length, + const LEX_CSTRING *db, + const LEX_CSTRING *name, + const char *ext, + PSI_file_key psi_key) +{ + uint length= build_table_filename(path, path_length-1, + db->str, name->str, ext, 0); + + path[length]= '~'; + path[length+1]= 0; + (void) mysql_file_delete(psi_key, path, MYF(0)); + path[length]= 0; }
@@ -806,154 +1296,132 @@ static bool ddl_log_execute_entry_no_lock(THD *thd, uint first_entry) bool ddl_log_write_entry(DDL_LOG_ENTRY *ddl_log_entry, DDL_LOG_MEMORY_ENTRY **active_entry) { - bool error, write_header; + bool error; + uchar *pos, *end; DBUG_ENTER("ddl_log_write_entry");
mysql_mutex_assert_owner(&LOCK_gdl); - if (init_ddl_log()) + if (!global_ddl_log.open) DBUG_RETURN(TRUE); + + ddl_log_entry->entry_type= DDL_LOG_ENTRY_CODE; set_global_from_ddl_log_entry(ddl_log_entry); - if (get_free_ddl_log_entry(active_entry, &write_header)) + if (ddl_log_get_free_entry(active_entry)) DBUG_RETURN(TRUE); + error= FALSE; + pos= global_ddl_log.file_entry_buf + global_ddl_log.name_pos; + end= global_ddl_log.file_entry_buf + global_ddl_log.io_size; DBUG_PRINT("ddl_log", - ("write type %c next %u name '%s' from_name '%s' handler '%s'" - " tmp_name '%s'", + ("type: %c next: %u handler: %s " + "to_name: '%s.%s' from_name: '%s.%s' " + "tmp_name: '%s'", (char) global_ddl_log.file_entry_buf[DDL_LOG_ACTION_TYPE_POS], ddl_log_entry->next_entry, - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + FN_REFLEN], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + (2*FN_REFLEN)], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + (3*FN_REFLEN)])); + get_string(&pos, end).str, // Handler + get_string(&pos, end).str, // to db.table + get_string(&pos, end).str, + get_string(&pos, end).str, // From db.table + get_string(&pos, end).str, + get_string(&pos, end).str)); // Tmp name
This is not a critical bug, because it's in a DBUG_PRINT, but it's still a bug. C++ (and C) standard both say that the order of evaluation of function arguments is *unspecified*.
+ if (unlikely(write_ddl_log_file_entry((*active_entry)->entry_pos))) { + ddl_log_release_memory_entry(*active_entry); + *active_entry= 0; error= TRUE; - sql_print_error("Failed to write entry_no = %u", + sql_print_error("DDL_LOG: Failed to write entry %u", (*active_entry)->entry_pos); } - if (write_header && likely(!error)) - { - (void) ddl_log_sync_no_lock(); - if (write_ddl_log_header()) - error= TRUE; - } - if (unlikely(error)) - ddl_log_release_memory_entry(*active_entry); DBUG_RETURN(error); }
/** - @brief Write final entry in the ddl log. + @brief Write or update execute entry in the ddl log.
- @details This is the last write in the ddl log. The previous log entries - have already been written but not yet synched to disk. - We write a couple of log entries that describes action to perform. - This entries are set-up in a linked list, however only when a first - execute entry is put as the first entry these will be executed. - This routine writes this first. + @details An execute entry points to the first entry that should + be excuted during recovery. In some cases it's only written once, + in other cases it's updated for each log entry to point to the new + header for the list.
+ When called, the previous log entries have already been written but not yet + synched to disk. We write a couple of log entries that describes + action to perform. This entries are set-up in a linked list, + however only when an execute entry is put as the first entry these will be + executed during recovery. + @param first_entry First entry in linked list of entries - to execute, if 0 = NULL it means that - the entry is removed and the entries - are put into the free list. - @param complete Flag indicating we are simply writing - info about that entry has been completed + to execute. @param[in,out] active_entry Entry to execute, 0 = NULL if the entry is written first time and needs to be returned. In this case the entry written is returned in this parameter - @return Operation status @retval TRUE Error @retval FALSE Success */
bool ddl_log_write_execute_entry(uint first_entry, - bool complete, DDL_LOG_MEMORY_ENTRY **active_entry) { - bool write_header= FALSE; - char *file_entry_buf= (char*)global_ddl_log.file_entry_buf; + uchar *file_entry_buf= global_ddl_log.file_entry_buf; + bool got_free_entry= 0; DBUG_ENTER("ddl_log_write_execute_entry");
mysql_mutex_assert_owner(&LOCK_gdl); - if (init_ddl_log()) - { - DBUG_RETURN(TRUE); - } - if (!complete) - { /* We haven't synched the log entries yet, we synch them now before writing the execute entry. If complete is true we haven't written
there's no 'complete' anymore
any log entries before, we are only here to write the execute entry to indicate it is done. */ (void) ddl_log_sync_no_lock();
Why do you need to sync previous log entries? As far as I understand, they're useless without DDL_LOG_EXECUTE_CODE entry, so why would it matter whether they're on disk or not after a crash?
- file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (char)DDL_LOG_EXECUTE_CODE; - } - else - file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (char)DDL_IGNORE_LOG_ENTRY_CODE; - file_entry_buf[DDL_LOG_ACTION_TYPE_POS]= 0; /* Ignored for execute entries */ - file_entry_buf[DDL_LOG_PHASE_POS]= 0; - int4store(&file_entry_buf[DDL_LOG_NEXT_ENTRY_POS], first_entry); - file_entry_buf[DDL_LOG_NAME_POS]= 0; - file_entry_buf[DDL_LOG_NAME_POS + FN_REFLEN]= 0; - file_entry_buf[DDL_LOG_NAME_POS + 2*FN_REFLEN]= 0; + bzero(file_entry_buf, global_ddl_log.io_size); + + file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (uchar)DDL_LOG_EXECUTE_CODE; + int4store(file_entry_buf + DDL_LOG_NEXT_ENTRY_POS, first_entry); + if (!(*active_entry)) { - if (get_free_ddl_log_entry(active_entry, &write_header)) - { + if (ddl_log_get_free_entry(active_entry)) DBUG_RETURN(TRUE); + got_free_entry= TRUE; } - write_header= TRUE; - } if (write_ddl_log_file_entry((*active_entry)->entry_pos)) { - sql_print_error("Error writing execute entry in ddl log"); - ddl_log_release_memory_entry(*active_entry); - DBUG_RETURN(TRUE); - } - (void) ddl_log_sync_no_lock(); - if (write_header) + if (got_free_entry) { - if (write_ddl_log_header()) - { ddl_log_release_memory_entry(*active_entry); + *active_entry= 0; + } + sql_print_error("DDL_LOG: Error writing execute entry %u", + (*active_entry)->entry_pos); DBUG_RETURN(TRUE); } - } + (void) ddl_log_sync_no_lock(); DBUG_RETURN(FALSE); }
- /** - Deactivate an individual entry. + Increment phase for enty. Will deactivate entry after all phases are done
@details see ddl_log_increment_phase_no_lock.
- @param entry_no Entry position of record to change + @param entry_pos Entry position of record to change
@return Operation status @retval TRUE Error @retval FALSE Success */
-bool ddl_log_increment_phase(uint entry_no) +bool ddl_log_increment_phase(uint entry_pos) { bool error; DBUG_ENTER("ddl_log_increment_phase");
mysql_mutex_lock(&LOCK_gdl); - error= ddl_log_increment_phase_no_lock(entry_no); + error= ddl_log_increment_phase_no_lock(entry_pos); mysql_mutex_unlock(&LOCK_gdl); DBUG_RETURN(error); } @@ -1042,87 +1484,157 @@ static void close_ddl_log() (void) mysql_file_close(global_ddl_log.file_id, MYF(MY_WME)); global_ddl_log.file_id= (File) -1; } + global_ddl_log.open= 0; DBUG_VOID_RETURN; }
/** + Loop over ddl log excute entries and mark those that are already stored + in the binary log as completed + + @return + @retval 0 ok + @return 1 fail (write error) + +*/ + +bool ddl_log_close_binlogged_events(HASH *xids) +{ + uint i; + DDL_LOG_ENTRY ddl_log_entry; + DBUG_ENTER("ddl_log_close_binlogged_events"); + + if (global_ddl_log.num_entries == 0 || xids->records == 0) + DBUG_RETURN(0); + + mysql_mutex_lock(&LOCK_gdl); + for (i= 1; i <= global_ddl_log.num_entries; i++) + { + if (read_ddl_log_entry(i, &ddl_log_entry)) + break; // Read error. Ignore
it's not really "ignore", it's more like "stop reading"
+ if (ddl_log_entry.entry_type == DDL_LOG_EXECUTE_CODE && + ddl_log_entry.xid != 0 && + my_hash_search(xids, (uchar*) &ddl_log_entry.xid, + sizeof(ddl_log_entry.xid))) + { + if (disable_execute_entry(i)) + { + mysql_mutex_unlock(&LOCK_gdl); + DBUG_RETURN(1); // Write error. Fatal! + } + } + } + (void) ddl_log_sync_no_lock(); + mysql_mutex_unlock(&LOCK_gdl); + DBUG_RETURN(0); +} + + +/** Execute the ddl log at recovery of MySQL Server. + + @return + @retval 0 Ok. + @retval > 0 Fatal error. We have to abort (can't create ddl log) + @return < -1 Recovery failed, but new log exists and is usable + */
-void ddl_log_execute_recovery() +int ddl_log_execute_recovery() { - uint num_entries, i; + uint i, count= 0; + int error= 0; THD *thd; DDL_LOG_ENTRY ddl_log_entry; - char file_name[FN_REFLEN]; static char recover_query_string[]= "INTERNAL DDL LOG RECOVER IN PROGRESS"; DBUG_ENTER("ddl_log_execute_recovery");
- /* - Initialise global_ddl_log struct - */ - bzero(global_ddl_log.file_entry_buf, sizeof(global_ddl_log.file_entry_buf)); - global_ddl_log.inited= FALSE; - global_ddl_log.recovery_phase= TRUE; - global_ddl_log.io_size= IO_SIZE; - global_ddl_log.file_id= (File) -1; + if (global_ddl_log.num_entries == 0) + DBUG_RETURN(0);
/* To be able to run this from boot, we allocate a temporary THD */ if (!(thd=new THD(0))) - DBUG_VOID_RETURN; + { + DBUG_ASSERT(0); // Fatal error + DBUG_RETURN(1); + } thd->thread_stack= (char*) &thd; thd->store_globals(); + thd->log_all_errors= (global_system_variables.log_warnings >= 3);
thd->set_query(recover_query_string, strlen(recover_query_string));
- /* this also initialize LOCK_gdl */ - num_entries= read_ddl_log_header(); mysql_mutex_lock(&LOCK_gdl); - for (i= 1; i < num_entries + 1; i++) + for (i= 1; i <= global_ddl_log.num_entries; i++) { if (read_ddl_log_entry(i, &ddl_log_entry)) { - sql_print_error("Failed to read entry no = %u from ddl log", - i); + error= -1; continue; } if (ddl_log_entry.entry_type == DDL_LOG_EXECUTE_CODE) { + /* purecov: begin tested */
really? you tested with purify?
+ if (ddl_log_entry.unique_id > DDL_LOG_MAX_RETRY) + { + error= -1; + continue; + } + update_unique_id(i, ++ddl_log_entry.unique_id); + if (ddl_log_entry.unique_id > DDL_LOG_MAX_RETRY) + { + sql_print_error("DDL_LOG: Aborting executing entry %u after %llu " + "retries", i, ddl_log_entry.unique_id); + error= -1; + continue; + } + /* purecov: end tested */ if (ddl_log_execute_entry_no_lock(thd, ddl_log_entry.next_entry)) { - /* Real unpleasant scenario but we continue anyways. */ + /* Real unpleasant scenario but we have to continue anyway */
why can this happen? do you have an example? besides hardware failure, like a disk cannot be read
+ error= -1; continue; } + count++; } } close_ddl_log(); - create_ddl_log_file_name(file_name); - (void) mysql_file_delete(key_file_global_ddl_log, file_name, MYF(0)); - global_ddl_log.recovery_phase= FALSE; mysql_mutex_unlock(&LOCK_gdl); thd->reset_query(); delete thd; - DBUG_VOID_RETURN; + + /* + Create a new ddl_log to get rid of old stuff and ensure that header matches + the current source version + */
because here you delete/overwrite the old ddl log, it'd make sense to log all details of every entry that you failed to execute but decided to continue anyway. Now you log only the entry number on failure, which is rather useless, because it's a number in a ddl log that you overwrite.
+ if (create_ddl_log()) + DBUG_RETURN(1); + if (count > 0) + sql_print_information("DDL_LOG: Crash recovery executed %u entries", + count); + DBUG_RETURN(error); }
/** - Release all memory allocated to the ddl log. + Release all memory allocated to the ddl log and delete the ddl log */
void ddl_log_release() { + char file_name[FN_REFLEN]; DDL_LOG_MEMORY_ENTRY *free_list; DDL_LOG_MEMORY_ENTRY *used_list; DBUG_ENTER("ddl_log_release");
- if (!global_ddl_log.do_release) + if (!global_ddl_log.initialized) DBUG_VOID_RETURN;
- mysql_mutex_lock(&LOCK_gdl); + global_ddl_log.initialized= 0; + free_list= global_ddl_log.first_free; used_list= global_ddl_log.first_used; while (used_list)
Regards, Sergei VP of MariaDB Server Engineering and security@mariadb.org
Hi!
diff --git a/client/mysqltest.cc b/client/mysqltest.cc index 350c55edee2..133d1f76839 100644 --- a/client/mysqltest.cc +++ b/client/mysqltest.cc @@ -8095,9 +8095,10 @@ void handle_error(struct st_command *command, const char *err_sqlstate, DYNAMIC_STRING *ds) { int i; - DBUG_ENTER("handle_error");
+ var_set_int("$errno", err_errno);
there was $mysql_errno already, was is not something you could've used?
We had both errno and mysql_errno in the code. It think it is safer to reuse $errno as it is less used. Now errno is both latest sys error and lates sql error.
diff --git a/mysql-test/suite/atomic/rename_case.result b/mysql-test/suite/atomic/rename_case.result new file mode 100644 index 00000000000..4b58c555cf2 --- /dev/null +++ b/mysql-test/suite/atomic/rename_case.result @@ -0,0 +1,52 @@ +create database test2; +# +# Testing rename error in different places +# +create table t1 (a int); +create table T2 (b int); +create table t3 (c int); +create table T4 (d int); +insert into t1 values(1); +insert into T2 values(2); +insert into t3 values(3); +insert into T4 values(4); +create temporary table tmp1 (a int); +create temporary table tmp2 (b int); +create temporary table tmp3 (c int); +create temporary table tmp4 (d int); +insert into tmp1 values(11); +insert into tmp2 values(22); +insert into tmp3 values(33); +insert into tmp4 values(44); +rename table t3 to T4, t1 to t5, T2 to t1, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, t3 to T4, T2 to t1, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, T2 to t1, t3 to T4, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, T2 to t1, t5 to T2, t3 to T4; +ERROR 42S01: Table 'T4' already exists +# Try failed rename using two databases +rename table test.t1 to test2.t5, test.T2 to test.t1, t5 to test.T2; +ERROR 42S02: Table 'test.t5' doesn't exist +select t1.a+T2.b+t3.c+T4.d from t1,T2,t3,T4;
better do
select t1.a,T2.b,t3.c,T4.d from t1,T2,t3,T4;
same one row, but it can detect when tables are swapped.
My row also detects if tables are swapped and is safe as it always returns the same result. The reason it detects swapped tabels is that each table has it's own column names and you will get an error if the tables would be swapped.
--- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2835,6 +2835,7 @@ class THD: public THD_count, /* this must be first */
#ifndef MYSQL_CLIENT binlog_cache_mngr * binlog_setup_trx_data(); + ulonglong binlog_xid; /* binlog request storing of xid */
"Tell binlog to store thd->query_id in the next Query_log_event"
Changed to: /* If set, tell binlog to store the value as query 'xid' in the next Query_log_event */ It is not always we store query_id and even if it would, the value of binlog_xid is what is stored.
+++ b/sql/log_event_client.cc @@ -1822,7 +1822,7 @@ bool Query_log_event::print_query_header(IO_CACHE* file, my_b_printf(file, "\t%s\tthread_id=%lu\texec_time=%lu\terror_code=%d\n", get_type_str(), (ulong) thread_id, (ulong) exec_time, - error_code)) + error_code, (ulong) xid))
huh? you didn't add "\nxid=%lu" to the format string
Ouch. Fixed
goto err; }
diff --git a/sql/debug_sync.h b/sql/debug_sync.h index 3b8aa8815e1..4e3e10fcc51 100644 --- a/sql/debug_sync.h +++ b/sql/debug_sync.h @@ -53,4 +53,12 @@ static inline bool debug_sync_set_action(THD *, const char *, size_t) { return false; } #endif /* defined(ENABLED_DEBUG_SYNC) */
+#ifndef DBUG_OFF +void debug_crash_here(const char *keyword); +bool debug_simulate_error(const char *keyword, uint error); +#else +#define debug_crash_here(A) do { } while(0) +#define debug_simulate_error(A, B) 0 +#endif
this doesn't belong to debug_sync.h or debug_sync.cc
put it, hmm, may be in sql_priv.h ? or sql_class.h/sql_class.cc
Added it to sql/dbug.cc and sql/debug.h as agreed
#endif /* DEBUG_SYNC_INCLUDED */ diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc index 1fbb15592a4..f523e22f6ce 100644 --- a/sql/debug_sync.cc +++ b/sql/debug_sync.cc @@ -1628,3 +1628,74 @@ bool debug_sync_set_action(THD *thd, const char *action_str, size_t len) /* prevent linker/lib warning about file without public symbols */ int debug_sync_dummy; #endif /* defined(ENABLED_DEBUG_SYNC) */ + + +/** + Debug utility to do crash after a set number of executions + + The user variable, either @debug_crash_counter or @debug_error_counter,
this is very hackish. better to use @@debug_dbug_counter, something like that. also it could be global for an extra benefit if you'd need to crash a non-user thread.
As discussed, I don't want or need global counters. For this to work for atomic ddl. they have to be local thread variables.
diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index cca05f1fcca..0cf037e3a6f 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -1294,6 +1294,15 @@ bool Query_log_event::write() int3store(start, when_sec_part); start+= 3; } + + /* xid's is used with ddl_log handling */
it's not really an XID, it's a query_id or the DDL statetent.
XID is a combination of a 64-byte global transaction identifier and a 64-byte branch qualifier.
It would be much less confusing not to pretend that the query_id is an "xid" is some shape or form. Better to rename it to, say ddl_query_id. Or even ddl_id, if you like it short.
That would be quite hard to do in the original commits as I would have to fix and recompile every commit. Better to leave it to another day and do it as a separate commit on top if we so desire.
diff --git a/sql/log.cc b/sql/log.cc index 17c599627cb..6e0a9706ec8 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -10654,6 +10669,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, { if (ha_recover(&xids)) goto err2; + if (ddl_log_close_binlogged_events(&xids))
your xids and normal Xid_log_event xids are from two different non-intersecting sets and have even different semantics.
It'd be safer not to have them in the same hash.
I talked with you about this and you wanted to have them in the same hash. I asked how to generated them and you told me to use thd->query_id. Ok, I will split them, not a big deal.
--- a/sql/sql_view.cc +++ b/sql/sql_view.cc @@ -2231,16 +2234,25 @@ mysql_rename_view(THD *thd, (uchar*)&view_def, view_parameters)) { /* restore renamed view in case of error */ - rename_in_schema_file(thd, new_db->str, new_name->str, view->db.str, - view->table_name.str); + rename_in_schema_file(thd, new_db->str, new_name->str, old_db->str, + old_name->str); goto err; } - } else + } + else DBUG_RETURN(1);
/* remove cache entries */ - query_cache_invalidate3(thd, view, 0); - sp_cache_invalidate(); + { + char key[NAME_LEN*2+1], *ptr; + memcpy(key, old_db->str, old_db->length); + ptr= key+ old_db->length; + *ptr++= 0; + memcpy(key, old_name->str, old_name->length); + ptr= key+ old_db->length; + *ptr++= 0;
there's tdc_create_key for that.
Does not work with LEX_CSTRINGs. At some point we should create another version of td_create_key that works with LEX_CSTRINGS. However, to do tha we have to fix a lot of functions that takes db_name and table_name as const char*
+ query_cache.invalidate(thd, key, (size_t) (ptr-key), 0); + } error= FALSE;
<cut>
+enum enum_ddl_log_rename_table_phase { + DDL_RENAME_PHASE_TRIGGER= 0, + DDL_RENAME_PHASE_STAT, + DDL_RENAME_PHASE_TABLE, +}; + +/* + Setting ddl_log_entry.phase to this has the same effect as setting + action_type to DDL_IGNORE_LOG_ENTRY_CODE
this doesn't make much sense.
It is very practical and thus makes perfect sense
May be you can add a line or two about why it's needed and one cannot simply set action_type to DDL_IGNORE_LOG_ENTRY_CODE instead?
Because that it now how the ddl log is supposed to work. We are never changing the action type for normal entries. Makes it easier to check and debug things if something goes wrong. In old code one disabled an entry by storing the maximum phase into a action entry. I added this as a maximum phase to make it safer (to not have to remember where to find the maxium possible phase and to safe guard against someone adding a new phase). Anyway, I fixed the comment to explain this: /* Setting ddl_log_entry.phase to this has the same effect as setting the phase to the maximum phase (..PHASE_END) for an entry. */
+*/ + +#define DDL_LOG_FINAL_PHASE ((uchar) 0xff)
typedef struct st_ddl_log_entry { - const char *name; - const char *from_name; - const char *handler_name; - const char *tmp_name; + LEX_CSTRING name; + LEX_CSTRING from_name; + LEX_CSTRING handler_name; + LEX_CSTRING tmp_name; + LEX_CSTRING db; + LEX_CSTRING from_db; + LEX_CSTRING from_handler_name; + uchar uuid[MY_UUID_SIZE]; // UUID for new frm file + + ulonglong xid; // Xid stored in the binary log uint next_entry; - uint entry_pos; - enum ddl_log_entry_code entry_type; + uint entry_pos; // Set by write_dll_log_entry() + /* + unique_id can be used to store a unique number to check current state. + Currently it is used to store new size of frm file or link to a ddl log + entry.
is that so? I don't see it storing a file version, a new size of the frm or a link, I see it stores some kind of a counter to abort recovery when it was repeated more than DDL_LOG_MAX_RETRY times
Yes, it is so (for action entries) For an execute entry it is re-used as a counter. I will update the documentation for that.
+ */ + uint32 unique_id; // Unique id for file version + uint16 flags; // Flags unique for each command + enum ddl_log_entry_code entry_type; // Set automatically enum ddl_log_action_code action_type; /* Most actions have only one phase. REPLACE does however have two @@ -95,17 +145,52 @@ typedef struct st_ddl_log_memory_entry } DDL_LOG_MEMORY_ENTRY;
+typedef struct st_ddl_log_state +{ + /* List of ddl log entries */ + DDL_LOG_MEMORY_ENTRY *list; + /* One execute entry per list */ + DDL_LOG_MEMORY_ENTRY *execute_entry; +} DDL_LOG_STATE;
would be nice to have a small comment, explaining what is a "state" and how two lists can represent it.
is execute_entry a pointer to a some element from the list?
I added this comment: /* State of the ddl log during execution of a DDL. A ddl log state has one execute entry (main entry pointing to the first action entry) and many 'action entries' linked in a list in the order they should be executed. One recovery the log is parsed and all execute entries will be executed. All entries are stored as separate blocks in the ddl recovery file. */ <cut>
diff --git a/sql/sql_rename.cc b/sql/sql_rename.cc index b7aed97a8a2..2b2f32cf126 100644 --- a/sql/sql_rename.cc +++ b/sql/sql_rename.cc @@ -186,49 +172,43 @@ bool mysql_rename_tables(THD *thd, TABLE_LIST *table_list, bool silent, /* Add IF EXISTS to binary log */ thd->variables.option_bits|= OPTION_IF_EXISTS; } + + debug_crash_here("ddl_log_rename_before_binlog"); + /* + Store xid in ddl log and binary log so that we can check on ddl recovery + if the item is in the binary log (and thus the operation was complete + */ + thd->binlog_xid= thd->query_id; + ddl_log_update_xid(&ddl_log_state, thd->binlog_xid); binlog_error= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
I'd prefer to avoid blowing up THD with arbitrary flags that are only needed rarely in one specific place in the code to pass some argument few stack frames down.
The other way would be to pass it as an argument to all binlog entries, but that would be even worse as the usage of binlog_xid is very deep in the call stack
Perhaps you can do instead something like
bool Query_log_event::write() { ... /* xid's is used with ddl_log handling */ if (ddl_log_entry_incomplete())
There is nothing of the ddl log stored in the thd, so this would not work.
{ *start++= Q_XID; int8store(start, thd->query_id); start+= 8; }
or at least you could replace ulonglong binlog_xid with a one bit state, like thd->server_status & SERVER_STATUS_IN_DDL or with some other bitmap of flags.
As binlog_xid contains the number that should be stored (no, it is not always query_id) there is no way to do this as a bit. <cut>
-/* - reverse table list - - SYNOPSIS - reverse_table_list() - table_list pointer to table _list - - RETURN - pointer to new (reversed) list -*/ -static TABLE_LIST *reverse_table_list(TABLE_LIST *table_list) -{ - TABLE_LIST *prev= 0; - - while (table_list) - { - TABLE_LIST *next= table_list->next_local; - table_list->next_local= prev; - prev= table_list; - table_list= next; - } - return (prev); -} - - static bool -do_rename_temporary(THD *thd, TABLE_LIST *ren_table, TABLE_LIST *new_table, - bool skip_error) +do_rename_temporary(THD *thd, TABLE_LIST *ren_table, TABLE_LIST *new_table)
I wonder why we never got any warnings about it
Warnings about what? Fixing a failed rename worked correctly before it server did not go down. Above I was just reusing the recovery code to do the renames back properly.
{ LEX_CSTRING *new_alias; DBUG_ENTER("do_rename_temporary"); diff --git a/sql/ddl_log.cc b/sql/ddl_log.cc index bad786b3799..777dfdddbc7 100644 --- a/sql/ddl_log.cc
<cut>
+ DDl recovery after a crash works the following way:
DDL or ddl
Fixed cut>
st_global_ddl_log global_ddl_log;
static?
Already done in later commits. <cut>
+ Sync the ddl log file. + + @return Operation status + @retval FALSE Success + @retval TRUE Error +*/ + +static bool ddl_log_sync_file() +{ + DBUG_ENTER("ddl_log_sync_file"); + DBUG_RETURN(mysql_file_sync(global_ddl_log.file_id, MYF(MY_WME))); +}
I don't quite understand that, are there cases when you want to sync the ddl log and you *don't know* whether LOCK_gdl is locked or not?
because if you know there should be no lock, you'd better use mysql_mutex_assert_not_owner here.
Bad orginal code. See top of my tree how this was cleaned up over time as I was working on the code.
+ +/* Same as above, but ensure we have the LOCK_gdb locked */
that's an interesting Freudian slip
+ +static bool ddl_log_sync_no_lock() +{ + DBUG_ENTER("ddl_log_sync_no_lock"); + + mysql_mutex_assert_owner(&LOCK_gdl); + DBUG_RETURN(ddl_log_sync_file()); +} ... +/* + Store and read strings in ddl log buffers + + Format is: + 2 byte: length (not counting end \0) + X byte: string value of length 'length' + 1 byte: \0
a bit redundant, don't you think? Why did you do it?
What is redundant? If you see the usage of the above, you will see the benfits of this format. <cut>
static void set_global_from_ddl_log_entry(const DDL_LOG_ENTRY *ddl_log_entry)
it'd be good to have some comment explaining why you need this function. Couldn't you write directly from the ddl_log_entry? or use a local buffer? why to serialize into a buffer inside global_ddl_log?
Ask Michael Ronström. This is still his logic. Anyway, it works good enough so I have not tried to change it. Having a global buffer helps a little bit with not having to do almost no mallocs when running DDL.s <cut>
+ pos= store_string(pos, end, &ddl_log_entry->tmp_name);
you have DBUG_ASSERT(0) that all those strings won't overflow io_size. How can you guarantee that?
The buffer can never overflow, that code guarantees that. What could in theory happen is the an entry could not fit into a block, in which case recovery would not work. However this is very unlikely/impossible as we don't store any big strings in it. All 'big' strings (like query) are stored separately. We are maximum storing 2 db's and 2 table names + internal tmp file (small string). This would be (64*4)*4 (assuming 4 byte character set), which is 1024 and the default block is 4K. <cut>
+/* + Build a filename for a table, trigger file or .frm + Delete also any temporary file suffixed with ~
Where do you get these *~ files from? What creates them?
Triggers, creation of frm etc. see sql_create_definition_file(). <cut>
DBUG_PRINT("ddl_log", - ("write type %c next %u name '%s' from_name '%s' handler '%s'" - " tmp_name '%s'", + ("type: %c next: %u handler: %s " + "to_name: '%s.%s' from_name: '%s.%s' " + "tmp_name: '%s'", (char) global_ddl_log.file_entry_buf[DDL_LOG_ACTION_TYPE_POS], ddl_log_entry->next_entry, - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + FN_REFLEN], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + (2*FN_REFLEN)], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + (3*FN_REFLEN)])); + get_string(&pos, end).str, // Handler + get_string(&pos, end).str, // to db.table + get_string(&pos, end).str, + get_string(&pos, end).str, // From db.table + get_string(&pos, end).str, + get_string(&pos, end).str)); // Tmp name
This is not a critical bug, because it's in a DBUG_PRINT, but it's still a bug. C++ (and C) standard both say that the order of evaluation of function arguments is *unspecified*.
This is already fixed in a later commit. <cut>
bool ddl_log_write_execute_entry(uint first_entry, - bool complete, DDL_LOG_MEMORY_ENTRY **active_entry) { - bool write_header= FALSE; - char *file_entry_buf= (char*)global_ddl_log.file_entry_buf; + uchar *file_entry_buf= global_ddl_log.file_entry_buf; + bool got_free_entry= 0; DBUG_ENTER("ddl_log_write_execute_entry");
mysql_mutex_assert_owner(&LOCK_gdl); - if (init_ddl_log()) - { - DBUG_RETURN(TRUE); - } - if (!complete) - { /* We haven't synched the log entries yet, we synch them now before writing the execute entry. If complete is true we haven't written
there's no 'complete' anymore
Fixed
any log entries before, we are only here to write the execute entry to indicate it is done. */ (void) ddl_log_sync_no_lock();
Why do you need to sync previous log entries? As far as I understand, they're useless without DDL_LOG_EXECUTE_CODE entry, so why would it matter whether they're on disk or not after a crash?
Because we have to ensure that when exceute entry is written (and can thus be executed any time), all ddl action entries must be also on disk. Thus we have to write and sync ddl action entries before writing the execute entry. <cut>
+bool ddl_log_close_binlogged_events(HASH *xids) +{ + uint i; + DDL_LOG_ENTRY ddl_log_entry; + DBUG_ENTER("ddl_log_close_binlogged_events"); + + if (global_ddl_log.num_entries == 0 || xids->records == 0) + DBUG_RETURN(0); + + mysql_mutex_lock(&LOCK_gdl); + for (i= 1; i <= global_ddl_log.num_entries; i++) + { + if (read_ddl_log_entry(i, &ddl_log_entry)) + break; // Read error. Ignore
it's not really "ignore", it's more like "stop reading"
Updated comment. <cut>
if (ddl_log_entry.entry_type == DDL_LOG_EXECUTE_CODE) { + /* purecov: begin tested */
really? you tested with purify?
Didn't you know that mtr when run with gcov, takes the above into account? Elena has verified that all code in atomic create table was executed, except for the code that was marked with purify. The effect of the above that mtr found that "all lines where tested". <cut>
+ /* purecov: end tested */ if (ddl_log_execute_entry_no_lock(thd, ddl_log_entry.next_entry)) { - /* Real unpleasant scenario but we continue anyways. */ + /* Real unpleasant scenario but we have to continue anyway */
why can this happen? do you have an example? besides hardware failure, like a disk cannot be read
InnoDB gives an error that it cannot drop a table. <cut>
+ /* + Create a new ddl_log to get rid of old stuff and ensure that header matches + the current source version + */
because here you delete/overwrite the old ddl log, it'd make sense to log all details of every entry that you failed to execute but decided to continue anyway. Now you log only the entry number on failure, which is rather useless, because it's a number in a ddl log that you overwrite.
As we have agreed, I now create a backup file before starting recovery. <cut> Regards, Monty
participants (2)
-
Michael Widenius
-
Sergei Golubchik