Hi, Monty! On May 10, Michael Widenius wrote:
revision-id: eebebe8ef75 (mariadb-10.6.0-77-geebebe8ef75) parent(s): 5a602e2ce7f author: Michael Widenius <michael.widenius@gmail.com> committer: Michael Widenius <michael.widenius@gmail.com> timestamp: 2021-05-04 21:09:10 +0300 message:
MDEV-23842 Atomic RENAME TABLE
diff --git a/client/mysqltest.cc b/client/mysqltest.cc index 350c55edee2..133d1f76839 100644 --- a/client/mysqltest.cc +++ b/client/mysqltest.cc @@ -8095,9 +8095,10 @@ void handle_error(struct st_command *command, const char *err_sqlstate, DYNAMIC_STRING *ds) { int i; - DBUG_ENTER("handle_error");
+ var_set_int("$errno", err_errno);
there was $mysql_errno already, was is not something you could've used?
+ command->used_replace= 1; if (command->require_file) { diff --git a/mysql-test/suite/atomic/rename_case.result b/mysql-test/suite/atomic/rename_case.result new file mode 100644 index 00000000000..4b58c555cf2 --- /dev/null +++ b/mysql-test/suite/atomic/rename_case.result @@ -0,0 +1,52 @@ +create database test2; +# +# Testing rename error in different places +# +create table t1 (a int); +create table T2 (b int); +create table t3 (c int); +create table T4 (d int); +insert into t1 values(1); +insert into T2 values(2); +insert into t3 values(3); +insert into T4 values(4); +create temporary table tmp1 (a int); +create temporary table tmp2 (b int); +create temporary table tmp3 (c int); +create temporary table tmp4 (d int); +insert into tmp1 values(11); +insert into tmp2 values(22); +insert into tmp3 values(33); +insert into tmp4 values(44); +rename table t3 to T4, t1 to t5, T2 to t1, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, t3 to T4, T2 to t1, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, T2 to t1, t3 to T4, t5 to T2; +ERROR 42S01: Table 'T4' already exists +rename table t1 to t5, T2 to t1, t5 to T2, t3 to T4; +ERROR 42S01: Table 'T4' already exists +# Try failed rename using two databases +rename table test.t1 to test2.t5, test.T2 to test.t1, t5 to test.T2; +ERROR 42S02: Table 'test.t5' doesn't exist +select t1.a+T2.b+t3.c+T4.d from t1,T2,t3,T4;
better do select t1.a,T2.b,t3.c,T4.d from t1,T2,t3,T4; same one row, but it can detect when tables are swapped. or if you need one scalar value (you do below) it could ne select CONCAT(t1.a,T2.b,t3.c,T4.d) from t1,T2,t3,T4;
+t1.a+T2.b+t3.c+T4.d +10 +select * from t5; +ERROR 42S02: Table 'test.t5' doesn't exist +T2.MYD +T2.MYI +T2.frm +T4.MYD +T4.MYI +T4.frm +db.opt +t1.MYD +t1.MYI +t1.frm +t3.MYD +t3.MYI +t3.frm +# Cleanup +drop table t1,T2,t3,T4; +drop database test2; diff --git a/sql/sql_class.h b/sql/sql_class.h index 0dcb2e04ba4..5ae0866672d 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -2835,6 +2835,7 @@ class THD: public THD_count, /* this must be first */
#ifndef MYSQL_CLIENT binlog_cache_mngr * binlog_setup_trx_data(); + ulonglong binlog_xid; /* binlog request storing of xid */
"Tell binlog to store thd->query_id in the next Query_log_event"
/* Public interface to write RBR events to the binlog diff --git a/sql/log_event_client.cc b/sql/log_event_client.cc index 6b5d71348e1..733fc14c697 100644 --- a/sql/log_event_client.cc +++ b/sql/log_event_client.cc @@ -1822,7 +1822,7 @@ bool Query_log_event::print_query_header(IO_CACHE* file, my_b_printf(file, "\t%s\tthread_id=%lu\texec_time=%lu\terror_code=%d\n", get_type_str(), (ulong) thread_id, (ulong) exec_time, - error_code)) + error_code, (ulong) xid))
huh? you didn't add "\nxid=%lu" to the format string
goto err; }
diff --git a/sql/debug_sync.h b/sql/debug_sync.h index 3b8aa8815e1..4e3e10fcc51 100644 --- a/sql/debug_sync.h +++ b/sql/debug_sync.h @@ -53,4 +53,12 @@ static inline bool debug_sync_set_action(THD *, const char *, size_t) { return false; } #endif /* defined(ENABLED_DEBUG_SYNC) */
+#ifndef DBUG_OFF +void debug_crash_here(const char *keyword); +bool debug_simulate_error(const char *keyword, uint error); +#else +#define debug_crash_here(A) do { } while(0) +#define debug_simulate_error(A, B) 0 +#endif
this doesn't belong to debug_sync.h or debug_sync.cc put it, hmm, may be in sql_priv.h ? or sql_class.h/sql_class.cc
+ #endif /* DEBUG_SYNC_INCLUDED */ diff --git a/sql/debug_sync.cc b/sql/debug_sync.cc index 1fbb15592a4..f523e22f6ce 100644 --- a/sql/debug_sync.cc +++ b/sql/debug_sync.cc @@ -1628,3 +1628,74 @@ bool debug_sync_set_action(THD *thd, const char *action_str, size_t len) /* prevent linker/lib warning about file without public symbols */ int debug_sync_dummy; #endif /* defined(ENABLED_DEBUG_SYNC) */ + + +/** + Debug utility to do crash after a set number of executions + + The user variable, either @debug_crash_counter or @debug_error_counter,
this is very hackish. better to use @@debug_dbug_counter, something like that. also it could be global for an extra benefit if you'd need to crash a non-user thread.
+ is decremented each time debug_crash() or debug_simulate_error is called + if the keyword is set with @@debug_push, like + @@debug_push="d+frm_data_type_info_emulate" + + If the variable is not set or is not an integer it will be ignored. +*/ + +#ifndef DBUG_OFF + +static const LEX_CSTRING debug_crash_counter= +{ STRING_WITH_LEN("debug_crash_counter") }; +static const LEX_CSTRING debug_error_counter= +{ STRING_WITH_LEN("debug_error_counter") }; + +static bool debug_decrement_counter(const LEX_CSTRING *name) +{ + THD *thd= current_thd; + user_var_entry *entry= (user_var_entry*) + my_hash_search(&thd->user_vars, (uchar*) name->str, name->length); + if (!entry || entry->type != INT_RESULT || ! entry->value) + return 0; + (*(ulonglong*) entry->value)= (*(ulonglong*) entry->value)-1; + return !*(ulonglong*) entry->value; +} + +void debug_crash_here(const char *keyword) +{ + DBUG_ENTER("debug_crash_here"); + DBUG_PRINT("enter", ("keyword: %s", keyword)); + + DBUG_EXECUTE_IF(keyword, + if (debug_decrement_counter(&debug_crash_counter)) + { + my_printf_error(ER_INTERNAL_ERROR, + "Crashing at %s", + MYF(ME_ERROR_LOG | ME_NOTE), keyword); + DBUG_SUICIDE(); + }); + DBUG_VOID_RETURN; +} + +/* + This can be used as debug_counter to simulate an error at a specific + position. + + Typical usage would be + if (debug_simualte_error("keyword")) + error= 1; +*/ + +bool debug_simulate_error(const char *keyword, uint error) +{ + DBUG_ENTER("debug_crash_here"); + DBUG_PRINT("enter", ("keyword: %s", keyword)); + DBUG_EXECUTE_IF(keyword, + if (debug_decrement_counter(&debug_error_counter)) + { + my_printf_error(error, + "Simulating error for '%s'", + MYF(ME_ERROR_LOG), keyword); + DBUG_RETURN(1); + }); + DBUG_RETURN(0); +} +#endif /* DBUG_OFF */ diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc index cca05f1fcca..0cf037e3a6f 100644 --- a/sql/log_event_server.cc +++ b/sql/log_event_server.cc @@ -1294,6 +1294,15 @@ bool Query_log_event::write() int3store(start, when_sec_part); start+= 3; } + + /* xid's is used with ddl_log handling */
it's not really an XID, it's a query_id or the DDL statetent. XID is a combination of a 64-byte global transaction identifier and a 64-byte branch qualifier. It would be much less confusing not to pretend that the query_id is an "xid" is some shape or form. Better to rename it to, say ddl_query_id. Or even ddl_id, if you like it short.
+ if (thd && thd->binlog_xid) + { + *start++= Q_XID; + int8store(start, thd->query_id); + start+= 8; + } + /* NOTE: When adding new status vars, please don't forget to update the MAX_SIZE_LOG_EVENT_STATUS in log_event.h and update the function diff --git a/sql/log.cc b/sql/log.cc index 17c599627cb..6e0a9706ec8 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -10654,6 +10669,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name, { if (ha_recover(&xids)) goto err2; + if (ddl_log_close_binlogged_events(&xids))
your xids and normal Xid_log_event xids are from two different non-intersecting sets and have even different semantics. It'd be safer not to have them in the same hash.
+ goto err2;
free_root(&mem_root, MYF(0)); my_hash_free(&xids); diff --git a/sql/sql_view.cc b/sql/sql_view.cc index 5df5ecb76a9..e6d726b30d7 100644 --- a/sql/sql_view.cc +++ b/sql/sql_view.cc @@ -2231,16 +2234,25 @@ mysql_rename_view(THD *thd, (uchar*)&view_def, view_parameters)) { /* restore renamed view in case of error */ - rename_in_schema_file(thd, new_db->str, new_name->str, view->db.str, - view->table_name.str); + rename_in_schema_file(thd, new_db->str, new_name->str, old_db->str, + old_name->str); goto err; } - } else + } + else DBUG_RETURN(1);
/* remove cache entries */ - query_cache_invalidate3(thd, view, 0); - sp_cache_invalidate(); + { + char key[NAME_LEN*2+1], *ptr; + memcpy(key, old_db->str, old_db->length); + ptr= key+ old_db->length; + *ptr++= 0; + memcpy(key, old_name->str, old_name->length); + ptr= key+ old_db->length; + *ptr++= 0;
there's tdc_create_key for that.
+ query_cache.invalidate(thd, key, (size_t) (ptr-key), 0); + } error= FALSE;
err: diff --git a/sql/ddl_log.h b/sql/ddl_log.h index 0747699cd85..316e6708f22 100644 --- a/sql/ddl_log.h +++ b/sql/ddl_log.h @@ -24,6 +24,9 @@ enum ddl_log_entry_code { /* + DDL_LOG_UNKOWN
s/UNKOWN/UNKNOWN/
+ Here mainly to detect blocks that are all zero + DDL_LOG_EXECUTE_CODE: This is a code that indicates that this is a log entry to be executed, from this entry a linked list of log entries @@ -34,48 +37,95 @@ enum ddl_log_entry_code DDL_IGNORE_LOG_ENTRY_CODE: An entry that is to be ignored */ - DDL_LOG_EXECUTE_CODE = 'e', - DDL_LOG_ENTRY_CODE = 'l', - DDL_IGNORE_LOG_ENTRY_CODE = 'i' + DDL_LOG_UNKNOWN= 0, + DDL_LOG_EXECUTE_CODE= 1, + DDL_LOG_ENTRY_CODE= 2, + DDL_IGNORE_LOG_ENTRY_CODE= 3, + DDL_LOG_ENTRY_CODE_LAST= 4 };
+ +/* + When adding things below, also add an entry to ddl_log_entry_phases in + ddl_log.cc +*/ + enum ddl_log_action_code { /* The type of action that a DDL_LOG_ENTRY_CODE entry is to perform. - DDL_LOG_DELETE_ACTION: - Delete an entity - DDL_LOG_RENAME_ACTION: - Rename an entity - DDL_LOG_REPLACE_ACTION: - Rename an entity after removing the previous entry with the - new name, that is replace this entry. - DDL_LOG_EXCHANGE_ACTION: - Exchange two entities by renaming them a -> tmp, b -> a, tmp -> b. */ - DDL_LOG_DELETE_ACTION = 'd', - DDL_LOG_RENAME_ACTION = 'r', - DDL_LOG_REPLACE_ACTION = 's', - DDL_LOG_EXCHANGE_ACTION = 'e' + DDL_LOG_UNKNOWN_ACTION= 0, + + /* Delete a .frm file or a table in the partition engine */ + DDL_LOG_DELETE_ACTION= 1, + + /* Rename a .frm fire a table in the partition engine */ + DDL_LOG_RENAME_ACTION= 2, + + /* + Rename an entity after removing the previous entry with the + new name, that is replace this entry. + */ + DDL_LOG_REPLACE_ACTION= 3, + + /* Exchange two entities by renaming them a -> tmp, b -> a, tmp -> b */ + DDL_LOG_EXCHANGE_ACTION= 4, + /* + log do_rename(): Rename of .frm file, table, stat_tables and triggers + */ + DDL_LOG_RENAME_TABLE_ACTION= 5, + DDL_LOG_RENAME_VIEW_ACTION= 6, + DDL_LOG_LAST_ACTION /* End marker */ };
+ +/* Number of phases for each ddl_log_action_code */ +extern const uchar ddl_log_entry_phases[DDL_LOG_LAST_ACTION]; + + enum enum_ddl_log_exchange_phase { EXCH_PHASE_NAME_TO_TEMP= 0, EXCH_PHASE_FROM_TO_NAME= 1, EXCH_PHASE_TEMP_TO_FROM= 2 };
+enum enum_ddl_log_rename_table_phase { + DDL_RENAME_PHASE_TRIGGER= 0, + DDL_RENAME_PHASE_STAT, + DDL_RENAME_PHASE_TABLE, +}; + +/* + Setting ddl_log_entry.phase to this has the same effect as setting + action_type to DDL_IGNORE_LOG_ENTRY_CODE
this doesn't make much sense. May be you can add a line or two about why it's needed and one cannot simply set action_type to DDL_IGNORE_LOG_ENTRY_CODE instead?
+*/ + +#define DDL_LOG_FINAL_PHASE ((uchar) 0xff)
typedef struct st_ddl_log_entry { - const char *name; - const char *from_name; - const char *handler_name; - const char *tmp_name; + LEX_CSTRING name; + LEX_CSTRING from_name; + LEX_CSTRING handler_name; + LEX_CSTRING tmp_name; + LEX_CSTRING db; + LEX_CSTRING from_db; + LEX_CSTRING from_handler_name; + uchar uuid[MY_UUID_SIZE]; // UUID for new frm file + + ulonglong xid; // Xid stored in the binary log uint next_entry; - uint entry_pos; - enum ddl_log_entry_code entry_type; + uint entry_pos; // Set by write_dll_log_entry() + /* + unique_id can be used to store a unique number to check current state. + Currently it is used to store new size of frm file or link to a ddl log + entry.
is that so? I don't see it storing a file version, a new size of the frm or a link, I see it stores some kind of a counter to abort recovery when it was repeated more than DDL_LOG_MAX_RETRY times
+ */ + uint32 unique_id; // Unique id for file version + uint16 flags; // Flags unique for each command + enum ddl_log_entry_code entry_type; // Set automatically enum ddl_log_action_code action_type; /* Most actions have only one phase. REPLACE does however have two @@ -95,17 +145,52 @@ typedef struct st_ddl_log_memory_entry } DDL_LOG_MEMORY_ENTRY;
+typedef struct st_ddl_log_state +{ + /* List of ddl log entries */ + DDL_LOG_MEMORY_ENTRY *list; + /* One execute entry per list */ + DDL_LOG_MEMORY_ENTRY *execute_entry; +} DDL_LOG_STATE;
would be nice to have a small comment, explaining what is a "state" and how two lists can represent it. is execute_entry a pointer to a some element from the list?
+ + +/* These functions are for recovery */ +bool ddl_log_initialize(); +void ddl_log_release(); +bool ddl_log_close_binlogged_events(HASH *xids); +int ddl_log_execute_recovery(); + +/* functions for updating the ddl log */ bool ddl_log_write_entry(DDL_LOG_ENTRY *ddl_log_entry, DDL_LOG_MEMORY_ENTRY **active_entry); + bool ddl_log_write_execute_entry(uint first_entry, - bool complete, - DDL_LOG_MEMORY_ENTRY **active_entry); -bool ddl_log_increment_phase(uint entry_no); + DDL_LOG_MEMORY_ENTRY **active_entry); +bool ddl_log_disable_execute_entry(DDL_LOG_MEMORY_ENTRY **active_entry); + +void ddl_log_complete(DDL_LOG_STATE *ddl_log_state); +void ddl_log_revert(THD *thd, DDL_LOG_STATE *ddl_log_state); + +bool ddl_log_update_phase(DDL_LOG_STATE *entry, uchar phase); +bool ddl_log_update_xid(DDL_LOG_STATE *state, ulonglong xid); +bool ddl_log_disable_entry(DDL_LOG_STATE *state); +bool ddl_log_increment_phase(uint entry_pos); void ddl_log_release_memory_entry(DDL_LOG_MEMORY_ENTRY *log_entry); bool ddl_log_sync(); -void ddl_log_release(); -void ddl_log_execute_recovery(); bool ddl_log_execute_entry(THD *thd, uint first_entry);
+void ddl_log_release_entries(DDL_LOG_STATE *ddl_log_state); +bool ddl_log_rename_table(THD *thd, DDL_LOG_STATE *ddl_state, + handlerton *hton, + const LEX_CSTRING *org_db, + const LEX_CSTRING *org_alias, + const LEX_CSTRING *new_db, + const LEX_CSTRING *new_alias); +bool ddl_log_rename_view(THD *thd, DDL_LOG_STATE *ddl_state, + const LEX_CSTRING *org_db, + const LEX_CSTRING *org_alias, + const LEX_CSTRING *new_db, + const LEX_CSTRING *new_alias); + extern mysql_mutex_t LOCK_gdl; #endif /* DDL_LOG_INCLUDED */ diff --git a/sql/sql_rename.cc b/sql/sql_rename.cc index b7aed97a8a2..2b2f32cf126 100644 --- a/sql/sql_rename.cc +++ b/sql/sql_rename.cc @@ -186,49 +172,43 @@ bool mysql_rename_tables(THD *thd, TABLE_LIST *table_list, bool silent, /* Add IF EXISTS to binary log */ thd->variables.option_bits|= OPTION_IF_EXISTS; } + + debug_crash_here("ddl_log_rename_before_binlog"); + /* + Store xid in ddl log and binary log so that we can check on ddl recovery + if the item is in the binary log (and thus the operation was complete + */ + thd->binlog_xid= thd->query_id; + ddl_log_update_xid(&ddl_log_state, thd->binlog_xid); binlog_error= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
I'd prefer to avoid blowing up THD with arbitrary flags that are only needed rarely in one specific place in the code to pass some argument few stack frames down. Perhaps you can do instead something like bool Query_log_event::write() { ... /* xid's is used with ddl_log handling */ if (ddl_log_entry_incomplete()) { *start++= Q_XID; int8store(start, thd->query_id); start+= 8; } or at least you could replace ulonglong binlog_xid with a one bit state, like thd->server_status & SERVER_STATUS_IN_DDL or with some other bitmap of flags.
+ if (binlog_error) + error= 1; + thd->binlog_xid= 0; thd->variables.option_bits= save_option_bits; + debug_crash_here("ddl_log_rename_after_binlog");
if (likely(!binlog_error)) my_ok(thd); }
if (likely(!error)) + { query_cache_invalidate3(thd, table_list, 0); + ddl_log_complete(&ddl_log_state); + } + else + { + /* Revert the renames of normal tables with the help of the ddl log */ + ddl_log_revert(thd, &ddl_log_state); + }
err: DBUG_RETURN(error || binlog_error); }
-/* - reverse table list - - SYNOPSIS - reverse_table_list() - table_list pointer to table _list - - RETURN - pointer to new (reversed) list -*/ -static TABLE_LIST *reverse_table_list(TABLE_LIST *table_list) -{ - TABLE_LIST *prev= 0; - - while (table_list) - { - TABLE_LIST *next= table_list->next_local; - table_list->next_local= prev; - prev= table_list; - table_list= next; - } - return (prev); -} - - static bool -do_rename_temporary(THD *thd, TABLE_LIST *ren_table, TABLE_LIST *new_table, - bool skip_error) +do_rename_temporary(THD *thd, TABLE_LIST *ren_table, TABLE_LIST *new_table)
I wonder why we never got any warnings about it
{ LEX_CSTRING *new_alias; DBUG_ENTER("do_rename_temporary"); diff --git a/sql/ddl_log.cc b/sql/ddl_log.cc index bad786b3799..777dfdddbc7 100644 --- a/sql/ddl_log.cc +++ b/sql/ddl_log.cc @@ -22,176 +22,333 @@ #include "log.h" // sql_print_error() #include "ddl_log.h" #include "ha_partition.h" // PAR_EXT +#include "sql_table.h" // build_table_filename +#include "sql_statistics.h" // rename_table_in_stats_tables +#include "sql_view.h" // mysql_rename_view() +#include "strfunc.h" // strconvert +#include <mysys_err.h> // EE_LINK
/*--------------------------------------------------------------------------
MODULE: DDL log -----------------
- This module is used to ensure that we can recover from crashes that occur - in the middle of a meta-data operation in MySQL. E.g. DROP TABLE t1, t2; - We need to ensure that both t1 and t2 are dropped and not only t1 and - also that each table drop is entirely done and not "half-baked". + This module is used to ensure that we can recover from crashes that + occur in the middle of a meta-data operation in MySQL. E.g. DROP + TABLE t1, t2; We need to ensure that both t1 and t2 are dropped and + not only t1 and also that each table drop is entirely done and not + "half-baked".
- To support this we create log entries for each meta-data statement in the - ddl log while we are executing. These entries are dropped when the - operation is completed. + To support this we create log entries for each meta-data statement + in the ddl log while we are executing. These entries are dropped + when the operation is completed.
At recovery those entries that were not completed will be executed.
There is only one ddl log in the system and it is protected by a mutex and there is a global struct that contains information about its current state.
+ DDl recovery after a crash works the following way:
DDL or ddl
+ + - ddl_log_initialize() initializes the global global_ddl_log variable + and opens the binary log if it exists. If it doesn't exists a new one + is created.
it doesn't seem to be opening a binary log. Did you mean ddl log?
+ - ddl_log_close_binlogged_events() loops over all log events and checks if + their xid (stored in the EXECUTE_CODE event) is in the binary log. If xid + exists in the binary log the entry is marked as finished in the ddl log. + - After a new binary log is created and is open for new entries, + ddl_log_execute_recovery() is executed on remaining open events: + - Loop over all events + - For each entry with DDL_LOG_ENTRY_CODE execute the remaining phases + in ddl_log_execute_entry_no_lock() + + The ddl_log.log file is created at startup and deleted when server goes down. + After the final recovery phase is done, the file is truncated. + History: First version written in 2006 by Mikael Ronstrom - Second version written in 2020 by Monty + Second version in 2020 by Monty --------------------------------------------------------------------------*/
+#define DDL_LOG_MAGIC_LENGTH 4 +/* How many times to try to execute a ddl log entry that causes crashes */ +#define DDL_LOG_MAX_RETRY 3 + +uchar ddl_log_file_magic[]= +{ (uchar) 254, (uchar) 254, (uchar) 11, (uchar) 2 }; + +/* Action names for ddl_log_action_code */ + +const char *ddl_log_action_name[DDL_LOG_LAST_ACTION]= +{ + "Unknown", "partitioning delete", "partitioning rename", + "partitioning replace", "partitioning exchange", + "rename table", "rename view" +}; + +/* Number of phases per entry */ +const uchar ddl_log_entry_phases[DDL_LOG_LAST_ACTION]= +{ + 1, 1, 2, 3, 4, 1 +}; + + struct st_global_ddl_log { - /* - We need to adjust buffer size to be able to handle downgrades/upgrades - where IO_SIZE has changed. We'll set the buffer size such that we can - handle that the buffer size was upto 4 times bigger in the version - that wrote the DDL log. - */ - char file_entry_buf[4*IO_SIZE]; - char file_name_str[FN_REFLEN]; - char *file_name; + uchar *file_entry_buf; DDL_LOG_MEMORY_ENTRY *first_free; DDL_LOG_MEMORY_ENTRY *first_used; File file_id; - uint name_len; uint num_entries; + uint name_pos; uint io_size; - bool inited; - bool do_release; - bool recovery_phase; - st_global_ddl_log() : inited(false), do_release(false) {} + bool initialized; + bool open; };
st_global_ddl_log global_ddl_log;
static?
mysql_mutex_t LOCK_gdl;
+/* Positions to different data in a ddl log block */ #define DDL_LOG_ENTRY_TYPE_POS 0 +/* + Note that ACTION_TYPE and PHASE_POS must be after each other. + See update_phase() +*/ #define DDL_LOG_ACTION_TYPE_POS 1 #define DDL_LOG_PHASE_POS 2 #define DDL_LOG_NEXT_ENTRY_POS 4 -#define DDL_LOG_NAME_POS 8 +/* Flags to remember something unique about the query, like if .frm was used */ +#define DDL_LOG_FLAG_POS 8 +/* Used to store XID entry that was written to binary log */ +#define DDL_LOG_XID_POS 10 +/* Used to store unique uuid from the .frm file */
"unique uuid" is a weird choice of words
+#define DDL_LOG_UUID_POS 18 +/* ID_POS can be used to store something unique, like file size (4 bytes) */ +#define DDL_LOG_ID_POS DDL_LOG_UUID_POS + MY_UUID_SIZE +#define DDL_LOG_END_POS DDL_LOG_ID_POS + 8
-#define DDL_LOG_NUM_ENTRY_POS 0 -#define DDL_LOG_NAME_LEN_POS 4 -#define DDL_LOG_IO_SIZE_POS 8 +/* + Position to where names are stored in the ddl log blocks. The current + value is stored in the header and can thus be changed if we need more + space for constants in the header than what is between DDL_LOG_ID_POS and + DDL_LOG_TMP_NAME_POS. +*/ +#define DDL_LOG_TMP_NAME_POS 56
+/* Definitions for the ddl log header, the first block in the file */ +/* IO_SIZE is stored in the header and can thus be changed */ +#define DDL_LOG_IO_SIZE IO_SIZE + +/* Header is stored in positions 0-3 */ +#define DDL_LOG_IO_SIZE_POS 4 +#define DDL_LOG_NAME_OFFSET_POS 6 +/* Sum of the above variables */ +#define DDL_LOG_HEADER_SIZE 4+2+2 + /** + Sync the ddl log file. + + @return Operation status + @retval FALSE Success + @retval TRUE Error +*/ + +static bool ddl_log_sync_file() +{ + DBUG_ENTER("ddl_log_sync_file"); + DBUG_RETURN(mysql_file_sync(global_ddl_log.file_id, MYF(MY_WME))); +}
I don't quite understand that, are there cases when you want to sync the ddl log and you *don't know* whether LOCK_gdl is locked or not? because if you know there should be no lock, you'd better use mysql_mutex_assert_not_owner here.
+ +/* Same as above, but ensure we have the LOCK_gdb locked */
that's an interesting Freudian slip
+ +static bool ddl_log_sync_no_lock() +{ + DBUG_ENTER("ddl_log_sync_no_lock"); + + mysql_mutex_assert_owner(&LOCK_gdl); + DBUG_RETURN(ddl_log_sync_file()); +} ... +/* + Store and read strings in ddl log buffers + + Format is: + 2 byte: length (not counting end \0) + X byte: string value of length 'length' + 1 byte: \0
a bit redundant, don't you think? Why did you do it?
+*/ + +static uchar *store_string(uchar *pos, uchar *end, const LEX_CSTRING *str) { + uint32 length= (uint32) str->length; + if (unlikely(pos + 2 + length + 1 > end)) + { + DBUG_ASSERT(0); + return end; // Overflow } + + int2store(pos, length); + if (likely(length)) + memcpy(pos+2, str->str, length); + pos[2+length]= 0; // Store end \0 + return pos + 2 + length +1; +} + + +static LEX_CSTRING get_string(uchar **pos, const uchar *end) { + LEX_CSTRING tmp; + uint32 length; + if (likely(*pos + 3 <= end)) + { + length= uint2korr(*pos); + if (likely(*pos + 2 + length + 1 <= end)) + { + char *str= (char*) *pos+2; + *pos= *pos + 2 + length + 1; + tmp.str= str; + tmp.length= length; + return tmp; } + } + /* + Overflow on read, should never happen + Set *pos to end to ensure any future calls also returns empty string + */ + DBUG_ASSERT(0); + *pos= (uchar*) end; + tmp.str= ""; + tmp.length= 0; + return tmp; }
@@ -256,38 +484,32 @@ static uint read_ddl_log_header()
static void set_global_from_ddl_log_entry(const DDL_LOG_ENTRY *ddl_log_entry)
it'd be good to have some comment explaining why you need this function. Couldn't you write directly from the ddl_log_entry? or use a local buffer? why to serialize into a buffer inside global_ddl_log?
{ + uchar *file_entry_buf= global_ddl_log.file_entry_buf, *pos, *end; + mysql_mutex_assert_owner(&LOCK_gdl);
it would be logical to have LOCK_gdl inside the global_ddl_log struct
- global_ddl_log.file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= - (char)DDL_LOG_ENTRY_CODE; - global_ddl_log.file_entry_buf[DDL_LOG_ACTION_TYPE_POS]= - (char)ddl_log_entry->action_type; - global_ddl_log.file_entry_buf[DDL_LOG_PHASE_POS]= 0; - int4store(&global_ddl_log.file_entry_buf[DDL_LOG_NEXT_ENTRY_POS], - ddl_log_entry->next_entry); - DBUG_ASSERT(strlen(ddl_log_entry->name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS], - ddl_log_entry->name, FN_REFLEN - 1); - if (ddl_log_entry->action_type == DDL_LOG_RENAME_ACTION || - ddl_log_entry->action_type == DDL_LOG_REPLACE_ACTION || - ddl_log_entry->action_type == DDL_LOG_EXCHANGE_ACTION) - { - DBUG_ASSERT(strlen(ddl_log_entry->from_name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + FN_REFLEN], - ddl_log_entry->from_name, FN_REFLEN - 1); - } - else - global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + FN_REFLEN]= 0; - DBUG_ASSERT(strlen(ddl_log_entry->handler_name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + (2*FN_REFLEN)], - ddl_log_entry->handler_name, FN_REFLEN - 1); - if (ddl_log_entry->action_type == DDL_LOG_EXCHANGE_ACTION) - { - DBUG_ASSERT(strlen(ddl_log_entry->tmp_name) < FN_REFLEN); - strmake(&global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + (3*FN_REFLEN)], - ddl_log_entry->tmp_name, FN_REFLEN - 1); - } - else - global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS + (3*FN_REFLEN)]= 0; + + file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (uchar) ddl_log_entry->entry_type; + file_entry_buf[DDL_LOG_ACTION_TYPE_POS]= (uchar) ddl_log_entry->action_type; + file_entry_buf[DDL_LOG_PHASE_POS]= (uchar) ddl_log_entry->phase; + int4store(file_entry_buf+DDL_LOG_NEXT_ENTRY_POS, ddl_log_entry->next_entry); + int2store(file_entry_buf+DDL_LOG_FLAG_POS, ddl_log_entry->flags); + int8store(file_entry_buf+DDL_LOG_XID_POS, ddl_log_entry->xid); + memcpy(file_entry_buf+DDL_LOG_UUID_POS, ddl_log_entry->uuid, MY_UUID_SIZE); + int8store(file_entry_buf+DDL_LOG_ID_POS, ddl_log_entry->unique_id); + bzero(file_entry_buf+DDL_LOG_END_POS, + global_ddl_log.name_pos - DDL_LOG_END_POS); + + pos= file_entry_buf + global_ddl_log.name_pos; + end= file_entry_buf + global_ddl_log.io_size; + + pos= store_string(pos, end, &ddl_log_entry->handler_name); + pos= store_string(pos, end, &ddl_log_entry->db); + pos= store_string(pos, end, &ddl_log_entry->name); + pos= store_string(pos, end, &ddl_log_entry->from_handler_name); + pos= store_string(pos, end, &ddl_log_entry->from_db); + pos= store_string(pos, end, &ddl_log_entry->from_name); + pos= store_string(pos, end, &ddl_log_entry->tmp_name);
you have DBUG_ASSERT(0) that all those strings won't overflow io_size. How can you guarantee that?
+ bzero(pos, global_ddl_log.io_size - (pos - file_entry_buf)); }
@@ -442,63 +691,120 @@ static bool ddl_log_sync_no_lock() ... + + +/* + Build a filename for a table, trigger file or .frm + Delete also any temporary file suffixed with ~
Where do you get these *~ files from? What creates them?
+*/ + +static void build_filename_and_delete_tmp_file(char *path, size_t path_length, + const LEX_CSTRING *db, + const LEX_CSTRING *name, + const char *ext, + PSI_file_key psi_key) +{ + uint length= build_table_filename(path, path_length-1, + db->str, name->str, ext, 0); + + path[length]= '~'; + path[length+1]= 0; + (void) mysql_file_delete(psi_key, path, MYF(0)); + path[length]= 0; }
@@ -806,154 +1296,132 @@ static bool ddl_log_execute_entry_no_lock(THD *thd, uint first_entry) bool ddl_log_write_entry(DDL_LOG_ENTRY *ddl_log_entry, DDL_LOG_MEMORY_ENTRY **active_entry) { - bool error, write_header; + bool error; + uchar *pos, *end; DBUG_ENTER("ddl_log_write_entry");
mysql_mutex_assert_owner(&LOCK_gdl); - if (init_ddl_log()) + if (!global_ddl_log.open) DBUG_RETURN(TRUE); + + ddl_log_entry->entry_type= DDL_LOG_ENTRY_CODE; set_global_from_ddl_log_entry(ddl_log_entry); - if (get_free_ddl_log_entry(active_entry, &write_header)) + if (ddl_log_get_free_entry(active_entry)) DBUG_RETURN(TRUE); + error= FALSE; + pos= global_ddl_log.file_entry_buf + global_ddl_log.name_pos; + end= global_ddl_log.file_entry_buf + global_ddl_log.io_size; DBUG_PRINT("ddl_log", - ("write type %c next %u name '%s' from_name '%s' handler '%s'" - " tmp_name '%s'", + ("type: %c next: %u handler: %s " + "to_name: '%s.%s' from_name: '%s.%s' " + "tmp_name: '%s'", (char) global_ddl_log.file_entry_buf[DDL_LOG_ACTION_TYPE_POS], ddl_log_entry->next_entry, - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + FN_REFLEN], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + (2*FN_REFLEN)], - (char*) &global_ddl_log.file_entry_buf[DDL_LOG_NAME_POS - + (3*FN_REFLEN)])); + get_string(&pos, end).str, // Handler + get_string(&pos, end).str, // to db.table + get_string(&pos, end).str, + get_string(&pos, end).str, // From db.table + get_string(&pos, end).str, + get_string(&pos, end).str)); // Tmp name
This is not a critical bug, because it's in a DBUG_PRINT, but it's still a bug. C++ (and C) standard both say that the order of evaluation of function arguments is *unspecified*.
+ if (unlikely(write_ddl_log_file_entry((*active_entry)->entry_pos))) { + ddl_log_release_memory_entry(*active_entry); + *active_entry= 0; error= TRUE; - sql_print_error("Failed to write entry_no = %u", + sql_print_error("DDL_LOG: Failed to write entry %u", (*active_entry)->entry_pos); } - if (write_header && likely(!error)) - { - (void) ddl_log_sync_no_lock(); - if (write_ddl_log_header()) - error= TRUE; - } - if (unlikely(error)) - ddl_log_release_memory_entry(*active_entry); DBUG_RETURN(error); }
/** - @brief Write final entry in the ddl log. + @brief Write or update execute entry in the ddl log.
- @details This is the last write in the ddl log. The previous log entries - have already been written but not yet synched to disk. - We write a couple of log entries that describes action to perform. - This entries are set-up in a linked list, however only when a first - execute entry is put as the first entry these will be executed. - This routine writes this first. + @details An execute entry points to the first entry that should + be excuted during recovery. In some cases it's only written once, + in other cases it's updated for each log entry to point to the new + header for the list.
+ When called, the previous log entries have already been written but not yet + synched to disk. We write a couple of log entries that describes + action to perform. This entries are set-up in a linked list, + however only when an execute entry is put as the first entry these will be + executed during recovery. + @param first_entry First entry in linked list of entries - to execute, if 0 = NULL it means that - the entry is removed and the entries - are put into the free list. - @param complete Flag indicating we are simply writing - info about that entry has been completed + to execute. @param[in,out] active_entry Entry to execute, 0 = NULL if the entry is written first time and needs to be returned. In this case the entry written is returned in this parameter - @return Operation status @retval TRUE Error @retval FALSE Success */
bool ddl_log_write_execute_entry(uint first_entry, - bool complete, DDL_LOG_MEMORY_ENTRY **active_entry) { - bool write_header= FALSE; - char *file_entry_buf= (char*)global_ddl_log.file_entry_buf; + uchar *file_entry_buf= global_ddl_log.file_entry_buf; + bool got_free_entry= 0; DBUG_ENTER("ddl_log_write_execute_entry");
mysql_mutex_assert_owner(&LOCK_gdl); - if (init_ddl_log()) - { - DBUG_RETURN(TRUE); - } - if (!complete) - { /* We haven't synched the log entries yet, we synch them now before writing the execute entry. If complete is true we haven't written
there's no 'complete' anymore
any log entries before, we are only here to write the execute entry to indicate it is done. */ (void) ddl_log_sync_no_lock();
Why do you need to sync previous log entries? As far as I understand, they're useless without DDL_LOG_EXECUTE_CODE entry, so why would it matter whether they're on disk or not after a crash?
- file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (char)DDL_LOG_EXECUTE_CODE; - } - else - file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (char)DDL_IGNORE_LOG_ENTRY_CODE; - file_entry_buf[DDL_LOG_ACTION_TYPE_POS]= 0; /* Ignored for execute entries */ - file_entry_buf[DDL_LOG_PHASE_POS]= 0; - int4store(&file_entry_buf[DDL_LOG_NEXT_ENTRY_POS], first_entry); - file_entry_buf[DDL_LOG_NAME_POS]= 0; - file_entry_buf[DDL_LOG_NAME_POS + FN_REFLEN]= 0; - file_entry_buf[DDL_LOG_NAME_POS + 2*FN_REFLEN]= 0; + bzero(file_entry_buf, global_ddl_log.io_size); + + file_entry_buf[DDL_LOG_ENTRY_TYPE_POS]= (uchar)DDL_LOG_EXECUTE_CODE; + int4store(file_entry_buf + DDL_LOG_NEXT_ENTRY_POS, first_entry); + if (!(*active_entry)) { - if (get_free_ddl_log_entry(active_entry, &write_header)) - { + if (ddl_log_get_free_entry(active_entry)) DBUG_RETURN(TRUE); + got_free_entry= TRUE; } - write_header= TRUE; - } if (write_ddl_log_file_entry((*active_entry)->entry_pos)) { - sql_print_error("Error writing execute entry in ddl log"); - ddl_log_release_memory_entry(*active_entry); - DBUG_RETURN(TRUE); - } - (void) ddl_log_sync_no_lock(); - if (write_header) + if (got_free_entry) { - if (write_ddl_log_header()) - { ddl_log_release_memory_entry(*active_entry); + *active_entry= 0; + } + sql_print_error("DDL_LOG: Error writing execute entry %u", + (*active_entry)->entry_pos); DBUG_RETURN(TRUE); } - } + (void) ddl_log_sync_no_lock(); DBUG_RETURN(FALSE); }
- /** - Deactivate an individual entry. + Increment phase for enty. Will deactivate entry after all phases are done
@details see ddl_log_increment_phase_no_lock.
- @param entry_no Entry position of record to change + @param entry_pos Entry position of record to change
@return Operation status @retval TRUE Error @retval FALSE Success */
-bool ddl_log_increment_phase(uint entry_no) +bool ddl_log_increment_phase(uint entry_pos) { bool error; DBUG_ENTER("ddl_log_increment_phase");
mysql_mutex_lock(&LOCK_gdl); - error= ddl_log_increment_phase_no_lock(entry_no); + error= ddl_log_increment_phase_no_lock(entry_pos); mysql_mutex_unlock(&LOCK_gdl); DBUG_RETURN(error); } @@ -1042,87 +1484,157 @@ static void close_ddl_log() (void) mysql_file_close(global_ddl_log.file_id, MYF(MY_WME)); global_ddl_log.file_id= (File) -1; } + global_ddl_log.open= 0; DBUG_VOID_RETURN; }
/** + Loop over ddl log excute entries and mark those that are already stored + in the binary log as completed + + @return + @retval 0 ok + @return 1 fail (write error) + +*/ + +bool ddl_log_close_binlogged_events(HASH *xids) +{ + uint i; + DDL_LOG_ENTRY ddl_log_entry; + DBUG_ENTER("ddl_log_close_binlogged_events"); + + if (global_ddl_log.num_entries == 0 || xids->records == 0) + DBUG_RETURN(0); + + mysql_mutex_lock(&LOCK_gdl); + for (i= 1; i <= global_ddl_log.num_entries; i++) + { + if (read_ddl_log_entry(i, &ddl_log_entry)) + break; // Read error. Ignore
it's not really "ignore", it's more like "stop reading"
+ if (ddl_log_entry.entry_type == DDL_LOG_EXECUTE_CODE && + ddl_log_entry.xid != 0 && + my_hash_search(xids, (uchar*) &ddl_log_entry.xid, + sizeof(ddl_log_entry.xid))) + { + if (disable_execute_entry(i)) + { + mysql_mutex_unlock(&LOCK_gdl); + DBUG_RETURN(1); // Write error. Fatal! + } + } + } + (void) ddl_log_sync_no_lock(); + mysql_mutex_unlock(&LOCK_gdl); + DBUG_RETURN(0); +} + + +/** Execute the ddl log at recovery of MySQL Server. + + @return + @retval 0 Ok. + @retval > 0 Fatal error. We have to abort (can't create ddl log) + @return < -1 Recovery failed, but new log exists and is usable + */
-void ddl_log_execute_recovery() +int ddl_log_execute_recovery() { - uint num_entries, i; + uint i, count= 0; + int error= 0; THD *thd; DDL_LOG_ENTRY ddl_log_entry; - char file_name[FN_REFLEN]; static char recover_query_string[]= "INTERNAL DDL LOG RECOVER IN PROGRESS"; DBUG_ENTER("ddl_log_execute_recovery");
- /* - Initialise global_ddl_log struct - */ - bzero(global_ddl_log.file_entry_buf, sizeof(global_ddl_log.file_entry_buf)); - global_ddl_log.inited= FALSE; - global_ddl_log.recovery_phase= TRUE; - global_ddl_log.io_size= IO_SIZE; - global_ddl_log.file_id= (File) -1; + if (global_ddl_log.num_entries == 0) + DBUG_RETURN(0);
/* To be able to run this from boot, we allocate a temporary THD */ if (!(thd=new THD(0))) - DBUG_VOID_RETURN; + { + DBUG_ASSERT(0); // Fatal error + DBUG_RETURN(1); + } thd->thread_stack= (char*) &thd; thd->store_globals(); + thd->log_all_errors= (global_system_variables.log_warnings >= 3);
thd->set_query(recover_query_string, strlen(recover_query_string));
- /* this also initialize LOCK_gdl */ - num_entries= read_ddl_log_header(); mysql_mutex_lock(&LOCK_gdl); - for (i= 1; i < num_entries + 1; i++) + for (i= 1; i <= global_ddl_log.num_entries; i++) { if (read_ddl_log_entry(i, &ddl_log_entry)) { - sql_print_error("Failed to read entry no = %u from ddl log", - i); + error= -1; continue; } if (ddl_log_entry.entry_type == DDL_LOG_EXECUTE_CODE) { + /* purecov: begin tested */
really? you tested with purify?
+ if (ddl_log_entry.unique_id > DDL_LOG_MAX_RETRY) + { + error= -1; + continue; + } + update_unique_id(i, ++ddl_log_entry.unique_id); + if (ddl_log_entry.unique_id > DDL_LOG_MAX_RETRY) + { + sql_print_error("DDL_LOG: Aborting executing entry %u after %llu " + "retries", i, ddl_log_entry.unique_id); + error= -1; + continue; + } + /* purecov: end tested */ if (ddl_log_execute_entry_no_lock(thd, ddl_log_entry.next_entry)) { - /* Real unpleasant scenario but we continue anyways. */ + /* Real unpleasant scenario but we have to continue anyway */
why can this happen? do you have an example? besides hardware failure, like a disk cannot be read
+ error= -1; continue; } + count++; } } close_ddl_log(); - create_ddl_log_file_name(file_name); - (void) mysql_file_delete(key_file_global_ddl_log, file_name, MYF(0)); - global_ddl_log.recovery_phase= FALSE; mysql_mutex_unlock(&LOCK_gdl); thd->reset_query(); delete thd; - DBUG_VOID_RETURN; + + /* + Create a new ddl_log to get rid of old stuff and ensure that header matches + the current source version + */
because here you delete/overwrite the old ddl log, it'd make sense to log all details of every entry that you failed to execute but decided to continue anyway. Now you log only the entry number on failure, which is rather useless, because it's a number in a ddl log that you overwrite.
+ if (create_ddl_log()) + DBUG_RETURN(1); + if (count > 0) + sql_print_information("DDL_LOG: Crash recovery executed %u entries", + count); + DBUG_RETURN(error); }
/** - Release all memory allocated to the ddl log. + Release all memory allocated to the ddl log and delete the ddl log */
void ddl_log_release() { + char file_name[FN_REFLEN]; DDL_LOG_MEMORY_ENTRY *free_list; DDL_LOG_MEMORY_ENTRY *used_list; DBUG_ENTER("ddl_log_release");
- if (!global_ddl_log.do_release) + if (!global_ddl_log.initialized) DBUG_VOID_RETURN;
- mysql_mutex_lock(&LOCK_gdl); + global_ddl_log.initialized= 0; + free_list= global_ddl_log.first_free; used_list= global_ddl_log.first_used; while (used_list)
Regards, Sergei VP of MariaDB Server Engineering and security@mariadb.org