Re: eba212af92d: MDEV-9101 Limit size of created disk temporary files and tables
Hi, Monty, On Apr 18, Michael Widenius wrote:
commit eba212af92d Author: Michael Widenius
Date: Thu Mar 14 17:59:00 2024 +0100 diff --git a/sql/json_table.cc b/sql/json_table.cc index 6713e796eb7..da5ab0c450a 100644 --- a/sql/json_table.cc +++ b/sql/json_table.cc @@ -734,7 +734,8 @@ bool Create_json_table::finalize(THD *thd, TABLE *table,
table->db_stat= HA_OPEN_KEYFILE; if (unlikely(table->file->ha_open(table, table->s->path.str, O_RDWR, - HA_OPEN_TMP_TABLE | HA_OPEN_INTERNAL_TABLE))) + HA_OPEN_TMP_TABLE | HA_OPEN_INTERNAL_TABLE | + HA_OPEN_SIZE_TRACKING)))
shouldn't HA_OPEN_INTERNAL_TABLE always imply HA_OPEN_SIZE_TRACKING automatically?
DBUG_RETURN(true);
table->set_created(); diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c index d8a8b0a05d7..ff0c6c7a617 100644 --- a/storage/maria/ma_statrec.c +++ b/storage/maria/ma_statrec.c @@ -45,6 +45,13 @@ my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record) my_errno=HA_ERR_RECORD_FILE_FULL; return(2); } + if (info->s->tracked &&
are there temporary files that you don't want to track? are there non-temporary files that you want to track?
+ _ma_update_tmp_file_size(&info->s->track_data, + MY_ALIGN(info->s->state.state.data_file_length+ + info->s->base.pack_reclength, + MARIA_TRACK_INCREMENT_SIZE))) + return(2); + if (info->opt_flag & WRITE_CACHE_USED) { /* Cash in use */ if (my_b_write(&info->rec_cache, record, diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c index 5881456a69a..0877c966d1c 100644 --- a/storage/maria/ma_page.c +++ b/storage/maria/ma_page.c @@ -417,6 +417,13 @@ my_off_t _ma_new(register MARIA_HA *info, int level, DBUG_RETURN(HA_OFFSET_ERROR); } share->state.state.key_file_length+= block_size; + if (info->s->tracked && + _ma_update_tmp_file_size(&share->track_index, + share->state.state.key_file_length))
shouldn't you do it before share->state.state.key_file_length is increased? Currently it seems that you're failing the operation, but the file will stay increased same pattern everywhere else
+ { + mysql_mutex_unlock(&share->intern_lock); + DBUG_RETURN(HA_OFFSET_ERROR); + } /* Following is for not transactional tables */ info->state->key_file_length= share->state.state.key_file_length; mysql_mutex_unlock(&share->intern_lock); diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c index f355d0da3e8..ed2087d3091 100644 --- a/storage/maria/ma_delete_all.c +++ b/storage/maria/ma_delete_all.c @@ -132,9 +132,16 @@ int maria_delete_all_rows(MARIA_HA *info) if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || - mysql_file_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME))) + mysql_file_chsize(share->kfile.file, share->base.keystart, 0, + MYF(MY_WME)))
should be > 0 here, shouldn't it? Both for kfile and dfile.
goto err;
+ if (info->s->tracked) + { + _ma_update_tmp_file_size(&info->s->track_data, 0); + _ma_update_tmp_file_size(&info->s->track_index, share->base.keystart);
but this shouldn't update the size if mysql_file_chsize returned -1
+ } + if (_ma_initialize_data_file(share, info->dfile.file)) goto err;
diff --git a/sql/uniques.cc b/sql/uniques.cc index 36725e80a6b..44d93bb0f88 100644 --- a/sql/uniques.cc +++ b/sql/uniques.cc @@ -720,7 +720,7 @@ bool Unique::merge(TABLE *table, uchar *buff, size_t buff_size, /* Open cached file for table records if it isn't open */ if (! my_b_inited(outfile) && open_cached_file(outfile, mysql_tmpdir, TEMP_PREFIX, DISK_CHUNK_SIZE, - MYF(MY_WME))) + MYF(MY_WME | MY_TRACK | MY_TRACK_WITH_LIMIT)))
as far as I can see, open_cached_file is only used for temporary files. it should apply MY_TRACK | MY_TRACK_WITH_LIMIT internally, it'll be less error-prone
return 1;
bzero((char*) &sort_param,sizeof(sort_param)); diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c index 9a6859bfd4d..7404db84067 100644 --- a/storage/maria/ma_write.c +++ b/storage/maria/ma_write.c @@ -386,6 +387,11 @@ int maria_write(MARIA_HA *info, const uchar *record) } } } + else if (my_errno == HA_ERR_LOCAL_TMP_SPACE_FULL || + my_errno == HA_ERR_GLOBAL_TMP_SPACE_FULL) + { + filepos= HA_OFFSET_ERROR; /* Avoid write_record_abort() */
why avoid? HA_ERR_LOCAL_TMP_SPACE_FULL is like EQUOT or ENOSPC. why should be masked as HA_OFFSET_ERROR, and not be a fatal error?
+ } else fatal_error= 1;
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c index 7441e29a97b..47935d61b88 100644 --- a/storage/maria/ma_close.c +++ b/storage/maria/ma_close.c @@ -14,7 +14,7 @@ along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
-/* close a isam-database */ +/* close an Aria table */
at last :) it's what 27 years old?
/* TODO: We need to have a separate mutex on the closed file to allow other threads diff --git a/sql/sql_window.cc b/sql/sql_window.cc index e0c7cef1a9e..4959c7833a1 100644 --- a/sql/sql_window.cc +++ b/sql/sql_window.cc @@ -2922,15 +2922,16 @@ bool compute_window_func(THD *thd, if (unlikely(thd->is_error() || thd->is_killed())) break;
- /* Return to current row after notifying cursors for each window function. */ - tbl->file->ha_rnd_pos(tbl->record[0], rowid_buf); + if (tbl->file->ha_rnd_pos(tbl->record[0], rowid_buf)) + return true;
how can it fail?
}
/* We now have computed values for each window function. They can now be saved in the current row. */ - save_window_function_values(window_functions, tbl, rowid_buf); + if (save_window_function_values(window_functions, tbl, rowid_buf)) + return true;
rownum++; } diff --git a/mysql-test/main/status.test b/mysql-test/main/status.test index acf29f35b9e..b6be046a9ac 100644 --- a/mysql-test/main/status.test +++ b/mysql-test/main/status.test @@ -244,9 +244,9 @@ let $rnd_next = `show global status like 'handler_read_rnd_next'`; let $tmp_table = `show global status like 'Created_tmp_tables'`; show status like 'com_show_status'; show status like 'hand%write%'; -show status like '%tmp%'; +show status like '%_tmp%';
Doesn't look like a very meaningful change. Did you mean '%\_tmp%' ?
show status like 'hand%write%'; -show status like '%tmp%'; +show status like '%_tmp%'; show status like 'com_show_status'; let $rnd_next2 = `show global status like 'handler_read_rnd_next'`; let $tmp_table2 = `show global status like 'Created_tmp_tables'`; diff --git a/mysys/mf_cache.c b/mysys/mf_cache.c index 2fec59f4e70..712187f6181 100644 --- a/mysys/mf_cache.c +++ b/mysys/mf_cache.c @@ -43,7 +43,7 @@ my_bool open_cached_file(IO_CACHE *cache, const char* dir, const char *prefix, cache->file_name=0; cache->buffer=0; /* Mark that not open */ if (!init_io_cache(cache, -1, cache_size, WRITE_CACHE, 0L, 0, - MYF(cache_myflags | MY_NABP))) + MYF(cache_myflags | MY_NABP | MY_TRACK)))
Why not MY_TRACK_WITH_LIMIT ?
{ DBUG_RETURN(0); } diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 6f2797b6b39..324f7679e8f 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -22275,7 +22275,8 @@ bool open_tmp_table(TABLE *table) int error; if (unlikely((error= table->file->ha_open(table, table->s->path.str, O_RDWR, HA_OPEN_TMP_TABLE | - HA_OPEN_INTERNAL_TABLE)))) + HA_OPEN_INTERNAL_TABLE | + HA_OPEN_SIZE_TRACKING))))
shouldn't HA_OPEN_TMP_TABLE or HA_OPEN_INTERNAL_TABLE imply HA_OPEN_SIZE_TRACKING?
{ table->file->print_error(error, MYF(0)); /* purecov: inspected */ table->db_stat= 0; diff --git a/sql/handler.cc b/sql/handler.cc index fec473d97bb..a05b7476724 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -443,9 +444,12 @@ int ha_init_errors(void) /* Zerofill it to avoid uninitialized gaps. */
remove the comment, if you no longer zerofill
if (! (handler_errmsgs= (const char**) my_malloc(key_memory_handler_errmsgs, HA_ERR_ERRORS * sizeof(char*), - MYF(MY_WME | MY_ZEROFILL)))) + MYF(MY_WME)))) return 1;
+ /* Copy default handler error messages */ + memcpy(handler_errmsgs, handler_error_messages, HA_ERR_ERRORS * sizeof(char*)); + /* Set the dedicated error messages. */ SETMSG(HA_ERR_KEY_NOT_FOUND, ER_DEFAULT(ER_KEY_NOT_FOUND)); SETMSG(HA_ERR_FOUND_DUPP_KEY, ER_DEFAULT(ER_DUP_KEY)); diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index 705562eb795..b256466f1f0 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -778,6 +785,14 @@ typedef struct st_maria_share myf write_flag; enum data_file_type data_file_type; enum pagecache_page_type page_type; /* value depending transactional */ + + /* + tracked will cause lost bytes (not aligned) but this is ok as it is always + used with tmp_file_tracking if set
I don't understand your explanation why "this is ok"
+ */ + my_bool tracked; /* Tracked table (always internal) */ + struct tmp_file_tracking track_data,track_index; + /** if Checkpoint looking at table; protected by close_lock or THR_LOCK_maria */ diff --git a/include/my_sys.h b/include/my_sys.h index af509c7df45..0df12963ab9 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -178,6 +180,17 @@ uchar *my_large_malloc(size_t *size, myf my_flags); void my_large_free(void *ptr, size_t size); void my_large_page_truncate(size_t *size);
+/* Tracking tmp file usage */ + +struct tmp_file_tracking +{ + ulonglong last_position;
I had to read how "last_position" is used to realize that it's in fact previous file_size value. Why not to call it previous_file_size? Or recorded_file_size ? Or old_file_size? it's not a "position"
+ ulonglong file_size; +}; + +typedef int (*TMPFILE_SIZE_CB)(struct tmp_file_tracking *track, int no_error); +extern TMPFILE_SIZE_CB update_tmp_file_size; + #ifdef _WIN32 extern BOOL my_obtain_privilege(LPCSTR lpPrivilege); #endif diff --git a/sql/sql_class.h b/sql/sql_class.h index c1efb49f6da..42eee9913ec 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1070,12 +1074,22 @@ typedef struct system_status_var */
#define last_system_status_var questions -#define last_cleared_system_status_var local_memory_used + +/* Parameters to set_status_var_init() */ + +#define STATUS_OFFSET(A) offsetof(STATUS_VAR,A) +/* Clear as part of flush */ +#define clear_up_to_tmp_space_used STATUS_OFFSET(tmp_space_used) +/* Clear as part of startup */ +#define clear_up_to_memory_used STATUS_OFFSET(local_memory_used) +/* Full initialization. Note that global_memory_used is updated early! */ +#define clear_up_to_global_memory_used STATUS_OFFSET(global_memory_used) +#define last_restored_status_var clear_up_to_tmp_space_used
those are bad names. and the fact that you need to rename these defines and change all code that uses them - this kind of shows they're not good. when there will be a new variable, clear_up_to_tmp_space_used can become clear_up_to_new_variable and everything will need to be changed again. better use names that don't depend on individual variables. last_restored_status_var and last_cleared_system_status_var are good. Try to split variables logically. First there are additive variables - global value is the sum of all local values. They're cleared, they're added up tp global, etc. Then there are non-additive, not cleared, not added - like local_memory_used and tmp_space_used, for example. And global_memory_used is special because it's initialized early, right? so, may be defines based on these properties would be easier to understand. because now I'm totally at loss when to use clear_up_to_tmp_space_used, when to use clear_up_to_memory_used, when clear_up_to_global_memory_used and when last_restored_status_var.
+
/** Number of contiguous global status variables */ -constexpr int COUNT_GLOBAL_STATUS_VARS= int(offsetof(STATUS_VAR, - last_system_status_var) / - sizeof(ulong)) + 1; +constexpr int COUNT_GLOBAL_STATUS_VARS= + int(STATUS_OFFSET(last_system_status_var) /sizeof(ulong)) + 1;
/* Global status variables diff --git a/sql/filesort.cc b/sql/filesort.cc index abcd4127a0b..df4508b635a 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -1082,6 +1078,11 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, DBUG_RETURN(num_records);
err: + if (!quick_select) + { + (void) file->extra(HA_EXTRA_NO_CACHE); + file->ha_rnd_end(); + }
why?
sort_form->column_bitmaps_set(save_read_set, save_write_set); DBUG_RETURN(HA_POS_ERROR); } /* find_all_keys */ diff --git a/sql/mysqld.cc b/sql/mysqld.cc index dc4a955aa79..4727b7b5413 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1947,6 +1950,10 @@ static void mysqld_exit(int exit_code) if (exit_code == 0 || opt_endinfo) SAFEMALLOC_REPORT_MEMORY(0); } + if (global_tmp_space_used) + fprintf(stderr, "Warning: tmp_space not freed: %llu\n", + (ulonglong) global_tmp_space_used);
This is confusing. almost all (if not all) temp space is automatically freed by the OS when the process exits, so this warning will only scare the user about lost space when in fact no space is lost, it's only some space was not accounted for inside MariaDB. I suggest either make this warning debug-only or rephrase it like "Note: %llu tmp_space was lost and will be freed automatically when the process exits"
+ DBUG_LEAVE; #ifdef _WIN32 my_report_svc_status(SERVICE_STOPPED, exit_code, 0); @@ -7571,6 +7636,7 @@ SHOW_VAR status_vars[]= { {"Tc_log_page_size", (char*) &tc_log_page_size, SHOW_LONG_NOFLUSH}, {"Tc_log_page_waits", (char*) &tc_log_page_waits, SHOW_LONG}, #endif + {"tmp_space_used", (char*) offsetof(STATUS_VAR, tmp_space_used), SHOW_LONGLONG_STATUS},
first letter - upper case
#ifdef HAVE_POOL_OF_THREADS {"Threadpool_idle_threads", (char *) &show_threadpool_idle_threads, SHOW_SIMPLE_FUNC}, {"Threadpool_threads", (char *) &show_threadpool_threads, SHOW_SIMPLE_FUNC}, diff --git a/sql/log.cc b/sql/log.cc index 460cefea47b..a9bc3758dd5 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -2388,6 +2384,8 @@ bool Event_log::check_write_error(THD *thd) case ER_TRANS_CACHE_FULL: case ER_STMT_CACHE_FULL: case ER_ERROR_ON_WRITE: + case EE_LOCAL_TMP_SPACE_FULL: + case EE_GLOBAL_TMP_SPACE_FULL:
not a good idea to mix EE_ OS-like errors and ER_ SQL layer errors
case ER_BINLOG_LOGGING_IMPOSSIBLE: checked= TRUE; break; @@ -6650,7 +6651,8 @@ Event_log::flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event, { set_write_error(thd, is_transactional); if (check_cache_error(thd, cache_data) && - stmt_has_updated_non_trans_table(thd)) + (stmt_has_updated_non_trans_table(thd) || + !is_transactional))
why?
cache_data->set_incident(); delete pending; cache_data->set_pending(NULL); @@ -7868,8 +7877,16 @@ int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data) DBUG_RETURN(res ? ER_ERROR_ON_WRITE : 0); }
- if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0)) + /* + Allow flush of transaction logs to temporary go over the tmp space limit + as we do not want the commit to fail + */ + cache->myflags&= ~MY_TRACK_WITH_LIMIT; + res= reinit_io_cache(cache, READ_CACHE, 0, 0, 0); + cache->myflags|= MY_TRACK_WITH_LIMIT;
you also remove MY_TRACK_WITH_LIMIT inside reinit_io_cache(), so this is redundant. Or it's redundant inside reinit_io_cache(), either way, no point in doing it twice
+ if (res) DBUG_RETURN(ER_ERROR_ON_WRITE); + /* Amount of remaining bytes in the IO_CACHE read buffer. */ size_t log_file_pos; uchar header_buf[LOG_EVENT_HEADER_LEN]; diff --git a/mysys/mf_iocache.c b/mysys/mf_iocache.c index 4ee1331bdb3..6214057f9e3 100644 --- a/mysys/mf_iocache.c +++ b/mysys/mf_iocache.c @@ -73,6 +74,50 @@ int (*_my_b_encr_read)(IO_CACHE *info,uchar *Buffer,size_t Count)= 0; int (*_my_b_encr_write)(IO_CACHE *info,const uchar *Buffer,size_t Count)= 0;
+static inline my_bool tmp_file_track(IO_CACHE *info, ulonglong file_size) +{ + if ((info->myflags & (MY_TRACK | MY_TRACK_WITH_LIMIT)) && + update_tmp_file_size) + { + if (info->tracking.file_size < file_size) + { + int error; + info->tracking.file_size= file_size; + if ((error= update_tmp_file_size(&info->tracking, + !(info->myflags & + MY_TRACK_WITH_LIMIT)))) + { + if (info->myflags & MY_WME) + my_error(my_errno, MYF(0)); + info->error= -1; + return 1; + } + } + } + return 0; +} + +my_bool io_cache_tmp_file_track(IO_CACHE *info, ulonglong file_size) +{ + return tmp_file_track(info, file_size); +} + + +void end_tracking_io_cache(IO_CACHE *info)
what does "end tracking IO_CACHE" mean? I read it as "end tracking of this IO_CACHE, it is no longer tracked after this function call" - but it's not what it is doing. so what how do you read this function name?
+{ + if ((info->myflags & (MY_TRACK | MY_TRACK_WITH_LIMIT)) && + info->tracking.file_size) + { + info->tracking.file_size= 0; + update_tmp_file_size(&info->tracking, 1); + } +} + +void truncate_io_cache(IO_CACHE *info) +{ + if (my_chsize(info->file, 0, 0, MYF(MY_WME)) <= 0) + end_tracking_io_cache(info);
not quite. If my_chsize() returns -1, you should not do info->tracking.file_size= 0, because file size wasn't actually changed.
+}
static void init_functions(IO_CACHE* info)
Regards, Sergei Chief Architect, MariaDB Server and security@mariadb.org
Hi!
On Thu, Apr 18, 2024 at 10:57 PM Sergei Golubchik
Hi, Monty,
On Apr 18, Michael Widenius wrote:
commit eba212af92d Author: Michael Widenius
Date: Thu Mar 14 17:59:00 2024 +0100 diff --git a/sql/json_table.cc b/sql/json_table.cc index 6713e796eb7..da5ab0c450a 100644 --- a/sql/json_table.cc +++ b/sql/json_table.cc @@ -734,7 +734,8 @@ bool Create_json_table::finalize(THD *thd, TABLE *table,
table->db_stat= HA_OPEN_KEYFILE; if (unlikely(table->file->ha_open(table, table->s->path.str, O_RDWR, - HA_OPEN_TMP_TABLE | HA_OPEN_INTERNAL_TABLE))) + HA_OPEN_TMP_TABLE | HA_OPEN_INTERNAL_TABLE | + HA_OPEN_SIZE_TRACKING)))
shouldn't HA_OPEN_INTERNAL_TABLE always imply HA_OPEN_SIZE_TRACKING automatically?
I had that first, but there where cases when this was not the case. Things changed a bit over time, but I do not have OPEN_SIZE_TRACKING for create_new_data_handle().
DBUG_RETURN(true);
table->set_created(); diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c index d8a8b0a05d7..ff0c6c7a617 100644 --- a/storage/maria/ma_statrec.c +++ b/storage/maria/ma_statrec.c @@ -45,6 +45,13 @@ my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record) my_errno=HA_ERR_RECORD_FILE_FULL; return(2); } + if (info->s->tracked &&
are there temporary files that you don't want to track?
Yes, those that are part of ALTER TABLE (temporary table, sort files and files that if they would be truncated would cause one to loose data. There are also files created with CREATE TEMPORARY. The plan is to add another temporary directory for files/tables like this.
are there non-temporary files that you want to track?
Not for now. In the future we may want to add quotas to catalogs, but that is another unrelated project. <cut>
diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c index 5881456a69a..0877c966d1c 100644 --- a/storage/maria/ma_page.c +++ b/storage/maria/ma_page.c @@ -417,6 +417,13 @@ my_off_t _ma_new(register MARIA_HA *info, int level, DBUG_RETURN(HA_OFFSET_ERROR); } share->state.state.key_file_length+= block_size; + if (info->s->tracked && + _ma_update_tmp_file_size(&share->track_index, + share->state.state.key_file_length))
shouldn't you do it before share->state.state.key_file_length is increased? Currently it seems that you're failing the operation, but the file will stay increased
If the operation fails, the file will be closed and removed. (This is ok as this are just for internal temporary files).
same pattern everywhere else
Which is correct and the intention.
diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c index f355d0da3e8..ed2087d3091 100644 --- a/storage/maria/ma_delete_all.c +++ b/storage/maria/ma_delete_all.c @@ -132,9 +132,16 @@ int maria_delete_all_rows(MARIA_HA *info) if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX, FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) || mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) || - mysql_file_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME))) + mysql_file_chsize(share->kfile.file, share->base.keystart, 0, + MYF(MY_WME)))
should be > 0 here, shouldn't it? Both for kfile and dfile.
Yes. Fixed.
goto err;
+ if (info->s->tracked) + { + _ma_update_tmp_file_size(&info->s->track_data, 0); + _ma_update_tmp_file_size(&info->s->track_index, share->base.keystart);
but this shouldn't update the size if mysql_file_chsize returned -1
Yes. Adding the test for >0 fixes this.
diff --git a/sql/uniques.cc b/sql/uniques.cc index 36725e80a6b..44d93bb0f88 100644 --- a/sql/uniques.cc +++ b/sql/uniques.cc @@ -720,7 +720,7 @@ bool Unique::merge(TABLE *table, uchar *buff, size_t buff_size, /* Open cached file for table records if it isn't open */ if (! my_b_inited(outfile) && open_cached_file(outfile, mysql_tmpdir, TEMP_PREFIX, DISK_CHUNK_SIZE, - MYF(MY_WME))) + MYF(MY_WME | MY_TRACK | MY_TRACK_WITH_LIMIT)))
as far as I can see, open_cached_file is only used for temporary files. it should apply MY_TRACK | MY_TRACK_WITH_LIMIT internally, it'll be less error-prone
open_cached_files adds MY_TRACK but not MY_TRACK_WITH_LIMIT as not all files can be limited (yet). I have now removed MY_TRACK for all calls to open_cached_file()
diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c index 9a6859bfd4d..7404db84067 100644 --- a/storage/maria/ma_write.c +++ b/storage/maria/ma_write.c @@ -386,6 +387,11 @@ int maria_write(MARIA_HA *info, const uchar *record) } } } + else if (my_errno == HA_ERR_LOCAL_TMP_SPACE_FULL || + my_errno == HA_ERR_GLOBAL_TMP_SPACE_FULL) + { + filepos= HA_OFFSET_ERROR; /* Avoid write_record_abort() */
why avoid? HA_ERR_LOCAL_TMP_SPACE_FULL is like EQUOT or ENOSPC.
This is to avoid setting fatal_error which causes write_record_abort() to be called which we don not want in this case as this would give 'table is crashed' instead of 'tmp space limit is reached"
why should be masked as HA_OFFSET_ERROR, and not be a fatal error?
See above. We could add 'equot' and 'enospc' to the above list later.
diff --git a/sql/sql_window.cc b/sql/sql_window.cc index e0c7cef1a9e..4959c7833a1 100644 --- a/sql/sql_window.cc +++ b/sql/sql_window.cc @@ -2922,15 +2922,16 @@ bool compute_window_func(THD *thd, if (unlikely(thd->is_error() || thd->is_killed())) break;
- /* Return to current row after notifying cursors for each window function. */ - tbl->file->ha_rnd_pos(tbl->record[0], rowid_buf); + if (tbl->file->ha_rnd_pos(tbl->record[0], rowid_buf)) + return true;
how can it fail?
A rnd_pos() can abort if there is a problem with flushing data to disk as part of trying to finding a new page in the page cache. We have tests for failing rnd_pos() all over the source. I probably added the above while fixing other things in the same file as the old code had almost no error checking in this file. (and this caused crashes for Elena).
diff --git a/mysql-test/main/status.test b/mysql-test/main/status.test index acf29f35b9e..b6be046a9ac 100644 --- a/mysql-test/main/status.test +++ b/mysql-test/main/status.test @@ -244,9 +244,9 @@ let $rnd_next = `show global status like 'handler_read_rnd_next'`; let $tmp_table = `show global status like 'Created_tmp_tables'`; show status like 'com_show_status'; show status like 'hand%write%'; -show status like '%tmp%'; +show status like '%_tmp%';
Doesn't look like a very meaningful change. Did you mean '%\_tmp%' ?
Yes. Good catch! However it did not change anything in the result.
--- a/mysys/mf_cache.c
+++ b/mysys/mf_cache.c @@ -43,7 +43,7 @@ my_bool open_cached_file(IO_CACHE *cache, const char* dir, const char *prefix, cache->file_name=0; cache->buffer=0; /* Mark that not open */ if (!init_io_cache(cache, -1, cache_size, WRITE_CACHE, 0L, 0, - MYF(cache_myflags | MY_NABP))) + MYF(cache_myflags | MY_NABP | MY_TRACK)))
Why not MY_TRACK_WITH_LIMIT ?
Not all files are yet safe to track with limit. For example online_alter_cache_data, wsrep_to_buf_helper(), Aria write_keys() (We should add tracking at this soon, I have not yet had time for this one).
diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 6f2797b6b39..324f7679e8f 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -22275,7 +22275,8 @@ bool open_tmp_table(TABLE *table) int error; if (unlikely((error= table->file->ha_open(table, table->s->path.str, O_RDWR, HA_OPEN_TMP_TABLE | - HA_OPEN_INTERNAL_TABLE)))) + HA_OPEN_INTERNAL_TABLE | + HA_OPEN_SIZE_TRACKING))))
shouldn't HA_OPEN_TMP_TABLE or HA_OPEN_INTERNAL_TABLE imply HA_OPEN_SIZE_TRACKING?
Good question. As far as I remember, there was some case where I needed to separate the options. I checked the code and it looks like now I can remove HA_OPEN_SIZE_TRACKING. I will do that.
--- a/sql/handler.cc +++ b/sql/handler.cc @@ -443,9 +444,12 @@ int ha_init_errors(void) /* Zerofill it to avoid uninitialized gaps. */
remove the comment, if you no longer zerofill
Removed
diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index 705562eb795..b256466f1f0 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -778,6 +785,14 @@ typedef struct st_maria_share myf write_flag; enum data_file_type data_file_type; enum pagecache_page_type page_type; /* value depending transactional */ + + /* + tracked will cause lost bytes (not aligned) but this is ok as it is always + used with tmp_file_tracking if set
I don't understand your explanation why "this is ok"
This means that when we check tracked, the next two structs will also be loaded from memory to the cpu. The loss of 7 bytes should be offset by that.
diff --git a/include/my_sys.h b/include/my_sys.h index af509c7df45..0df12963ab9 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -178,6 +180,17 @@ uchar *my_large_malloc(size_t *size, myf my_flags); void my_large_free(void *ptr, size_t size); void my_large_page_truncate(size_t *size);
+/* Tracking tmp file usage */ + +struct tmp_file_tracking +{ + ulonglong last_position;
I had to read how "last_position" is used to realize that it's in fact previous file_size value. Why not to call it previous_file_size? Or recorded_file_size ? Or old_file_size? it's not a "position"
I have changed it to previous_file_size
diff --git a/sql/sql_class.h b/sql/sql_class.h index c1efb49f6da..42eee9913ec 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1070,12 +1074,22 @@ typedef struct system_status_var */
#define last_system_status_var questions -#define last_cleared_system_status_var local_memory_used + +/* Parameters to set_status_var_init() */ + +#define STATUS_OFFSET(A) offsetof(STATUS_VAR,A) +/* Clear as part of flush */ +#define clear_up_to_tmp_space_used STATUS_OFFSET(tmp_space_used) +/* Clear as part of startup */ +#define clear_up_to_memory_used STATUS_OFFSET(local_memory_used) +/* Full initialization. Note that global_memory_used is updated early! */ +#define clear_up_to_global_memory_used STATUS_OFFSET(global_memory_used) +#define last_restored_status_var clear_up_to_tmp_space_used
those are bad names. and the fact that you need to rename these defines and change all code that uses them - this kind of shows they're not good.
The old code did not take into account that we different versions of clearing of variables. When adding new states that needs to be cleared at different times, then there is a need to move things around.
when there will be a new variable, clear_up_to_tmp_space_used can become clear_up_to_new_variable and everything will need to be changed again.
Not really, as any new variable that should be cleared at same time as tmp_space_used can be added after tmp_space_used. This is only needed in refresh_session_status() for variables that can be moved to next session. Anyway, I agree that we can do things better.
better use names that don't depend on individual variables. last_restored_status_var and last_cleared_system_status_var are good.
Try to split variables logically. First there are additive variables - global value is the sum of all local values. They're cleared, they're added up tp global, etc.
Then there are non-additive, not cleared, not added - like local_memory_used and tmp_space_used, for example. And global_memory_used is special because it's initialized early, right?
so, may be defines based on these properties would be easier to understand. because now I'm totally at loss when to use clear_up_to_tmp_space_used, when to use clear_up_to_memory_used, when clear_up_to_global_memory_used and when last_restored_status_var.
How about this clear_up_to_tmp_space_used -> clear_for_flush clear_up_to_memory_used -> clear_for_new_connection clear_up_to_global_memory_used -> clear_for_server_start
diff --git a/sql/filesort.cc b/sql/filesort.cc index abcd4127a0b..df4508b635a 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -1082,6 +1078,11 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, DBUG_RETURN(num_records);
err: + if (!quick_select) + { + (void) file->extra(HA_EXTRA_NO_CACHE); + file->ha_rnd_end(); + }
why?
Was an old bug that Elna found. filesort code() calls ha_rnd_init() and must thus call ha_rnd_end() as otherwise there will be an assert later. Same goes for HA_EXTRA_NO_CACHE. cache was initialized in the beginning of filesort but not cleared in case of errors.
diff --git a/sql/mysqld.cc b/sql/mysqld.cc index dc4a955aa79..4727b7b5413 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1947,6 +1950,10 @@ static void mysqld_exit(int exit_code) if (exit_code == 0 || opt_endinfo) SAFEMALLOC_REPORT_MEMORY(0); } + if (global_tmp_space_used) + fprintf(stderr, "Warning: tmp_space not freed: %llu\n", + (ulonglong) global_tmp_space_used);
This is confusing. almost all (if not all) temp space is automatically freed by the OS when the process exits, so this warning will only scare the user about lost space when in fact no space is lost, it's only some space was not accounted for inside MariaDB.
This shows there is a bug in MariaDB, which is actually dangerous as if the accounting of tmp space goes wrong there will be crashes as at some point one will not be able to create any temporary files.
I suggest either make this warning debug-only or rephrase it like "Note: %llu tmp_space was lost and will be freed automatically when the process exits"
I changed it to: Warning: Internal tmp_space accounting error of %lld bytes\n"
+ DBUG_LEAVE; #ifdef _WIN32 my_report_svc_status(SERVICE_STOPPED, exit_code, 0); @@ -7571,6 +7636,7 @@ SHOW_VAR status_vars[]= { {"Tc_log_page_size", (char*) &tc_log_page_size, SHOW_LONG_NOFLUSH}, {"Tc_log_page_waits", (char*) &tc_log_page_waits, SHOW_LONG}, #endif + {"tmp_space_used", (char*) offsetof(STATUS_VAR, tmp_space_used), SHOW_LONGLONG_STATUS},
first letter - upper case
fixed.
diff --git a/sql/log.cc b/sql/log.cc index 460cefea47b..a9bc3758dd5 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -2388,6 +2384,8 @@ bool Event_log::check_write_error(THD *thd) case ER_TRANS_CACHE_FULL: case ER_STMT_CACHE_FULL: case ER_ERROR_ON_WRITE: + case EE_LOCAL_TMP_SPACE_FULL: + case EE_GLOBAL_TMP_SPACE_FULL:
not a good idea to mix EE_ OS-like errors and ER_ SQL layer errors
I don't know how to otherwise do the above. From all practical purposes The new errors are as relevant as the other ones. We have to detect and handle these differently from other errors to get good error messages for common things.
case ER_BINLOG_LOGGING_IMPOSSIBLE: checked= TRUE; break; @@ -6650,7 +6651,8 @@ Event_log::flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event, { set_write_error(thd, is_transactional); if (check_cache_error(thd, cache_data) && - stmt_has_updated_non_trans_table(thd)) + (stmt_has_updated_non_trans_table(thd) || + !is_transactional))
why
If is_transactional is no set, then the data will be different on the master and slave and we have to write an incident event. There should be a test case for this, I think. <cut>
+ /* + Allow flush of transaction logs to temporary go over the tmp space limit + as we do not want the commit to fail + */ + cache->myflags&= ~MY_TRACK_WITH_LIMIT; + res= reinit_io_cache(cache, READ_CACHE, 0, 0, 0); + cache->myflags|= MY_TRACK_WITH_LIMIT;
you also remove MY_TRACK_WITH_LIMIT inside reinit_io_cache(), so this is redundant. Or it's redundant inside reinit_io_cache(), either way, no point in doing it twice
This is only for this one case. In all other cases, we do not have to MY_TRACK_WITH_LIMIT. Without this, we can get errors from reinit_io_cache() here, which would be hard to handle in this place. (In other places it is ok to get the error).
--- a/mysys/mf_iocache.c +++ b/mysys/mf_iocache.c
<cut>
+my_bool io_cache_tmp_file_track(IO_CACHE *info, ulonglong file_size) +{ + return tmp_file_track(info, file_size); +} + + +void end_tracking_io_cache(IO_CACHE *info)
what does "end tracking IO_CACHE" mean? I read it as "end tracking of this IO_CACHE, it is no longer tracked after this function call" - but it's not what it is doing. so what how do you read this function name?
End all future tracking of the io cache. This also implies that all data tracked is freed. I have added a comment to clarify the purpose: /** End tmp space tracking for the data in the io cache This is called when deleting or truncating the cached file. */
+{ + if ((info->myflags & (MY_TRACK | MY_TRACK_WITH_LIMIT)) && + info->tracking.file_size) + { + info->tracking.file_size= 0; + update_tmp_file_size(&info->tracking, 1); + } +} + +void truncate_io_cache(IO_CACHE *info) +{ + if (my_chsize(info->file, 0, 0, MYF(MY_WME)) <= 0) + end_tracking_io_cache(info);
not quite. If my_chsize() returns -1, you should not do info->tracking.file_size= 0, because file size wasn't actually changed.
Fixed. On systems without truncate, one shouldn't use tmp space tracking... Regards, Monty
participants (2)
-
Michael Widenius
-
Sergei Golubchik