Hi, Monty,
On Apr 18, Michael Widenius wrote:
> commit eba212af92d
> Author: Michael Widenius <monty(a)mariadb.org>
> Date: Thu Mar 14 17:59:00 2024 +0100
>
> diff --git a/sql/json_table.cc b/sql/json_table.cc
> index 6713e796eb7..da5ab0c450a 100644
> --- a/sql/json_table.cc
> +++ b/sql/json_table.cc
> @@ -734,7 +734,8 @@ bool Create_json_table::finalize(THD *thd, TABLE *table,
>
> table->db_stat= HA_OPEN_KEYFILE;
> if (unlikely(table->file->ha_open(table, table->s->path.str, O_RDWR,
> - HA_OPEN_TMP_TABLE | HA_OPEN_INTERNAL_TABLE)))
> + HA_OPEN_TMP_TABLE | HA_OPEN_INTERNAL_TABLE |
> + HA_OPEN_SIZE_TRACKING)))
shouldn't HA_OPEN_INTERNAL_TABLE always imply HA_OPEN_SIZE_TRACKING
automatically?
> DBUG_RETURN(true);
>
> table->set_created();
> diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c
> index d8a8b0a05d7..ff0c6c7a617 100644
> --- a/storage/maria/ma_statrec.c
> +++ b/storage/maria/ma_statrec.c
> @@ -45,6 +45,13 @@ my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record)
> my_errno=HA_ERR_RECORD_FILE_FULL;
> return(2);
> }
> + if (info->s->tracked &&
are there temporary files that you don't want to track?
are there non-temporary files that you want to track?
> + _ma_update_tmp_file_size(&info->s->track_data,
> + MY_ALIGN(info->s->state.state.data_file_length+
> + info->s->base.pack_reclength,
> + MARIA_TRACK_INCREMENT_SIZE)))
> + return(2);
> +
> if (info->opt_flag & WRITE_CACHE_USED)
> { /* Cash in use */
> if (my_b_write(&info->rec_cache, record,
> diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c
> index 5881456a69a..0877c966d1c 100644
> --- a/storage/maria/ma_page.c
> +++ b/storage/maria/ma_page.c
> @@ -417,6 +417,13 @@ my_off_t _ma_new(register MARIA_HA *info, int level,
> DBUG_RETURN(HA_OFFSET_ERROR);
> }
> share->state.state.key_file_length+= block_size;
> + if (info->s->tracked &&
> + _ma_update_tmp_file_size(&share->track_index,
> + share->state.state.key_file_length))
shouldn't you do it before share->state.state.key_file_length
is increased? Currently it seems that you're failing
the operation, but the file will stay increased
same pattern everywhere else
> + {
> + mysql_mutex_unlock(&share->intern_lock);
> + DBUG_RETURN(HA_OFFSET_ERROR);
> + }
> /* Following is for not transactional tables */
> info->state->key_file_length= share->state.state.key_file_length;
> mysql_mutex_unlock(&share->intern_lock);
> diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c
> index f355d0da3e8..ed2087d3091 100644
> --- a/storage/maria/ma_delete_all.c
> +++ b/storage/maria/ma_delete_all.c
> @@ -132,9 +132,16 @@ int maria_delete_all_rows(MARIA_HA *info)
> if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX,
> FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) ||
> mysql_file_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) ||
> - mysql_file_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME)))
> + mysql_file_chsize(share->kfile.file, share->base.keystart, 0,
> + MYF(MY_WME)))
should be > 0 here, shouldn't it? Both for kfile and dfile.
> goto err;
>
> + if (info->s->tracked)
> + {
> + _ma_update_tmp_file_size(&info->s->track_data, 0);
> + _ma_update_tmp_file_size(&info->s->track_index, share->base.keystart);
but this shouldn't update the size if mysql_file_chsize returned -1
> + }
> +
> if (_ma_initialize_data_file(share, info->dfile.file))
> goto err;
>
> diff --git a/sql/uniques.cc b/sql/uniques.cc
> index 36725e80a6b..44d93bb0f88 100644
> --- a/sql/uniques.cc
> +++ b/sql/uniques.cc
> @@ -720,7 +720,7 @@ bool Unique::merge(TABLE *table, uchar *buff, size_t buff_size,
> /* Open cached file for table records if it isn't open */
> if (! my_b_inited(outfile) &&
> open_cached_file(outfile, mysql_tmpdir, TEMP_PREFIX, DISK_CHUNK_SIZE,
> - MYF(MY_WME)))
> + MYF(MY_WME | MY_TRACK | MY_TRACK_WITH_LIMIT)))
as far as I can see, open_cached_file is only used for temporary files.
it should apply MY_TRACK | MY_TRACK_WITH_LIMIT internally, it'll
be less error-prone
> return 1;
>
> bzero((char*) &sort_param,sizeof(sort_param));
> diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c
> index 9a6859bfd4d..7404db84067 100644
> --- a/storage/maria/ma_write.c
> +++ b/storage/maria/ma_write.c
> @@ -386,6 +387,11 @@ int maria_write(MARIA_HA *info, const uchar *record)
> }
> }
> }
> + else if (my_errno == HA_ERR_LOCAL_TMP_SPACE_FULL ||
> + my_errno == HA_ERR_GLOBAL_TMP_SPACE_FULL)
> + {
> + filepos= HA_OFFSET_ERROR; /* Avoid write_record_abort() */
why avoid? HA_ERR_LOCAL_TMP_SPACE_FULL is like EQUOT or ENOSPC.
why should be masked as HA_OFFSET_ERROR, and not be a fatal error?
> + }
> else
> fatal_error= 1;
>
> diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
> index 7441e29a97b..47935d61b88 100644
> --- a/storage/maria/ma_close.c
> +++ b/storage/maria/ma_close.c
> @@ -14,7 +14,7 @@
> along with this program; if not, write to the Free Software
> Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
>
> -/* close a isam-database */
> +/* close an Aria table */
at last :) it's what 27 years old?
> /*
> TODO:
> We need to have a separate mutex on the closed file to allow other threads
> diff --git a/sql/sql_window.cc b/sql/sql_window.cc
> index e0c7cef1a9e..4959c7833a1 100644
> --- a/sql/sql_window.cc
> +++ b/sql/sql_window.cc
> @@ -2922,15 +2922,16 @@ bool compute_window_func(THD *thd,
> if (unlikely(thd->is_error() || thd->is_killed()))
> break;
>
> -
> /* Return to current row after notifying cursors for each window
> function. */
> - tbl->file->ha_rnd_pos(tbl->record[0], rowid_buf);
> + if (tbl->file->ha_rnd_pos(tbl->record[0], rowid_buf))
> + return true;
how can it fail?
> }
>
> /* We now have computed values for each window function. They can now
> be saved in the current row. */
> - save_window_function_values(window_functions, tbl, rowid_buf);
> + if (save_window_function_values(window_functions, tbl, rowid_buf))
> + return true;
>
> rownum++;
> }
> diff --git a/mysql-test/main/status.test b/mysql-test/main/status.test
> index acf29f35b9e..b6be046a9ac 100644
> --- a/mysql-test/main/status.test
> +++ b/mysql-test/main/status.test
> @@ -244,9 +244,9 @@ let $rnd_next = `show global status like 'handler_read_rnd_next'`;
> let $tmp_table = `show global status like 'Created_tmp_tables'`;
> show status like 'com_show_status';
> show status like 'hand%write%';
> -show status like '%tmp%';
> +show status like '%_tmp%';
Doesn't look like a very meaningful change. Did you mean '%\_tmp%' ?
> show status like 'hand%write%';
> -show status like '%tmp%';
> +show status like '%_tmp%';
> show status like 'com_show_status';
> let $rnd_next2 = `show global status like 'handler_read_rnd_next'`;
> let $tmp_table2 = `show global status like 'Created_tmp_tables'`;
> diff --git a/mysys/mf_cache.c b/mysys/mf_cache.c
> index 2fec59f4e70..712187f6181 100644
> --- a/mysys/mf_cache.c
> +++ b/mysys/mf_cache.c
> @@ -43,7 +43,7 @@ my_bool open_cached_file(IO_CACHE *cache, const char* dir, const char *prefix,
> cache->file_name=0;
> cache->buffer=0; /* Mark that not open */
> if (!init_io_cache(cache, -1, cache_size, WRITE_CACHE, 0L, 0,
> - MYF(cache_myflags | MY_NABP)))
> + MYF(cache_myflags | MY_NABP | MY_TRACK)))
Why not MY_TRACK_WITH_LIMIT ?
> {
> DBUG_RETURN(0);
> }
> diff --git a/sql/sql_select.cc b/sql/sql_select.cc
> index 6f2797b6b39..324f7679e8f 100644
> --- a/sql/sql_select.cc
> +++ b/sql/sql_select.cc
> @@ -22275,7 +22275,8 @@ bool open_tmp_table(TABLE *table)
> int error;
> if (unlikely((error= table->file->ha_open(table, table->s->path.str, O_RDWR,
> HA_OPEN_TMP_TABLE |
> - HA_OPEN_INTERNAL_TABLE))))
> + HA_OPEN_INTERNAL_TABLE |
> + HA_OPEN_SIZE_TRACKING))))
shouldn't HA_OPEN_TMP_TABLE or HA_OPEN_INTERNAL_TABLE imply
HA_OPEN_SIZE_TRACKING?
> {
> table->file->print_error(error, MYF(0)); /* purecov: inspected */
> table->db_stat= 0;
> diff --git a/sql/handler.cc b/sql/handler.cc
> index fec473d97bb..a05b7476724 100644
> --- a/sql/handler.cc
> +++ b/sql/handler.cc
> @@ -443,9 +444,12 @@ int ha_init_errors(void)
> /* Zerofill it to avoid uninitialized gaps. */
remove the comment, if you no longer zerofill
> if (! (handler_errmsgs= (const char**) my_malloc(key_memory_handler_errmsgs,
> HA_ERR_ERRORS * sizeof(char*),
> - MYF(MY_WME | MY_ZEROFILL))))
> + MYF(MY_WME))))
> return 1;
>
> + /* Copy default handler error messages */
> + memcpy(handler_errmsgs, handler_error_messages, HA_ERR_ERRORS * sizeof(char*));
> +
> /* Set the dedicated error messages. */
> SETMSG(HA_ERR_KEY_NOT_FOUND, ER_DEFAULT(ER_KEY_NOT_FOUND));
> SETMSG(HA_ERR_FOUND_DUPP_KEY, ER_DEFAULT(ER_DUP_KEY));
> diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h
> index 705562eb795..b256466f1f0 100644
> --- a/storage/maria/maria_def.h
> +++ b/storage/maria/maria_def.h
> @@ -778,6 +785,14 @@ typedef struct st_maria_share
> myf write_flag;
> enum data_file_type data_file_type;
> enum pagecache_page_type page_type; /* value depending transactional */
> +
> + /*
> + tracked will cause lost bytes (not aligned) but this is ok as it is always
> + used with tmp_file_tracking if set
I don't understand your explanation why "this is ok"
> + */
> + my_bool tracked; /* Tracked table (always internal) */
> + struct tmp_file_tracking track_data,track_index;
> +
> /**
> if Checkpoint looking at table; protected by close_lock or THR_LOCK_maria
> */
> diff --git a/include/my_sys.h b/include/my_sys.h
> index af509c7df45..0df12963ab9 100644
> --- a/include/my_sys.h
> +++ b/include/my_sys.h
> @@ -178,6 +180,17 @@ uchar *my_large_malloc(size_t *size, myf my_flags);
> void my_large_free(void *ptr, size_t size);
> void my_large_page_truncate(size_t *size);
>
> +/* Tracking tmp file usage */
> +
> +struct tmp_file_tracking
> +{
> + ulonglong last_position;
I had to read how "last_position" is used to realize that it's
in fact previous file_size value. Why not to call it previous_file_size?
Or recorded_file_size ? Or old_file_size? it's not a "position"
> + ulonglong file_size;
> +};
> +
> +typedef int (*TMPFILE_SIZE_CB)(struct tmp_file_tracking *track, int no_error);
> +extern TMPFILE_SIZE_CB update_tmp_file_size;
> +
> #ifdef _WIN32
> extern BOOL my_obtain_privilege(LPCSTR lpPrivilege);
> #endif
> diff --git a/sql/sql_class.h b/sql/sql_class.h
> index c1efb49f6da..42eee9913ec 100644
> --- a/sql/sql_class.h
> +++ b/sql/sql_class.h
> @@ -1070,12 +1074,22 @@ typedef struct system_status_var
> */
>
> #define last_system_status_var questions
> -#define last_cleared_system_status_var local_memory_used
> +
> +/* Parameters to set_status_var_init() */
> +
> +#define STATUS_OFFSET(A) offsetof(STATUS_VAR,A)
> +/* Clear as part of flush */
> +#define clear_up_to_tmp_space_used STATUS_OFFSET(tmp_space_used)
> +/* Clear as part of startup */
> +#define clear_up_to_memory_used STATUS_OFFSET(local_memory_used)
> +/* Full initialization. Note that global_memory_used is updated early! */
> +#define clear_up_to_global_memory_used STATUS_OFFSET(global_memory_used)
> +#define last_restored_status_var clear_up_to_tmp_space_used
those are bad names. and the fact that you need to rename these defines
and change all code that uses them - this kind of shows they're not good.
when there will be a new variable, clear_up_to_tmp_space_used can
become clear_up_to_new_variable and everything will need to
be changed again.
better use names that don't depend on individual variables.
last_restored_status_var and last_cleared_system_status_var are good.
Try to split variables logically. First there are additive
variables - global value is the sum of all local values. They're cleared,
they're added up tp global, etc.
Then there are non-additive, not cleared, not added - like
local_memory_used and tmp_space_used, for example. And global_memory_used
is special because it's initialized early, right?
so, may be defines based on these properties would be easier to understand.
because now I'm totally at loss when to use clear_up_to_tmp_space_used,
when to use clear_up_to_memory_used, when clear_up_to_global_memory_used
and when last_restored_status_var.
> +
>
> /** Number of contiguous global status variables */
> -constexpr int COUNT_GLOBAL_STATUS_VARS= int(offsetof(STATUS_VAR,
> - last_system_status_var) /
> - sizeof(ulong)) + 1;
> +constexpr int COUNT_GLOBAL_STATUS_VARS=
> + int(STATUS_OFFSET(last_system_status_var) /sizeof(ulong)) + 1;
>
> /*
> Global status variables
> diff --git a/sql/filesort.cc b/sql/filesort.cc
> index abcd4127a0b..df4508b635a 100644
> --- a/sql/filesort.cc
> +++ b/sql/filesort.cc
> @@ -1082,6 +1078,11 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select,
> DBUG_RETURN(num_records);
>
> err:
> + if (!quick_select)
> + {
> + (void) file->extra(HA_EXTRA_NO_CACHE);
> + file->ha_rnd_end();
> + }
why?
> sort_form->column_bitmaps_set(save_read_set, save_write_set);
> DBUG_RETURN(HA_POS_ERROR);
> } /* find_all_keys */
> diff --git a/sql/mysqld.cc b/sql/mysqld.cc
> index dc4a955aa79..4727b7b5413 100644
> --- a/sql/mysqld.cc
> +++ b/sql/mysqld.cc
> @@ -1947,6 +1950,10 @@ static void mysqld_exit(int exit_code)
> if (exit_code == 0 || opt_endinfo)
> SAFEMALLOC_REPORT_MEMORY(0);
> }
> + if (global_tmp_space_used)
> + fprintf(stderr, "Warning: tmp_space not freed: %llu\n",
> + (ulonglong) global_tmp_space_used);
This is confusing. almost all (if not all) temp space is automatically
freed by the OS when the process exits, so this warning will only
scare the user about lost space when in fact no space is lost,
it's only some space was not accounted for inside MariaDB.
I suggest either make this warning debug-only or rephrase it like
"Note: %llu tmp_space was lost and will be freed automatically when the
process exits"
> +
> DBUG_LEAVE;
> #ifdef _WIN32
> my_report_svc_status(SERVICE_STOPPED, exit_code, 0);
> @@ -7571,6 +7636,7 @@ SHOW_VAR status_vars[]= {
> {"Tc_log_page_size", (char*) &tc_log_page_size, SHOW_LONG_NOFLUSH},
> {"Tc_log_page_waits", (char*) &tc_log_page_waits, SHOW_LONG},
> #endif
> + {"tmp_space_used", (char*) offsetof(STATUS_VAR, tmp_space_used), SHOW_LONGLONG_STATUS},
first letter - upper case
> #ifdef HAVE_POOL_OF_THREADS
> {"Threadpool_idle_threads", (char *) &show_threadpool_idle_threads, SHOW_SIMPLE_FUNC},
> {"Threadpool_threads", (char *) &show_threadpool_threads, SHOW_SIMPLE_FUNC},
> diff --git a/sql/log.cc b/sql/log.cc
> index 460cefea47b..a9bc3758dd5 100644
> --- a/sql/log.cc
> +++ b/sql/log.cc
> @@ -2388,6 +2384,8 @@ bool Event_log::check_write_error(THD *thd)
> case ER_TRANS_CACHE_FULL:
> case ER_STMT_CACHE_FULL:
> case ER_ERROR_ON_WRITE:
> + case EE_LOCAL_TMP_SPACE_FULL:
> + case EE_GLOBAL_TMP_SPACE_FULL:
not a good idea to mix EE_ OS-like errors and ER_ SQL layer errors
> case ER_BINLOG_LOGGING_IMPOSSIBLE:
> checked= TRUE;
> break;
> @@ -6650,7 +6651,8 @@ Event_log::flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event,
> {
> set_write_error(thd, is_transactional);
> if (check_cache_error(thd, cache_data) &&
> - stmt_has_updated_non_trans_table(thd))
> + (stmt_has_updated_non_trans_table(thd) ||
> + !is_transactional))
why?
> cache_data->set_incident();
> delete pending;
> cache_data->set_pending(NULL);
> @@ -7868,8 +7877,16 @@ int Event_log::write_cache(THD *thd, binlog_cache_data *cache_data)
> DBUG_RETURN(res ? ER_ERROR_ON_WRITE : 0);
> }
>
> - if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
> + /*
> + Allow flush of transaction logs to temporary go over the tmp space limit
> + as we do not want the commit to fail
> + */
> + cache->myflags&= ~MY_TRACK_WITH_LIMIT;
> + res= reinit_io_cache(cache, READ_CACHE, 0, 0, 0);
> + cache->myflags|= MY_TRACK_WITH_LIMIT;
you also remove MY_TRACK_WITH_LIMIT inside reinit_io_cache(),
so this is redundant. Or it's redundant inside reinit_io_cache(),
either way, no point in doing it twice
> + if (res)
> DBUG_RETURN(ER_ERROR_ON_WRITE);
> +
> /* Amount of remaining bytes in the IO_CACHE read buffer. */
> size_t log_file_pos;
> uchar header_buf[LOG_EVENT_HEADER_LEN];
> diff --git a/mysys/mf_iocache.c b/mysys/mf_iocache.c
> index 4ee1331bdb3..6214057f9e3 100644
> --- a/mysys/mf_iocache.c
> +++ b/mysys/mf_iocache.c
> @@ -73,6 +74,50 @@ int (*_my_b_encr_read)(IO_CACHE *info,uchar *Buffer,size_t Count)= 0;
> int (*_my_b_encr_write)(IO_CACHE *info,const uchar *Buffer,size_t Count)= 0;
>
>
> +static inline my_bool tmp_file_track(IO_CACHE *info, ulonglong file_size)
> +{
> + if ((info->myflags & (MY_TRACK | MY_TRACK_WITH_LIMIT)) &&
> + update_tmp_file_size)
> + {
> + if (info->tracking.file_size < file_size)
> + {
> + int error;
> + info->tracking.file_size= file_size;
> + if ((error= update_tmp_file_size(&info->tracking,
> + !(info->myflags &
> + MY_TRACK_WITH_LIMIT))))
> + {
> + if (info->myflags & MY_WME)
> + my_error(my_errno, MYF(0));
> + info->error= -1;
> + return 1;
> + }
> + }
> + }
> + return 0;
> +}
> +
> +my_bool io_cache_tmp_file_track(IO_CACHE *info, ulonglong file_size)
> +{
> + return tmp_file_track(info, file_size);
> +}
> +
> +
> +void end_tracking_io_cache(IO_CACHE *info)
what does "end tracking IO_CACHE" mean?
I read it as "end tracking of this IO_CACHE, it is no longer
tracked after this function call" - but it's not what it is doing.
so what how do you read this function name?
> +{
> + if ((info->myflags & (MY_TRACK | MY_TRACK_WITH_LIMIT)) &&
> + info->tracking.file_size)
> + {
> + info->tracking.file_size= 0;
> + update_tmp_file_size(&info->tracking, 1);
> + }
> +}
> +
> +void truncate_io_cache(IO_CACHE *info)
> +{
> + if (my_chsize(info->file, 0, 0, MYF(MY_WME)) <= 0)
> + end_tracking_io_cache(info);
not quite. If my_chsize() returns -1, you should not do
info->tracking.file_size= 0, because file size wasn't actually changed.
> +}
>
> static void
> init_functions(IO_CACHE* info)
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org