developers
Threads by month
- ----- 2025 -----
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- 4 participants
- 6816 discussions
Hi Aleksey,
Thanks for the patch. Please see below for some initial review comments.
There are some todos for myself noted below which I will look into
tomorrow.
> From b4465fc6b51dfc4e645569201e6026ec6405367b Mon Sep 17 00:00:00 2001
> From: Aleksey Midenkov <midenok(a)gmail.com>
> Date: Mon, 30 Sep 2024 22:26:51 +0300
> Subject: [PATCH] MDEV-28413 System versioning is incorrect in Spider
> Basic functioning of system versioning in Spider: UPDATE, DELETE,
> DELETE HISTORY, AS OF, FROM .. TO, BETWEEN .. AND.
> More testing should be done for:
> LOAD DATA, joins, derived tables, multi-update/delete and what else?
> Note that we are still using HA_CAN_DIRECT_UPDATE_AND_DELETE, but not
> for periods in DELETE (fwiw there is condition
> !table_list->has_period(), but it is missing in UDPATE).
> ---
> .../mysql-test/spider/r/versioning.result | 142 ++++++++++++++++
> .../mysql-test/spider/t/versioning.test | 151 ++++++++++++++++++
> storage/spider/spd_db_conn.cc | 19 ++-
> storage/spider/spd_db_include.h | 1 +
> storage/spider/spd_db_mysql.cc | 25 ++-
> 5 files changed, 335 insertions(+), 3 deletions(-)
> create mode 100644 storage/spider/mysql-test/spider/r/versioning.result
> create mode 100644 storage/spider/mysql-test/spider/t/versioning.test
> [... 149 lines elided]
> diff --git a/storage/spider/mysql-test/spider/t/versioning.test b/storage/spider/mysql-test/spider/t/versioning.test
> new file mode 100644
> index 00000000000..6093169bff4
> --- /dev/null
> +++ b/storage/spider/mysql-test/spider/t/versioning.test
> @@ -0,0 +1,151 @@
> +--source include/have_innodb.inc
> +--source suite/versioning/common.inc
> +
> +install plugin spider soname 'ha_spider';
To initialize the spider plugin in spider tests we commonly do
--disable_query_log
--disable_result_log
--source ../t/test_init.inc
--enable_result_log
--enable_query_log
> +set spider_same_server_link= on;
> +
> +--replace_result $MASTER_1_MYPORT MASTER_1_MYPORT
> +eval create server s foreign data wrapper mysql options
> +(host '127.0.0.1', database 'test', user 'root', port $MASTER_1_MYPORT);
Use evalp instead of eval and you can save the --replace_result line
> +
> +
> +create table t1 (
> + x int, y int
> +) with system versioning;
> +
> +create table t2 (
> + x int, y int,
> + row_start timestamp(6) as row start invisible,
> + row_end timestamp(6) as row end invisible,
> + period for system_time (row_start, row_end)
> +) with system versioning;
> +
> +create or replace table t1_sp (
> + x int, y int)
> +engine=spider comment='wrapper "mysql", srv "s", table "t1"'
> +with system versioning;
> +
> +create or replace table t2_sp (
> + x int, y int,
> + row_start timestamp(6) as row start invisible,
> + row_end timestamp(6) as row end invisible,
> + period for system_time (row_start, row_end))
> +engine=spider comment='wrapper "mysql", srv "s", table "t2"'
> +with system versioning;
> +
> +--echo # Timestamps are not propagated (subject of MDEV-16546, but not for implicit system fields)
> +set timestamp= unix_timestamp('2000-01-01 00:00:00');
I tried executing directly using
select spider_direct_sql('set timestamp= unix_timestamp("2000-01-01 00:00:00");', '', 'srv "s"');
But then I get extra rows in subsequent select queries and I think it is
caused by the timezones. It could be a separate task to fix this, but we
should warn users about it.
> +insert into t1_sp values (1, 1), (2, 2);
It may be obvious to you, but I find the timestamps at times hard to
keep track of, so I suggest adding a comment to each DML statement
indicating the time, as well as the select row_start / row_end
statements. For example, change the line above to
insert into t1_sp values (1, 1), (2, 2); # T1
> +insert into t2_sp values (1, 1), (2, 2);
and change this line to
insert into t2_sp values (1, 1), (2, 2); # T2
> +--sleep 0.01
> +
> +set timestamp= unix_timestamp('2000-01-01 00:11:11');
> +--echo # check dml generating history
> +delete from t1_sp where x = 1;
add # T3
> +--sleep 0.01
> +delete from t2_sp where x = 1;
# T4
> +--sleep 0.01
> +
> +select row_end into @t1 from t2 for system_time all where x = 1;
And so we can annotate this line with the timestamp above, like so:
select row_end into @t1 from t2 for system_time all where x = 1; # @t1 = T4
> +
> +update t1_sp set y= y + 1;
# T5
> +select row_start into @t2 from t1 where x = 2;
# @t2 = T5
> +--sleep 0.01
> +update t2_sp set y= y + 1;
# T6
With the annotations I suggested, results from the following block of
selects becomes easier to understand.
> +
> +select * from t1_sp;
> +select * from t2_sp;
> +select * from t1_sp for system_time as of timestamp @t1;
> +select * from t2_sp for system_time as of timestamp @t1;
> +select * from t1_sp for system_time as of timestamp @t2;
> +select * from t2_sp for system_time as of timestamp @t2;
> +select * from t1_sp for system_time from timestamp @t1 to timestamp @t2;
> +select * from t2_sp for system_time from timestamp @t1 to timestamp @t2;
> +select * from t1_sp for system_time between timestamp @t1 and timestamp @t2;
> +select * from t2_sp for system_time between timestamp @t1 and timestamp @t2;
> +
> +--echo # spider cannot call functions, it fails in spider_create_group_by_handler()
> + # at:
> + #
> + # 1555 if (spider_db_print_item_type(item, null, spider, null, null, 0,
> + # 1556 roop_count, true, fields_arg))
> + # 1557 {
> + # 1558 dbug_print("info",("spider dbton_id=%d can't create select", roop_count));
> + # 1559 spider_clear_bit(dbton_bitmap, roop_count);
> + # 1560 keep_going = false;
> + # 1561 break;
> + # 1562 }
> + #
This is fine. The Spider group-by-handler (GBH) is not meant to handle
every situation. Either remove this block starting from the --echo, or
make it more succinct by replacing it with something like (Note that GBH
supports certain functions like trim):
# The spider group-by-handler does not support the check_row function
> +--echo # but somehow it doesn't fail the query and continues without FOR SYSTEM_TIME ALL clause
> +select *, check_row(row_start, row_end) from t1_sp for system_time all order by row_end, y, x;
> +select *, check_row(row_start, row_end) from t2_sp for system_time all order by row_end, y, x;
I will check why the execution without the group-by-handler results in
wrong results.
> +--echo # SELECT from base table works as usual:
> +select *, check_row(row_start, row_end) from t1 for system_time all order by row_end, y, x;
> +select *, check_row(row_start, row_end) from t2 for system_time all order by row_end, y, x;
> +
> +--echo # here it works nice (append_table_list() has the chance to add FOR SYSTEM_TIME ALL)
> +
> +# Usage: SHOW_TIMESTAMPS=1 mtr versioning.spider
Shouldn't the test name be spider.versioning?
> +if ($SHOW_TIMESTAMPS)
> +{
> + select @t1, @t2;
> +}
> +if (!$SHOW_TIMESTAMPS)
> +{
> +--replace_regex /\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d/TIMESTAMP/
> +}
> +select *, row_start, row_end from t1_sp for system_time all order by row_end, y, x;
> +
> +if (!$SHOW_TIMESTAMPS)
> +{
> +--replace_regex /\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d/TIMESTAMP/
> +}
> +select *, row_start, row_end from t2_sp for system_time all order by row_end, y, x;
> +
> +delete history from t1_sp before system_time @t1;
> +delete history from t2_sp before system_time @t1;
> +
> +if (!$SHOW_TIMESTAMPS)
> +{
> +--replace_regex /\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d/TIMESTAMP/
> +}
> +select *, row_start, row_end from t1_sp for system_time all order by row_end, y, x;
> +
> +if (!$SHOW_TIMESTAMPS)
> +{
> +--replace_regex /\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d/TIMESTAMP/
> +}
> +select *, row_start, row_end from t2_sp for system_time all order by row_end, y, x;
> +
> +select * from t1 for system_time all;
> +select * from t2 for system_time all;
> +
> +delete history from t1_sp;
> +delete history from t2_sp;
> +
> +if (!$SHOW_TIMESTAMPS)
> +{
> +--replace_regex /\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d/TIMESTAMP/
> +}
> +select *, row_start, row_end from t1_sp for system_time all order by row_end, y, x;
> +
> +if (!$SHOW_TIMESTAMPS)
> +{
> +--replace_regex /\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d.\d\d\d\d\d\d/TIMESTAMP/
> +}
> +select *, row_start, row_end from t2_sp for system_time all order by row_end, y, x;
> +
> +select * from t1 for system_time all;
> +select * from t2 for system_time all;
> +
> +drop tables t1, t2, t1_sp, t2_sp;
> +set timestamp= default;
> +--source suite/versioning/common_finish.inc
> +
> +uninstall plugin spider;
> +drop table mysql.spider_link_failed_log, mysql.spider_link_mon_servers, mysql.spider_tables, mysql.spider_table_crd, mysql.spider_table_position_for_recovery, mysql.spider_table_sts, mysql.spider_xa, mysql.spider_xa_failed_log, mysql.spider_xa_member;
> +drop function spider_direct_sql;
> +drop function spider_bg_direct_sql;
> +drop function spider_ping_table;
> +drop function spider_copy_tables;
> +drop function spider_flush_table_mon_cache;
To deinitialize the spider plugin in spider tests we commonly do
--disable_query_log
--disable_result_log
--source ../t/test_deinit.inc
--enable_result_log
--enable_query_log
Please replace the lines from "uninstall plugin spider;" to the end of
the test with this block.
> diff --git a/storage/spider/spd_db_conn.cc b/storage/spider/spd_db_conn.cc
> index 8750e87237e..896fce8adf5 100644
> --- a/storage/spider/spd_db_conn.cc
> +++ b/storage/spider/spd_db_conn.cc
> @@ -1515,9 +1515,8 @@ int spider_db_append_key_where_internal(
> ) {
> SPIDER_RESULT_LIST *result_list = &spider->result_list;
> SPIDER_SHARE *share = spider->share;
> -#ifndef DBUG_OFF
> TABLE *table = spider->get_table();
> -#endif
Why remove the #ifndef and #endif?
> [... 99 lines elided]
Best,
Yuchen
1
1
2
2
Hello I wonder if anyone can help me out for this case
I'm trying to build an encryption plugin for mariaDB and I built this
https://gist.github.com/Mahmoudgalalz/ae1de7a57f8b9f37056efc2018575ce4
I managed to compile it, but I'm running into an issue when I'm trying to use encrypted option
MariaDB [test]> CREATE TABLE test_encrypted (id INT PRIMARY KEY, sensitive_data VARCHAR(255)) ENGINE=InnoDB ENCRYPTED=YES;
ERROR 2013 (HY000): Lost connection to server during query
MariaDB [test]> CREATE TABLE t1 (a VARCHAR(8)) ENGINE=InnoDB ENCRYPTED=YES ENCRYPTION_KEY_ID=1;
ERROR 2006 (HY000): Server has gone away
No connection. Trying to reconnect...
Connection id: 31
Current database: test
ERROR 2013 (HY000): Lost connection to server during query
so if anyone can help me out to figure out what is wrong and should I correct or consider would be very much appreciated.
2
4
Hello I wonder if anyone can help me out for this case
I'm trying to build an encryption plugin for mariaDB and I built this
https://gist.github.com/Mahmoudgalalz/ae1de7a57f8b9f37056efc2018575ce4
I managed to compile it, but I'm running into an issue when I'm trying to use encrypted option
MariaDB [test]> CREATE TABLE test_encrypted (id INT PRIMARY KEY, sensitive_data VARCHAR(255)) ENGINE=InnoDB ENCRYPTED=YES;
ERROR 2013 (HY000): Lost connection to server during query
MariaDB [test]> CREATE TABLE t1 (a VARCHAR(8)) ENGINE=InnoDB ENCRYPTED=YES ENCRYPTION_KEY_ID=1;
ERROR 2006 (HY000): Server has gone away
No connection. Trying to reconnect...
Connection id: 31
Current database: test
ERROR 2013 (HY000): Lost connection to server during query
so if anyone can help me out to figure out what is wrong and should I correct or consider would be very much appreciated.
1
0
Re: [PATCH] MDEV-32014 Rename binlog cache temporary file to binlog file for large transaction
by Kristian Nielsen 10 Sep '24
by Kristian Nielsen 10 Sep '24
10 Sep '24
> From: Libing Song <anders.slb(a)alibaba-inc.com>
>
> Description
> ===========
> When a transaction commits, it copies the binlog events from
> binlog cache to binlog file. Very large transactions
> (eg. gigabytes) can stall other transactions for a long time
> because the data is copied while holding LOCK_log, which blocks
> other commits from binlogging.
Hi Libing Song,
Here is my full review of the patch for MDEV-32014 Rename binlog cache
temporary file to binlog file for large transaction.
First let me say that thanks, reviewing this patch was a pleasant surprise.
The binlog code that you are integrating this new feature in is complex to
work with, but overall I think you have managed to do it a pretty clean way.
That is great work, and I think a lot of effort must have gone into getting
it this way. I have a number of comments and requests for changes below, so
I wanted to give this overall good impression up-front.
Also thanks to Brandon for doing first review. My apologies if I am
repeating something already discussed, but Github pull request reviews of
non-trivial patches are almost impossible to read, with reviews being split
in fragments, patches disappearing after force-pushes, etc.
The root cause for the problem being addressed by the patch is a fundamental
limitation in the current binlog format. It requires all transactions to be
binlogged as a single event group, using consecutive bytes in a single
binlog file. Thus, while writing the large transaction to the binlog during
commit, all other transaction commits will have to wait before they can
commit.
Eventually, we should move towards an enhanced binlog format that doesn't
have this limitation, so that transactions can be partially binlogged during
execution, avoiding this bottleneck. There are a number of other problems
caused by this limitation that are not addressed by this patch. Changing the
binlog format is a complex task not easily done, so this patch should be
seen as a temporary solution to one specific bottleneck, until if/when this
underlying limitation can hopefully be removed.
First some overall comments:
As Brandon also remarked, the name "binlog_free_flush" does not describe the
feature very clearly. I would suggest to use the term "rotate" in the name,
as the concept of binlog rotation is well known in MariaDB replication, and
the patch effectively performs large commits by doing it as a binlog
rotation instead of a simple write to the existing binlog file.
So I suggest to name the option --binlog-commit-by-rotate-threshold. And I
suggest in the code (names and comments) to use the phrase
"rotate_by_commit" instead of "free_flush".
The patch enables the feature by default, with a threshold of 128MB.
We need to consider if that is appropriate, or if the feature should be
disabled by default.
Reasons for disabling by default:
- The feature causes some binlog files to be cut short of the configured
--max-binlog-size, which will be a surprise to users.
- The feature adds an overhead of 4096 pre-allocated bytes at the start of
every trx cache (though the overhead is minimal).
- A default of 128MB is somewhat arbitrary.
- Releasing this feature disabled by default is somewhat less risky wrt.
regressions for users that upgrade.
Reasons to enable by default:
- Most users will otherwise not know the feature exists, and will thus not
benefit from this work.
- It will get much less testing "in the wild".
So the decision will be a compromise between these two. If I had to make the
choice, I would default to this feature being disabled. But I'm open to hear
opinions the other way.
The patch calls the trx_group_commit_leader() function. That is awkward, as
commit-by-rotate does not actually participate in group commit. But the
problem is with the existing function trx_group_commit_leader(), which does
too much in a single function.
So please do a small refactoring of trx_group_commit_leader() and keep only
the first part of that function that fetches the group commit queue. Move
the second part to a new function trx_group_commit_with_engines() that is
called by trx_group_commit_leader() passing the "queue" and "last_in_queue"
values. And assert in trx_group_commit_with_engines() that LOCK_log is held
on entry and comment that this function will release the lock.
Then you can from your Binlog_free_flush::commit() function call into
trx_group_commit_with_engines() directly, avoiding the need for adding a
true/false flag, and also avoiding that the trx_group_commit_leader()
function is sometimes called with LOCK_log held and sometimes not.
Same situation with MYSQL_BIN_LOG::write_transaction_to_binlog_events(),
this function is also doing too much and makes integrating your code
awkward.
Please refactor the MYSQL_BIN_LOG::write_transaction_to_binlog_events()
function something like this:
bool
MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
{
if (binlog_commit_by_rotate.should_commit_by_rotate(entry) &&
binlog_commit_by_rotate.commit(entry))
{
/* Commit done by renaming the trx/stmt cache. */
}
else
{
if (write_transaction_with_group_commit(entry))
return true; /* Error */
}
if (unlikely(entry->error))
{
write_transaction_handle_error(entry);
return 1;
}
return 0;
}
Introducing two new functions:
bool write_transaction_with_group_commit(group_commit_entry *entry)
This function contains the code from write_transaction_to_binlog_events() up
to and including this:
if (likely(!entry->error))
return entry->thd->wait_for_prior_commit();
You don't need the wait_for_prior_commit() in the commit-by-rotate case, as
you already did this in Binlog_free_flush::commit().
You also don't need to handle the (!opt_optimize_thread_scheduling) case for
commit-by-rotate, this is old code that is being deprecated. Just add a
condition in should_free_flush() to skip if opt_optimize_thread_scheduling
is set.
Other new function:
void write_transaction_handle_error(group_commit_entry *entry)
This function contains the remainder of the
write_transaction_to_binlog_events() function starting from
"switch (entry->error) ...".
This way, the commit-by-rotate case is cleanly separated from the normal
group commit case and the code of write_transaction_to_binlog_events()
becomes much cleaner.
Now follows more detailed comments on specific parts of the patch:
> diff --git a/sql/log.cc b/sql/log.cc
> index 34f9ad745fc..3dc57b21c05 100644
> --- a/sql/log.cc
> +++ b/sql/log.cc
> @@ -163,6 +163,111 @@ static SHOW_VAR binlog_status_vars_detail[]=
> + void set_reserved_bytes(uint32 header_len)
> + {
> + // Gtid event length
> + header_len+= LOG_EVENT_HEADER_LEN + Gtid_log_event::max_data_length +
> + BINLOG_CHECKSUM_LEN;
> + header_len= header_len - (header_len % IO_SIZE) + IO_SIZE;
This has an off-by-one error, it will round up IO_SIZE to 2*IO_SIZE.
Use eg. header_len = (header_len + (IO_SIZE - 1)) & ~(IO_SIZE - 1)
You can use static_assert((IO_SIZE & (IO_SIZE - 1)) == 0, "power of two")
if you want to catch if anyone would ever change IO_SIZE to be not a power
of two.
+private:
+ Binlog_free_flush &operator=(const Binlog_free_flush &);
+ Binlog_free_flush(const Binlog_free_flush &);
Add before this a comment "singleton object, prevent copying by mistake".
> + char m_cache_dir[FN_REFLEN];
This member appears to be unused?
> + /** The cache_data which will be renamed to binlog. */
> + binlog_cache_data *m_cache_data{nullptr};
Also appears unused.
Maybe add a comment that m_entry and m_replaced are protected by LOCK_log?
> +static Binlog_free_flush binlog_free_flush;
> +ulonglong opt_binlog_free_flush_threshold= 10 * 1024 * 1024;
This initializes the value to 10 MB, but the default value in sys_vars.cc is
128MB, so that seems inconsistent.
> @@ -4126,8 +4238,7 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
> /* Notify the io thread that binlog is rotated to a new file */
> if (is_relay_log)
> signal_relay_log_update();
> - else
> - update_binlog_end_pos();
> +
Why do you remove update_binlog_end_pos() here? Doesn't the binlog dump
threads need this update?
> @@ -8703,7 +8821,16 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
> bool
> MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
> {
> - int is_leader= queue_for_group_commit(entry);
> + int is_leader;
> +
> + if (binlog_free_flush.should_free_flush(entry) &&
> + binlog_free_flush.commit(entry))
> + {
> + is_leader= 1;
> + goto commit;
> + }
> +
> + is_leader= queue_for_group_commit(entry);
As described above, re-factor the write_transaction_to_binlog_events()
function to avoid this assignment of is_leader and the goto.
> @@ -8863,6 +8992,16 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
> uint64 commit_id;
> DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
>
> + /*
> + When move a binlog cache to a binlog file, the transaction itself is
> + a group.
> + */
> + if (unlikely(is_free_flush))
> + {
> + last_in_queue= leader;
> + queue= leader;
> + }
> + else
As described above, re-factor trx_group_commit_leader() to avoid this check
in trx_group_commit_leader(), instead calling directly into the relevant
code now in trx_group_commit_with_engines().
> +inline bool Binlog_free_flush::should_free_flush(
> + const MYSQL_BIN_LOG::group_commit_entry *entry) const
> +{
> + /* Do not free flush if total_bytes smaller than limit size. */
> + if (likely(cache_data->get_byte_position() <=
> + opt_binlog_free_flush_threshold))
> + return false;
Put this check as the first check in the function. So the common case of a
small transaction does not need to run any of the other checks.
And as suggested above, put a check:
if (unlikely(!opt_optimize_thread_scheduling))
return false;
(This is not critical, but just to make things a bit easier; there is no
reason for you/us to worry about making commit-by-rotate work with disabled
optimize_thread_scheduling, this option isn't used any more. Otherwise you'd
need to test that your patch works correctly with optimize_thread_scheduling
disabled, which would just be a waste of time.)
> +bool Binlog_free_flush::commit(MYSQL_BIN_LOG::group_commit_entry *entry)
> +{
> + // It will be released by trx_group_commit_leader
> + mysql_mutex_lock(&mysql_bin_log.LOCK_log);
After suggested changes, comment should read "It will be released by
trx_group_commit_with_engines()."
> + mysql_bin_log.trx_group_commit_leader(entry, true /* is_free_flush */);
As discussed above, change this to call a new function
mysql_bin_log.trx_group_commit_with_engines(), which contains the second
part of trx_group_commit_leader() that you need to run. Then there is no
longer a need to pass a `true` flag here to skip the first part.
Also, before the call, this is the place to set entry->next to nullptr, to
convert the stand-alone entry into a (singleton) list (instead of
initializing it in MYSQL_BIN_LOG::write_transaction_to_binlog().)
> @@ -8301,6 +8418,7 @@ MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
> DBUG_RETURN(0);
> }
>
> + entry.next= nullptr;
> entry.thd= thd;
> entry.cache_mngr= cache_mngr;
> entry.error= 0;
Do you need to initialize entry->next here, and if so why?
At this point, the entry is not a list, it is a stand-alone struct that is
not yet linked into any list, the next pointer should be set appropriately
when/if the struct is later put into a list.
More important than the minor cost of initializing, by redundantly
initializing here we are hiding bugs (eg. Valgrind/msan/asan) if some code
would incorrectly read the next pointer.
> +bool Binlog_free_flush::replace_binlog_file()
> +{
> + size_t binlog_size= my_b_tell(&mysql_bin_log.log_file);
> + size_t required_size= binlog_size;
> + // space for Gtid_log_event
> + required_size+= LOG_EVENT_HEADER_LEN + Gtid_log_event::max_data_length +
> + BINLOG_CHECKSUM_LEN;
Just a remark, here you actually know the real required size of the GTID
event, so you don't need to conservatively estimate
Gtid_log_event::max_data_length. But I think doing it this way is fine, it
is simpler and in most cases we are rounding up to IO_SIZE anyway, so I'm
fine with keeping it this way.
> + sql_print_information("Could not rename binlog cache to binlog, "
> + "require %llu bytes but only %llu bytes reserved.",
> + required_size, m_cache_data->file_reserved_bytes());
This log message will perhaps appear a bit out-of-context in the error log.
Suggested clarification:
"Could not rename binlog cache to binlog (as requested by
--binlog-commit-by-rotate-threshold). Required ..."
> + // Set the cache file as binlog file.
> + mysql_file_close(mysql_bin_log.log_file.file, MYF(MY_WME));
Do not pass MY_WME here (instead pass MYF(0)). Since you are ignoring the
return of the function, so we don't want mysql_file_close to call my_error()
leaving an error code set even though the operation succeeds.
> +size_t Binlog_free_flush::get_gtid_event_pad_size()
> +{
> + size_t begin_pos= my_b_tell(&mysql_bin_log.log_file);
> + size_t pad_to_size=
> + m_cache_data->file_reserved_bytes() - begin_pos - LOG_EVENT_HEADER_LEN;
> +
> + if (binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF)
> + pad_to_size-= BINLOG_CHECKSUM_LEN;
Adjusting with BINLOG_CHECKSUM_LEN is to account for the 4 bytes of checksum
at the end of the GTID_EVENT, right?
But why do you subtract LOG_EVENT_HEADER_LEN when calculating pad_to_size?
(I mean, it is probably correct, otherwise nothing would work I think. I
just did not understand it, maybe you can explain in a short comment. Or
maybe Gtid_log_event::write() could be modified to do the adjusting for
LOG_EVENT_HEADER_LEN and BINLOG_CHECKSUM_LEN).
> + // offset must be saved before replace_binlog_file(), it will update the pos
> + my_off_t offset= my_b_tell(&log_file);
Suggest to clarify comment to "..., it will update the file position".
> @@ -756,18 +759,20 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
> new_file() is locking. new_file_without_locking() does not acquire
> LOCK_log.
> */
> - int new_file_impl();
> + int new_file_impl(bool is_free_flush= false);
> void do_checkpoint_request(ulong binlog_id);
> - int write_transaction_or_stmt(group_commit_entry *entry, uint64 commit_id);
> + int write_transaction_or_stmt(group_commit_entry *entry, uint64 commit_id,
> + bool is_free_flush= false);
> int queue_for_group_commit(group_commit_entry *entry);
> bool write_transaction_to_binlog_events(group_commit_entry *entry);
> - void trx_group_commit_leader(group_commit_entry *leader);
> + void trx_group_commit_leader(group_commit_entry *leader,
> + bool is_free_flush= false);
> bool is_xidlist_idle_nolock();
> void update_gtid_index(uint32 offset, rpl_gtid gtid);
>
> public:
> void purge(bool all);
> - int new_file_without_locking();
> + int new_file_without_locking(bool is_free_flush= false);
> /*
> A list of struct xid_count_per_binlog is used to keep track of how many
> XIDs are in prepared, but not committed, state in each binlog. And how
> @@ -997,7 +1002,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
> enum cache_type io_cache_type_arg,
> ulong max_size,
> bool null_created,
> - bool need_mutex);
> + bool need_mutex,
> + bool is_free_flush = false);
> bool open_index_file(const char *index_file_name_arg,
> const char *log_name, bool need_mutex);
> /* Use this to start writing a new log file */
> @@ -1037,7 +1043,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
> bool is_active(const char* log_file_name);
> bool can_purge_log(const char *log_file_name, bool interactive);
> int update_log_index(LOG_INFO* linfo, bool need_update_threads);
> - int rotate(bool force_rotate, bool* check_purge);
> + int rotate(bool force_rotate, bool* check_purge, bool is_free_flush= false);
> void checkpoint_and_purge(ulong binlog_id);
> int rotate_and_purge(bool force_rotate, DYNAMIC_ARRAY* drop_gtid_domain= NULL);
> /**
Style preference: please add these new boolean parameters without
default value, and instead change existing callers to pass false explicitly.
(And the parameter will be "commit_by_rotate" after name change.)
> +
> +extern ulonglong opt_binlog_free_flush_threshold;
> +static Sys_var_ulonglong Sys_binlog_free_flush_threshold(
> + "binlog_free_flush_threshold",
> + "Try to rename the binlog cache temporary file of the commiting "
> + "transaction to a binlog file when its binlog cache size "
> + "is bigger than the value of this variable",
> + GLOBAL_VAR(opt_binlog_free_flush_threshold),
> + CMD_LINE(REQUIRED_ARG), VALID_RANGE(10 * 1024 * 1024, ULLONG_MAX),
> + DEFAULT(128 * 1024 * 1024), BLOCK_SIZE(1));
As described above:
- Suggested name: "binlog_commit_by_rotate_threshold".
- Suggested text: "For transactions that exceed this size in the binlog,
commit by a more efficient method that forces a binlog rotation and
re-uses the pre-existing temporary file as the next binlog file."
- Let me hear opinions on whether this should default to 128 MB or should
default to eg. ULONGLONG_MAX to disable the feature by default.
> diff --git a/mysql-test/main/tmp_space_usage.test b/mysql-test/main/tmp_space_usage.test
> index af7b295f343..1685dbbc450 100644
> --- a/mysql-test/main/tmp_space_usage.test
> +++ b/mysql-test/main/tmp_space_usage.test
> @@ -215,7 +215,8 @@ select count(distinct concat(seq,repeat('x',1000))) from seq_1_to_1000;
>
> set @save_max_tmp_total_space_usage=@@global.max_tmp_total_space_usage;
> set @@global.max_tmp_total_space_usage=64*1024*1024;
> -set @@max_tmp_session_space_usage=1179648;
> +# Binlog cache reserve 4096 bytes at the begin of the temporary file.
> +set @@max_tmp_session_space_usage=1179648+65536;
> select @@max_tmp_session_space_usage;
> set @save_aria_repair_threads=@@aria_repair_threads;
> set @@aria_repair_threads=2;
> @@ -224,6 +225,7 @@ set @@max_heap_table_size=16777216;
>
> CREATE TABLE t1 (a CHAR(255),b INT,INDEX (b));
> INSERT INTO t1 SELECT SEQ,SEQ FROM seq_1_to_100000;
> +set @@max_tmp_session_space_usage=1179648;
> --error 200
> SELECT * FROM t1 UNION SELECT * FROM t1;
> DROP TABLE t1;
> @@ -266,11 +268,15 @@ SELECT MIN(VARIABLE_VALUE) OVER (), NTILE(1) OVER (), MAX(VARIABLE_NAME) OVER ()
>
> connect(c1, localhost, root,,);
> set @@binlog_format=row;
> -CREATE OR REPLACE TABLE t1 (a DATETIME) ENGINE=MyISAM;
> +CREATE OR REPLACE TABLE t1 (a DATETIME) ENGINE=InnoDB;
> +# Use the transaction to keep binlog cache temporary file large enough
> +BEGIN;
> INSERT INTO t1 SELECT NOW() FROM seq_1_to_6000;
> +
> SET max_tmp_session_space_usage = 64*1024;
> --error 200
> SELECT * FROM information_schema.ALL_PLUGINS LIMIT 2;
> +ROLLBACK;
> drop table t1;
Can you explain why you need these changes to the test case? I was not able
to understand that from the patch.
With these changes, I approve of this patch / pull-request to merge into the
next MariaDB release.
- Kristian.
3
3
Re: bd616a3733c: Auth: set thd->scramble in all cases during writing Initial Handshake Packet
by Sergei Golubchik 10 Sep '24
by Sergei Golubchik 10 Sep '24
10 Sep '24
Hi, Nikita,
On Sep 09, Nikita Malyavin wrote:
> revision-id: bd616a3733c (mariadb-11.6.1-13-gbd616a3733c)
> parent(s): bbbb429a1eb
> author: Nikita Malyavin
> committer: Nikita Malyavin
> timestamp: 2024-09-07 20:46:19 +0200
> message:
>
> Auth: set thd->scramble in all cases during writing Initial Handshake Packet
>
> ---
> sql/sql_acl.cc | 19 +++++++++++++------
> 1 file changed, 13 insertions(+), 6 deletions(-)
>
> diff --git a/sql/sql_acl.cc b/sql/sql_acl.cc
> index 93efc5806cc..7845af55413 100644
> --- a/sql/sql_acl.cc
> +++ b/sql/sql_acl.cc
> @@ -13504,17 +13504,21 @@ static bool send_server_handshake_packet(MPVIO_EXT *mpvio,
> mpvio->cached_server_packet.pkt_len= data_len;
> }
>
> - if (data_len < SCRAMBLE_LENGTH)
> + if (thd->scramble[SCRAMBLE_LENGTH] != 0)
old code was obviously wrong
but the new one makes no sense either. How can
thd->scramble[SCRAMBLE_LENGTH] be not zero at this point?
I'd think you need to execute the code below unconditionally.
> {
> + DBUG_ASSERT(thd->scramble[SCRAMBLE_LENGTH] == 1); // Sanity
> +
> if (data_len)
> {
> /*
> the first packet *must* have at least 20 bytes of a scramble.
> - if a plugin provided less, we pad it to 20 with zeros
> + if a plugin provided less, we pad it to 20 with zeros,
> + plus extra zero termination sign is put in thd->scramble.
> + If more is provided, we'll use only 20 bytes as a handshake scramble.
Not sure it's a good idea. Why not send the complete scramble?
True, it won't fit on the client side into mysql->scramble[].
Two ways to fix it:
* Write the first 20 bytes. The server will only store the first 20
bytes too, and for COM_CHANGE_USER they'll use the first 20 bytes
only (for the login auth they can use the complete scramble).
That's a rather hackish fix, a proper fix would be
* store the complete scramble in MYSQL. The first 20 bytes in
MYSQL::scramble_buff, the rest in MySQL::extension.scramble_suffix
> */
> - memcpy(scramble_buf, data, data_len);
> - bzero(scramble_buf + data_len, SCRAMBLE_LENGTH - data_len);
> - data= scramble_buf;
> + size_t fill_size= MY_MIN(SCRAMBLE_LENGTH, data_len);
> + memcpy(thd->scramble, data, fill_size);
> + bzero(thd->scramble + fill_size, SCRAMBLE_LENGTH - fill_size + 1);
> }
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org
2
3
Re: 982bf06d560: MDEV-12320 configurable default authentication plugin for the server
by Sergei Golubchik 09 Sep '24
by Sergei Golubchik 09 Sep '24
09 Sep '24
Hi, Nikita,
This looks good. Minor comments below.
On Sep 09, Nikita Malyavin wrote:
> revision-id: 982bf06d560 (mariadb-11.6.1-14-g982bf06d560)
> parent(s): bd616a3733c
> author: Nikita Malyavin
> committer: Nikita Malyavin
> timestamp: 2024-09-07 21:44:13 +0200
> message:
>
> MDEV-12320 configurable default authentication plugin for the server
>
> * Add a new cmdline-only variable "default_auth_plugin".
> * A default plugin is locked at the server init and unlocked at the deinit
> stages. This means that mysql_native_password and old_password_plugin, when
> default, are locked/unlocked twice.
doesn't matter, compiled-in plugins are only locked in debug builds,
otherwise it's a no-op.
>
> --- a/sql/sys_vars.cc
> +++ b/sql/sys_vars.cc
> @@ -4538,6 +4538,14 @@ static Sys_var_plugin Sys_enforce_storage_engine(
> DEFAULT(&enforced_storage_engine), NO_MUTEX_GUARD, NOT_IN_BINLOG,
> ON_CHECK(check_has_super));
>
> +extern const char *default_auth_plugin_name;
> +extern LEX_CSTRING native_password_plugin_name;
Is it ok? The correct type is Lex_ident_plugin.
> +static Sys_var_charptr Sys_default_auth_plugin(
there's also Sys_var_lexstring, if you prefer that.
> + "default_auth_plugin", "Default plugin, that will be tried first when authenticating new connections",
reformat the long line, please
> + READ_ONLY GLOBAL_VAR(default_auth_plugin_name), CMD_LINE(OPT_ARG),
> + DEFAULT(native_password_plugin_name.str),
> + NO_MUTEX_GUARD, NOT_IN_BINLOG);
> --- a/sql/sql_acl.cc
> +++ b/sql/sql_acl.cc
> @@ -2543,9 +2542,20 @@ bool acl_init(bool dont_read_acl_tables)
> old_password_plugin= my_plugin_lock_by_name(0,
> &old_password_plugin_name, MYSQL_AUTHENTICATION_PLUGIN);
>
> + Lex_cstring_strlen def_plugin_name(default_auth_plugin_name);
> + default_auth_plugin= my_plugin_lock_by_name(NULL, &def_plugin_name,
> + MYSQL_AUTHENTICATION_PLUGIN);
> +
> if (!native_password_plugin || !old_password_plugin)
> DBUG_RETURN(1);
>
> + if (!default_auth_plugin)
> + {
> + sql_print_error("Default plugin %s could not be loaded",
> + default_auth_plugin_name);
see init_default_storage_engine() in mysqld.cc - it's for
--default-storage-engine option.
let's use similar wording for consistency:
sql_print_error("Unknown/unsupported authentication plugin: %s",
default_auth_plugin_name);
> + DBUG_RETURN(1);
> + }
> +
> if (dont_read_acl_tables)
> {
> DBUG_RETURN(0); /* purecov: tested */
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org
1
0
Re: bbbb429a1eb: Auth: release the plugins from responsibility to fill scramble_buff.
by Sergei Golubchik 09 Sep '24
by Sergei Golubchik 09 Sep '24
09 Sep '24
Hi, Nikita,
Conceptually ok, see one omission below.
On Sep 09, Nikita Malyavin wrote:
> revision-id: bbbb429a1eb (mariadb-11.6.1-12-gbbbb429a1eb)
> parent(s): db5d1cde450
> author: Nikita Malyavin
> committer: Nikita Malyavin
> timestamp: 2024-09-07 20:44:02 +0200
> message:
>
> Auth: release the plugins from responsibility to fill scramble_buff.
>
> Fill it once during the Initial Handshake Packet parsing. This uses the fact
> that the server guarantees first 20 bytes to be the scramble.
>
> Submodule libmariadb de6305915f8..12d78187061:
> diff --git a/libmariadb/libmariadb/mariadb_lib.c b/libmariadb/libmariadb/mariadb_lib.c
> index 78195d44..07953ea4 100644
> --- a/libmariadb/libmariadb/mariadb_lib.c
> +++ b/libmariadb/libmariadb/mariadb_lib.c
> @@ -1954,6 +1954,8 @@ restart:
> goto error;
> }
> }
> + memmove(mysql->scramble_buff, scramble_data, SCRAMBLE_LENGTH);
> + mysql->scramble_buff[SCRAMBLE_LENGTH]= 0;
> /* Set character set */
> if (mysql->options.charset_name)
> diff --git a/libmariadb/plugins/auth/my_auth.c b/libmariadb/plugins/auth/my_auth.c
> index a2fd519d..47d26150 100644
> --- a/libmariadb/plugins/auth/my_auth.c
> +++ b/libmariadb/plugins/auth/my_auth.c
> @@ -126,10 +126,6 @@ static int native_password_auth_client(MYSQL_PLUGIN_VIO *vio, MYSQL *mysql)
>
> if (pkt_len != SCRAMBLE_LENGTH + 1)
> return CR_SERVER_HANDSHAKE_ERR;
> -
> - /* save it in MYSQL */
> - memmove(mysql->scramble_buff, pkt, SCRAMBLE_LENGTH);
> - mysql->scramble_buff[SCRAMBLE_LENGTH] = 0;
This looks ok
> }
>
> if (mysql && mysql->passwd[0])
> diff --git a/sql-common/client.c b/sql-common/client.c
> index 6d030ce0a17..28b477c1e9a 100644
> --- a/sql-common/client.c
> +++ b/sql-common/client.c
> @@ -4178,10 +4178,6 @@ static int native_password_auth_client(MYSQL_PLUGIN_VIO *vio, MYSQL *mysql)
>
> if (pkt_len != SCRAMBLE_LENGTH + 1)
> DBUG_RETURN(CR_SERVER_HANDSHAKE_ERR);
> -
> - /* save it in MYSQL */
> - memcpy(mysql->scramble, pkt, SCRAMBLE_LENGTH);
> - mysql->scramble[SCRAMBLE_LENGTH] = 0;
this isn't, you removed memcpy from the plugin, but didn't add it to
mysql_real_connect in sql-common/client.c.
> }
>
> if (mysql->passwd[0])
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org
2
1
Re: b96afca238a: MDEV-16440 merge 5.7 P_S sysvars instrumentation and tables
by Sergei Golubchik 08 Sep '24
by Sergei Golubchik 08 Sep '24
08 Sep '24
Hi, Nikita,
Despite the email subject, it's a review of everything in
bb-10.10-MDEV-16440, *except* 9c57bbfef92. That is
37e1ccbf678..c16b2429f02 without notifiable_work_zone.h
I'll send a separate email about notifiable_work_zone.h
On Sep 08, Nikita Malyavin wrote:
> revision-id: b96afca238a (mariadb-10.6.1-472-gb96afca238a)
> parent(s): 37e1ccbf678
> author: Nikita Malyavin
> committer: Nikita Malyavin
> timestamp: 2022-06-19 14:52:30 +0300
> message:
>
> MDEV-16440 merge 5.7 P_S sysvars instrumentation and tables
> diff --git a/mysql-test/suite/perfschema/t/variables_stress.test b/mysql-test/suite/perfschema/t/variables_stress.test
> new file mode 100644
> index 00000000000..be3012bf8af
> --- /dev/null
> +++ b/mysql-test/suite/perfschema/t/variables_stress.test
> @@ -0,0 +1,71 @@
stress tests generally don't belong in mtr and aren't reliable enough
for the mtr. Also, you forgot to check in the result file for it.
on the other hand, looking at the patch I got the impression that
this stress test helped to find a lot of bugs.
> +let $i = 64;
> +enable_connect_log;
> +while ($i) {
> + echo i = $i;
> + connect(con_$i, localhost, root);
> +# set debug_sync = "tp_hanger wait_for banger";
> + send set debug_dbug = "+d,tp_wanger";
> + dec $i;
> +}
> +connection default;
> +
> +SELECT THREAD_ID, PROCESSLIST_ID FROM performance_schema.threads;
> +set debug_sync = "now signal banger";
> +connect(banger_1, localhost, root);
> +let $banger1= `SELECT THREAD_ID FROM performance_schema.threads WHERE PROCESSLIST_ID = CONNECTION_ID()`;
> +send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> +connect(banger_2, localhost, root);
> +let $banger2= `SELECT THREAD_ID FROM performance_schema.threads WHERE PROCESSLIST_ID = CONNECTION_ID()`;
> +send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> +connect(banger_3, localhost, root);
> +let $banger3= `SELECT THREAD_ID FROM performance_schema.threads WHERE PROCESSLIST_ID = CONNECTION_ID()`;
> +send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> +connect(banger_4, localhost, root);
> +let $banger4= `SELECT THREAD_ID FROM performance_schema.threads WHERE PROCESSLIST_ID = CONNECTION_ID()`;
> +send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> +connect(banger_5, localhost, root);
> +let $banger5= `SELECT THREAD_ID FROM performance_schema.threads WHERE PROCESSLIST_ID = CONNECTION_ID()`;
> +
> +send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> +
> +let $i = 2000;
> +let $j=65;
> +while ($i) {
> + echo i = $i;
> + connection banger_1;
> + reap;
> + send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> + connection banger_2;
> + reap;
> + send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> + connection banger_3;
> + reap;
> + send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> + connection banger_4;
> + reap;
> + send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> + connection banger_5;
> + reap;
> + send select count(*) from performance_schema.variables_by_thread where variable_name="time_zone";
> + dec $i;
> +
> + let $threads= `show status where variable_name = 'threads_connected'`;
> + if ($threads < 63) {
> + connect(con_$i, localhost, root);
> + send set debug_dbug = "+d,tp_wanger";
> + inc $j;
> + }
> +}
> +reap;
> +select * from performance_schema.variables_by_thread where variable_name="time_zone";
> +echo $banger1;
> +echo $banger2;
> +echo $banger3;
> +echo $banger4;
> +echo $banger5;
> +# eval select * from performance_schema.variables_by_thread where variable_name like "%id" and thread_id=197;
> +connection default;
> +# set global debug_dbug = "";
> +set debug_sync = reset;
> +
> +show status where variable_name = 'threads_connected';
> diff --git a/sql/sql_plugin.h b/sql/sql_plugin.h
> index d4df8c6468f..3d02d1c7509 100644
> --- a/sql/sql_plugin.h
> +++ b/sql/sql_plugin.h
> @@ -186,6 +186,10 @@ extern SHOW_COMP_OPTION plugin_status(const char *name, size_t len, int type);
> extern bool check_valid_path(const char *path, size_t length);
> extern void plugin_mutex_init();
>
> +template <class T> class Dynamic_array;
you can simply include sql_array.h, can't you?
> +
> +void plugin_lock_by_sys_var_array(THD *thd, Dynamic_array<SHOW_VAR> *vars);
> +
> typedef my_bool (plugin_foreach_func)(THD *thd,
> plugin_ref plugin,
> void *arg);
> diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
> index 0597b086b7b..ec86389d08f 100644
> --- a/sql/sql_parse.cc
> +++ b/sql/sql_parse.cc
> @@ -1296,6 +1296,9 @@ dispatch_command_return do_command(THD *thd, bool blocking)
> goto out;
> }
>
> + if (unlikely(thd->apc_target.have_apc_requests()))
> + thd->apc_target.process_apc_requests();
1. why here?
2. there generally shouldn't be direct checks of apc requests,
the code should do check_killed() often enough. So if you really need
a check here (see question 1), add if (thd->check_killed())...
> +
> packet= (char*) net->read_pos;
> /*
> 'packet_length' contains length of data, as it was stored in packet
> diff --git a/vio/viopipe.c b/vio/viopipe.c
> index aeaad311b7e..5106c6d6513 100644
> --- a/vio/viopipe.c
> +++ b/vio/viopipe.c
> @@ -53,8 +53,9 @@ static size_t wait_overlapped_result(Vio *vio, int timeout)
> }
> else
> {
> + DWORD last_error= GetLastError();
> /* Error or timeout, cancel the pending I/O operation. */
> - CancelIo(vio->hPipe);
> + CancelIoEx(vio->hPipe, &vio->overlapped);
why, CancelIoEx resets last error, but CancelIo doesn't?
> /*
> If the wait timed out, set error code to indicate a
> diff --git a/sql/threadpool_win.cc b/sql/threadpool_win.cc
> index ed68e31c755..25844d1f3ef 100644
> --- a/sql/threadpool_win.cc
> +++ b/sql/threadpool_win.cc
> @@ -131,6 +131,11 @@ void TP_pool_win::resume(TP_connection* c)
> SubmitThreadpoolWork(((TP_connection_win*)c)->work);
> }
>
> +int TP_pool_win::wake(TP_connection *c)
> +{
> + return 0;
> +}
How is that supposed to work?
How do you wake up threadpool threads on Windows?
> +
> #define CHECK_ALLOC_ERROR(op) \
> do \
> { \
> diff --git a/mysys/my_thr_init.c b/mysys/my_thr_init.c
> index cde34e5bdb9..89f190ee989 100644
> --- a/mysys/my_thr_init.c
> +++ b/mysys/my_thr_init.c
> @@ -432,9 +432,21 @@ const char *my_thread_name(void)
> extern void **my_thread_var_dbug()
> {
> struct st_my_thread_var *tmp;
> + int last_error;
> if (!my_thread_global_init_done)
> return NULL;
> +#ifdef WIN32
> + last_error= GetLastError();
> +#else
> + last_error= errno;
> +#endif
> +
> tmp= my_thread_var;
> +#ifdef WIN32
> + SetLastError(last_error);
> +#else
> + errno= last_error;
> +#endif
huh? my_thread_var can change errno?
> return tmp && tmp->init ? &tmp->dbug : 0;
> }
> #endif /* DBUG_OFF */
> diff --git a/storage/perfschema/pfs_variable.h b/storage/perfschema/pfs_variable.h
> index e59b02f2af8..f509e97048c 100644
> --- a/storage/perfschema/pfs_variable.h
> +++ b/storage/perfschema/pfs_variable.h
> @@ -212,7 +212,7 @@ class Find_THD_variable : public Find_THD_Impl
> return false;
>
> /* Hold this lock to keep THD during materialization. */
> - mysql_mutex_lock(&thd->LOCK_thd_data);
> + mysql_mutex_lock(&thd->LOCK_thd_kill);
see commits 736a54f49c72 and c6c2a2b8d463
> return true;
> }
> void set_unsafe_thd(THD *unsafe_thd) { m_unsafe_thd= unsafe_thd; }
> diff --git a/sql/threadpool_generic.h b/sql/threadpool_generic.h
> index b7a35b7cbf0..e16ec4a4966 100644
> --- a/sql/threadpool_generic.h
> +++ b/sql/threadpool_generic.h
> @@ -88,9 +89,17 @@ struct TP_connection_generic :public TP_connection
> ulonglong abs_wait_timeout;
> ulonglong enqueue_time;
> TP_file_handle fd;
> + /**
> + Designates whether fd is currently connected to the poll denoted by
> + thread_group->pollfd. See also change_group.
> + */
> bool bound_to_poll_descriptor;
> int waiting;
> bool fix_group;
> +#ifndef WIN32
> + Notifiable_work_zone work_zone;
> + bool leave_work_zone() final;
> +#endif
Do you need this #ifndef? WIN32 is *never* defined, it's _WIN32.
and _WIN32 is never defined *here*, on Windows we use TP_connection_win.
> #ifdef _WIN32
> win_aiosocket win_sock{};
> void init_vio(st_vio *vio) override
> diff --git a/sql/sql_class.cc b/sql/sql_class.cc
> index a0ba1d7e5a3..bfe49cddbf4 100644
> --- a/sql/sql_class.cc
> +++ b/sql/sql_class.cc
> @@ -1909,13 +1910,13 @@ void THD::awake_no_mutex(killed_state state_to_set)
> }
>
> /* Broadcast a condition to kick the target if it is waiting on it. */
> -void THD::abort_current_cond_wait(bool force)
> +void THD::abort_current_cond_wait(bool force, bool mark_abort)
a bit illogical, these values aren't orthogonal.
you want a 3-state enum, like
ABORT_NONE, ABORT_CONNECTIONS_BUT_NOT_SYSTEM_THREADS, ABORT_ALL
> {
> mysql_mutex_assert_owner(&LOCK_thd_kill);
> if (mysys_var)
> {
> mysql_mutex_lock(&mysys_var->mutex);
> - if (!system_thread || force) // Don't abort locks
> + if (mark_abort && (!system_thread || force)) // Don't abort locks
> mysys_var->abort=1;
>
> /*
> @@ -4373,6 +4374,14 @@ my_bool thd_net_is_killed(THD *thd)
> return thd && thd->killed ? 1 : 0;
> }
>
> +/* returns true if any APC was processed */
> +void thd_net_process_apc_requests(THD *thd)
even though it's called from net_serv.cc,
there's nothing about NET here, so calling it
thd_net_process_apc_requests is misleading
> +{
> + if (unlikely(thd->apc_target.have_apc_requests()))
> + thd->apc_target.process_apc_requests();
you also call it from the threadpool_common and
a couple of times you have these two lines verbatim
in other places (e.g. sql_parse.cc)
why couldn't you call thd->check_killed() instead?
> + DEBUG_SYNC(thd, "net_after_apc");
> +}
> +
>
> void thd_increment_bytes_received(void *thd, size_t length)
> {
> diff --git a/sql/my_apc.h b/sql/my_apc.h
> index cc98e36bbe4..ae60bdc2763 100644
> --- a/sql/my_apc.h
> +++ b/sql/my_apc.h
> @@ -81,7 +83,7 @@ class Apc_target
> */
> inline bool have_apc_requests()
> {
> - return MY_TEST(apc_calls);
> + return MY_TEST(apc_calls.load(std::memory_order_acquire));
why?
> }
>
> inline bool is_enabled() { return enabled; }
> @@ -95,14 +97,19 @@ class Apc_target
> virtual ~Apc_call() {}
> };
>
> + class Call_request;
> /* Make a call in the target thread (see function definition for details) */
> - bool make_apc_call(THD *caller_thd, Apc_call *call, int timeout_sec, bool *timed_out);
> + bool make_apc_call(THD *caller_thd, Apc_call *call,
> + int timeout_sec, bool *timed_out);
> +
> + void enqueue_request(Call_request *request_buff, Apc_call *call);
> + int wait_for_completion(THD *caller_thd, Call_request *request,
> + int timeout_sec);
You wrote
> In pfs_variable.cc we will have to make an additional step between
> enqueue_request and wait_for_completion. These two functions will be called
> directly and therefore both should have a public interface
and I don't think I quite agree with that. Your "additional step"
seems to be abort_current_cond_wait() and notify_apc(). It's not an exotic
additional step that only pfs_variable.cc needs; anyone who makes an apc
call needs the target thread to wake up and process it. It's something
that should be part of apc code, not done only in pfs_variable.cc
And this case you likely won't need enqueue_request and wait_for_completion
as a public api
>
> #ifndef DBUG_OFF
> int n_calls_processed; /* Number of calls served by this target */
> #endif
> private:
> - class Call_request;
>
> /*
> Non-zero value means we're enabled. It's an int, not bool, because one can
> diff --git a/sql/threadpool.h b/sql/threadpool.h
> index 7737d056b4a..08e85d362e7 100644
> --- a/sql/threadpool.h
> +++ b/sql/threadpool.h
> @@ -112,6 +119,8 @@ struct TP_connection
>
> /* Read for the next client command (async) with specified timeout */
> virtual int start_io() = 0;
> + IF_WIN(,virtual) bool leave_work_zone(){ return true; }
> + IF_WIN(Notifiable_work_zone work_zone;,) // Dummy object.
Ugh, this is unreadable. IF_WIN was supposed to be used in expressions.
And why do you need virtual methods here, again?
>
> virtual void wait_begin(int type)= 0;
> virtual void wait_end() = 0;
> diff --git a/sql/threadpool_generic.cc b/sql/threadpool_generic.cc
> index eb08441a4d5..f61dd90606b 100644
> --- a/sql/threadpool_generic.cc
> +++ b/sql/threadpool_generic.cc
> @@ -114,7 +114,8 @@ struct pool_timer_t
>
> static pool_timer_t pool_timer;
>
> -static void queue_put(thread_group_t *thread_group, TP_connection_generic *connection);
> +IF_DBUG(,static)
you can remove this IF_DBUG, I don't see you use queue_put outside of this
file.
> +void queue_put(thread_group_t *thread_group, TP_connection_generic *connection);
> static void queue_put(thread_group_t *thread_group, native_event *ev, int cnt);
> static int wake_thread(thread_group_t *thread_group,bool due_to_stall);
> static int wake_or_create_thread(thread_group_t *thread_group, bool due_to_stall=false);
> @@ -1485,8 +1520,41 @@ static int change_group(TP_connection_generic *c,
> }
>
>
> +#ifndef WIN32
> +bool TP_connection_generic::leave_work_zone()
> +{
> + auto leave_state= work_zone.try_leave();
> + while (unlikely(leave_state == Notifiable_work_zone::SIGNAL))
> + {
> + if (thd->apc_target.have_apc_requests())
> + thd->apc_target.process_apc_requests();
> + leave_state= work_zone.try_leave();
> + }
> + return leave_state == Notifiable_work_zone::OWNER;
> +}
> +#endif
> +
> +#ifndef DBUG_OFF
> +#include <random>
> +static thread_local std::mt19937 mt_random{std::random_device()()};
> +static thread_local std::uniform_int_distribution<int> kill_dist{1, 100000};
> +void queue_put(thread_group_t *thread_group, TP_connection_generic *connection);
> +void tp_send(TP_connection_type* c);
> +#endif
> +
> int TP_connection_generic::start_io()
> {
> + if(DBUG_IF("tp_wanger"))
> + {
> + bool survive= kill_dist(mt_random) != 555;
does that compile? you define kill_dist under #ifndef DBUG_OFF.
You can fix this with, like
#ifndef DBUG_OFF
if (DBUG_IF("tp_wanger"))
{
static thread_local std::mt19937 ...
etc
}
#endif
> + if (survive)
> + {
> + state= TP_STATE_INTERRUPTED;
> + tp_send(this);
> + }
> + return survive ? 0 : 1;
> + }
> +
> /*
> Usually, connection will stay in the same group for the entire
> connection's life. However, we do allow group_count to
> diff --git a/vio/viosocket.c b/vio/viosocket.c
> index 002ff274b74..45c3d80a33d 100644
> --- a/vio/viosocket.c
> +++ b/vio/viosocket.c
> @@ -909,20 +916,53 @@ static my_bool socket_peek_read(Vio *vio, uint *bytes)
> */
>
> #ifndef _WIN32
> +#ifdef _GNU_SOURCE
> +#define PFD_SIZE 1
> +static inline int vio_poll(struct pollfd pfd[], nfds_t nr, int timeout)
> +{
> + struct timespec tm, *tm_arg= NULL;
> + sigset_t signals;
> + /* Convert the timeout, in milliseconds, to seconds and microseconds. */
> + if (timeout >= 0)
> + {
> + tm.tv_sec= timeout / 1000;
> + tm.tv_nsec= (timeout % 1000) * 1000 * 1000;
> + tm_arg= &tm;
> + }
> + sigemptyset(&signals);
> +
> + return ppoll(pfd, 1, tm_arg, &signals);
> +}
> +#else
> +#define PFD_SIZE 2
> +#define vio_poll poll
> +#endif
why do you need that? How is your vio_poll() different from poll() ?
> +
> int vio_io_wait(Vio *vio, enum enum_vio_io_event event, int timeout)
> {
> int ret;
> short revents __attribute__((unused)) = 0;
> - struct pollfd pfd;
> + const short revents_read = MY_POLL_SET_IN | MY_POLL_SET_ERR | POLLRDHUP;
> + struct pollfd pfd[PFD_SIZE];
> my_socket sd= mysql_socket_getfd(vio->mysql_socket);
> + int pfd_size;
> MYSQL_SOCKET_WAIT_VARIABLES(locker, state) /* no ';' */
> DBUG_ENTER("vio_io_wait");
> DBUG_PRINT("enter", ("timeout: %d", timeout));
>
> - memset(&pfd, 0, sizeof(pfd));
> + memset(pfd, 0, sizeof(pfd));
>
> - pfd.fd= sd;
> + pfd[0].fd= sd;
>
> +#ifndef _GNU_SOURCE
> + my_socket self_pipe= threadlocal_get_self_pipe();
> + if (self_pipe)
> + {
> + pfd[1].fd= self_pipe;
> + pfd[1].events= MY_POLL_SET_IN;
> + }
> +#endif
> + pfd_size= PFD_SIZE;
really? I'd expect it to be essentially
pfd_size= self_pipe ? 2 : 1;
> /*
> Set the poll bitmask describing the type of events.
> The error flags are only valid in the revents bitmask.
> @@ -970,16 +1023,16 @@ int vio_io_wait(Vio *vio, enum enum_vio_io_event event, int timeout)
> }
>
> #else
> -
> int vio_io_wait(Vio *vio, enum enum_vio_io_event event, int timeout)
> {
> int ret;
> - struct timeval tm;
> - my_socket fd= mysql_socket_getfd(vio->mysql_socket);
> - fd_set readfds, writefds, exceptfds;
> MYSQL_SOCKET_WAIT_VARIABLES(locker, state) /* no ';' */
> DBUG_ENTER("vio_io_wait");
>
> + /* Not read , e.g connect or write - use select() based wait */
> + struct timeval tm;
> + my_socket fd= mysql_socket_getfd(vio->mysql_socket);
> + fd_set readfds, writefds, exceptfds;
it's C, you cannot mix declarations and code
> /* Convert the timeout, in milliseconds, to seconds and microseconds. */
> if (timeout >= 0)
> {
> @@ -1013,14 +1066,17 @@ int vio_io_wait(Vio *vio, enum enum_vio_io_event event, int timeout)
> ret= select(0, &readfds, &writefds, &exceptfds, (timeout >= 0) ? &tm : NULL);
>
> END_SOCKET_WAIT(locker, timeout);
> /* Set error code to indicate a timeout error. */
> if (ret == 0)
> WSASetLastError(SOCKET_ETIMEDOUT);
>
> /* Error or timeout? */
> - if (ret <= 0)
> + if (ret <= 0){
indentation
> + DBUG_PRINT("vio", ("vio_io_wait: error return: %d. Last error: %d",
> + ret, socket_errno));
> DBUG_RETURN(ret);
> + }
>
> /* The requested I/O event is ready? */
> switch (event)
> diff --git a/sql/sql_connect.cc b/sql/sql_connect.cc
> index 9beb3cca8f8..1c8975cfb23 100644
> --- a/sql/sql_connect.cc
> +++ b/sql/sql_connect.cc
> @@ -23,7 +23,53 @@
> #include "mariadb.h"
> #include "mysqld.h"
> #include "sql_priv.h"
> -#ifndef _WIN32
> +#ifdef _WIN32
> +#ifdef _WIN32_WINNT
> +#undef _WIN32_WINNT
> +#endif
> +
> +#define _WIN32_WINNT 0x0500
> +#include "windows.h" // QueueUserAPC2
> +#define WINDOWS_KERNEL32_DLLNAME_W "kernel32"
> +
> +// https://github.com/dotnet/runtime/pull/55649/files
> +
> +// These declarations are for a new special user-mode APC feature introduced in Windows. These are not yet available in Windows
> +// SDK headers, so some names below are prefixed with "CLONE_" to avoid conflicts in the future. Once the prefixed declarations
> +// become available in the Windows SDK headers, the prefixed declarations below can be removed in favor of the SDK ones.
> +
> +//enum CLONE_QUEUE_USER_APC_FLAGS
> +//{
> +// CLONE_QUEUE_USER_APC_FLAGS_NONE = 0x0,
> +// CLONE_QUEUE_USER_APC_FLAGS_SPECIAL_USER_APC = 0x1,
> +//
> +// CLONE_QUEUE_USER_APC_CALLBACK_DATA_CONTEXT = 0x10000
> +//};
> +//typedef BOOL (WINAPI *QueueUserAPC2Proc)(PAPCFUNC ApcRoutine, HANDLE Thread, ULONG_PTR Data, CLONE_QUEUE_USER_APC_FLAGS Flags);
> +//void c()
> +//{
> +// HMODULE hKernel32 = LoadLibraryExA(WINDOWS_KERNEL32_DLLNAME_W, NULL, LOAD_LIBRARY_SEARCH_SYSTEM32);
> +//
> +// // See if QueueUserAPC2 exists
> +// QueueUserAPC2Proc pfnQueueUserAPC2Proc = (QueueUserAPC2Proc)GetProcAddress(hKernel32, "QueueUserAPC2");
> +// if (pfnQueueUserAPC2Proc == nullptr)
> +// {
> +// abort();
> +// }
> +//
> +// // See if QueueUserAPC2 supports the special user-mode APC with a callback that includes the interrupted CONTEXT. A special
> +// // user-mode APC can interrupt a thread that is in user mode and not in a non-alertable wait.
> +// if (!(*pfnQueueUserAPC2Proc)(EmptyApcCallback, GetCurrentThread(), 0, SpecialUserModeApcWithContextFlags))
> +// {
> +// return;
> +// }
> +//
> +// return pfnQueueUserAPC2Proc;
> +//}
> +//
> +//static QueueUserAPC2Proc pfnQueueUserAPC2 = InitializeSpecialUserModeApc();
what's that big commented block for?
and win-something defines/includes - still needed?
> +
> +#else
> #include <netdb.h> // getservbyname, servent
> #endif
> #include "sql_audit.h"
> @@ -1353,6 +1399,107 @@ bool thd_is_connection_alive(THD *thd)
> return FALSE;
> }
>
> +#if !defined(_WIN32)
> +static void self_pipe_write();
> +#if !defined(_GNU_SOURCE)
> +class Thread_apc_context;
> +static thread_local Thread_apc_context *_THR_APC_CTX= NULL;
may be it should be part of THD (rather, Apc_target) or mysys_var?
that is, accessible as
current_thd->...->self_pipe
or
my_thread_var->self_pipe
the first one (in Apc_target) seems quite logical
> +#endif
> +
> +class Thread_apc_context
> +{
> +public:
> +#ifndef _GNU_SOURCE
> + my_socket self_pipe[2]{};
> +#endif
> +
> + bool setup_thread_apc()
> + {
> +#if defined(_GNU_SOURCE)
> + struct sigaction act {};
> + act.sa_handler= [](int) -> void {self_pipe_write();};
why? it doesn't do anything, as far as I can see
> + act.sa_flags= 0;
> + int ret= sigaction(SIG_APC_NOTIFY, &act, NULL);
> + DBUG_ASSERT(ret == 0);
> +
> + sigset_t signals;
> + ret|= sigemptyset(&signals);
> + DBUG_ASSERT(ret == 0);
> + ret|= sigaddset(&signals, SIG_APC_NOTIFY);
> + DBUG_ASSERT(ret == 0);
> +
> + ret|= pthread_sigmask(SIG_BLOCK, &signals, NULL);
> + DBUG_ASSERT(ret == 0);
> +#else
> + // Self-pipe trick. Create a new pipe and store it thread-locally
> + // It can be accessed from Vio later. See also vio_io_wait()
> + // From the other end, it is accessed through a threadlocal
> + // _THR_APC_CTX, from a SIG_APC_NOTIFY signal handler.
> + int ret= pipe(self_pipe);
> +
> + struct sigaction act {};
> + act.sa_handler= [](int) -> void { self_pipe_write(); };
> + act.sa_flags= 0;
> + ret|= sigaction(SIG_APC_NOTIFY, &act, NULL);
why? you send a signal and convert it to a self-pipe write.
why not to write to the pipe in the first place?
like
Apc_target::wake()
which will send a signal or write to a pipe, as appropriate.
And TP_pool::wake() will call thd->apc_target->wake();
> + return ret == 0;
> +#endif
> + return true;
> + }
> + bool inited;
> + Thread_apc_context()
> + {
> + inited = setup_thread_apc();
> +#ifndef _GNU_SOURCE
> + if (inited)
> + _THR_APC_CTX= this;
> +#endif
> + }
> +#ifndef _GNU_SOURCE
> + ~Thread_apc_context()
> + {
> + _THR_APC_CTX= NULL;
> + closesocket(self_pipe[0]);
> + closesocket(self_pipe[1]);
> + }
> +#endif
> +};
> +
> +#ifdef _GNU_SOURCE
> +static void self_pipe_write()
> +{
> + // No self-pipe is actually used. Instead, ppoll is used to wake up on signal.
> +}
> +#else
> +
> +my_socket threadlocal_get_self_pipe()
> +{
> + return _THR_APC_CTX ? _THR_APC_CTX->self_pipe[0] : 0;
> +}
> +
> +static void self_pipe_write()
> +{
> + DBUG_ASSERT(IF_WIN(0, pthread_self() == select_thread) || _THR_APC_CTX);
> + if (!_THR_APC_CTX)
> + return; // SIGUSR1 is used in thr_alarm, which can wake up main thread.
> + char buf[4]{};
> + send(_THR_APC_CTX->self_pipe[1], buf, sizeof buf, 0);
> +}
> +#endif
> +#endif /* !_WIN32 */
> +
> +bool thread_per_connection_notify_apc(THD *thd)
> +{
> +#ifdef WIN32
> + auto vio= thd->net.vio;
> + HANDLE h=
> + (vio->type == VIO_TYPE_NAMEDPIPE) ? vio->hPipe : (HANDLE)vio->mysql_socket.fd;
> + return CancelIoEx(h, NULL);
> +#else
> + bool ret = pthread_kill(thd->mysys_var->pthread_self, SIG_APC_NOTIFY) == 0;
> + DBUG_ASSERT(ret || errno == EAGAIN);
> + return ret;
> +#endif
> +}
1. I'd prefer APC framework implementation to stay in my_apc.cc. not
being spread over three (?) different files.
2. how is it different from ::wake() ?
>
> void do_handle_one_connection(CONNECT *connect, bool put_in_cache)
> {
> diff --git a/sql/threadpool_common.cc b/sql/threadpool_common.cc
> index f72f46b3a6b..8f24bc9e4f7 100644
> --- a/sql/threadpool_common.cc
> +++ b/sql/threadpool_common.cc
> @@ -320,6 +346,10 @@ static void threadpool_remove_connection(THD *thd)
> end_connection(thd);
> close_connection(thd, 0);
> unlink_thd(thd);
> + // The rest of APC requests should be processed after unlinking.
> + // This guarantees that no new APC requests can be added.
> + // TODO: better notify the requestor with some KILLED state here.
> + thd->apc_target.process_apc_requests();
not sure it guarantees much. One can find the thd before the unlink
and add a new request after the unlink. Or is that impossible?
> PSI_CALL_delete_current_thread(); // before THD is destroyed
> delete thd;
>
> diff --git a/sql/my_apc.cc b/sql/my_apc.cc
> index 9777deb399a..a89d9c8bc19 100644
> --- a/sql/my_apc.cc
> +++ b/sql/my_apc.cc
> @@ -47,11 +47,12 @@ void Apc_target::init(mysql_mutex_t *target_mutex)
> void Apc_target::enqueue_request(Call_request *qe)
> {
> mysql_mutex_assert_owner(LOCK_thd_kill_ptr);
> - if (apc_calls)
> + Call_request *old_apc_calls= apc_calls;
eh, what? you modify apc_calls without a lock somewhere?
> + if (old_apc_calls)
> {
> - Call_request *after= apc_calls->prev;
> - qe->next= apc_calls;
> - apc_calls->prev= qe;
> + Call_request *after= old_apc_calls->prev;
> + qe->next= old_apc_calls;
> + old_apc_calls->prev= qe;
>
> qe->prev= after;
> after->next= qe;
> @@ -100,89 +109,118 @@ void init_show_explain_psi_keys(void)
> if (PSI_server == NULL)
> return;
>
> - PSI_server->register_cond("sql", show_explain_psi_conds,
> - array_elements(show_explain_psi_conds));
> + PSI_server->register_cond("sql", apc_request_psi_conds,
> + array_elements(apc_request_psi_conds));
> + PSI_server->register_mutex("sql", apc_request_psi_locks,
> + array_elements(apc_request_psi_locks));
> }
> #endif
>
> -
> -/*
> - Make an APC (Async Procedure Call) to another thread.
> -
> - @detail
> - Make an APC call: schedule it for execution and wait until the target
> - thread has executed it.
> -
> - - The caller is responsible for making sure he's not posting request
> - to the thread he's calling this function from.
> -
> - - The caller must have locked target_mutex. The function will release it.
> -
> - @retval FALSE - Ok, the call has been made
> - @retval TRUE - Call wasnt made (either the target is in disabled state or
> - timeout occurred)
> -*/
> -
> -bool Apc_target::make_apc_call(THD *caller_thd, Apc_call *call,
> - int timeout_sec, bool *timed_out)
> +/**
> + Wait gracefully until the request is completed.
> + @retval 0 -- Success
> + @retval 1 -- Timeout
> + */
> +int Apc_target::wait_for_completion(THD *caller_thd, Call_request *apc_request,
> + int timeout_sec)
> {
> - bool res= TRUE;
> - *timed_out= FALSE;
> -
> - if (enabled)
> - {
> - /* Create and post the request */
> - Call_request apc_request;
> - apc_request.call= call;
> - apc_request.processed= FALSE;
> - mysql_cond_init(key_show_explain_request_COND, &apc_request.COND_request,
> - NULL);
> - enqueue_request(&apc_request);
> - apc_request.what="enqueued by make_apc_call";
> -
> struct timespec abstime;
> const int timeout= timeout_sec;
> set_timespec(abstime, timeout);
>
> + DBUG_EXECUTE_IF("apc_timeout", set_timespec_nsec(abstime, 1000000););
> + int res = 1;
> int wait_res= 0;
> PSI_stage_info old_stage;
> - caller_thd->ENTER_COND(&apc_request.COND_request, LOCK_thd_kill_ptr,
> +
> + mysql_mutex_unlock(LOCK_thd_kill_ptr);
I don't understand how this works. After you release LOCK_thd_kill_ptr,
the THD might be destroyed. How do you detect that? By the wait timing out?
Who will destroy the Call_request?
> +
> + mysql_mutex_lock(&apc_request->LOCK_request);
> + caller_thd->ENTER_COND(&apc_request->COND_request, &apc_request->LOCK_request,
> &stage_show_explain, &old_stage);
> /* todo: how about processing other errors here? */
> - while (!apc_request.processed && (wait_res != ETIMEDOUT))
> + while (!apc_request->processed && (wait_res != ETIMEDOUT))
> {
> - /* We own LOCK_thd_kill_ptr */
> - wait_res= mysql_cond_timedwait(&apc_request.COND_request,
> - LOCK_thd_kill_ptr, &abstime);
> - // &apc_request.LOCK_request, &abstime);
> + wait_res= mysql_cond_timedwait(&apc_request->COND_request,
> + &apc_request->LOCK_request, &abstime);
> if (caller_thd->killed)
> break;
> +
> + if (caller_thd->apc_target.have_apc_requests())
> + {
> + mysql_mutex_unlock(&apc_request->LOCK_request);
> + caller_thd->apc_target.process_apc_requests();
> + mysql_mutex_lock(&apc_request->LOCK_request);
> + }
> }
>
> - if (!apc_request.processed)
> + if (!apc_request->processed)
> {
> /*
> The wait has timed out, or this thread was KILLed.
> - Remove the request from the queue (ok to do because we own
> - LOCK_thd_kill_ptr)
> + We can't delete it from the queue, because LOCK_thd_kill_ptr is already
> + released. It can't be reacquired because of the ordering with
> + apc_request->LOCK_request.
> + However, apc_request->processed is guarded by this lock.
> + Set processed= TRUE and transfer the ownership to the processor thread.
> + It should free the resources itself.
> + apc_request cannot be referred after unlock anymore in this case.
> */
> - apc_request.processed= TRUE;
> - dequeue_request(&apc_request);
> - *timed_out= TRUE;
> - res= TRUE;
> + apc_request->processed= TRUE;
> + res= 1;
> }
> else
> {
> /* Request was successfully executed and dequeued by the target thread */
> - res= FALSE;
> + res= 0;
> }
> - /*
> - exit_cond() will call mysql_mutex_unlock(LOCK_thd_kill_ptr) for us:
> - */
> +
> + /* EXIT_COND() will call mysql_mutex_unlock(LOCK_request) for us */
> caller_thd->EXIT_COND(&old_stage);
>
> - /* Destroy all APC request data */
> - mysql_cond_destroy(&apc_request.COND_request);
> + return res;
> +}
> +
> +/** Create and post the request */
> +void Apc_target::enqueue_request(Call_request *request_buff, Apc_call *call)
> +{
> + request_buff->call= call;
> + request_buff->processed= FALSE;
> + enqueue_request(request_buff);
> + request_buff->what="enqueued by make_apc_call";
> +}
> +
> +/**
> + Make an APC (Async Procedure Call) to another thread.
> +
> + @detail
> + Make an APC call: schedule it for execution and wait until the target
> + thread has executed it.
> +
> + - The caller is responsible for making sure he's not posting request
> + to the thread he's calling this function from.
> +
> + - The caller must have locked target_mutex. The function will release it.
> +
> + @retval FALSE - Ok, the call has been made
> + @retval TRUE - Call wasnt made (either the target is in disabled state or
> + timeout occurred)
> +*/
> +
> +bool Apc_target::make_apc_call(THD *caller_thd, Apc_call *call,
> + int timeout_sec, bool *timed_out)
> +{
> + bool res= TRUE;
> + *timed_out= FALSE;
> +
> + if (enabled)
> + {
> + Call_request *apc_request= new Call_request;
> + enqueue_request(apc_request, call);
> + res= wait_for_completion(caller_thd, apc_request, timeout_sec);
> + *timed_out= res;
> + if (!*timed_out)
> + delete apc_request;
> }
> else
> {
> diff --git a/storage/perfschema/pfs_variable.cc b/storage/perfschema/pfs_variable.cc
> index 17b7dfc200c..698ff1eca6c 100644
> --- a/storage/perfschema/pfs_variable.cc
> +++ b/storage/perfschema/pfs_variable.cc
> @@ -164,7 +167,11 @@ int PFS_system_variable_cache::do_materialize_global(void)
> during materialization.
> */
> if (!m_external_init)
> + {
> + mysql_mutex_lock(&LOCK_plugin);
> init_show_var_array(OPT_GLOBAL, true);
> + mysql_mutex_unlock(&LOCK_plugin);
1. why not move the locking inside init_show_var_array()?
2. init_show_var_array() locks LOCK_system_variables_hash - is that
not enough?
> + }
>
> /* Resolve the value for each SHOW_VAR in the array, add to cache. */
> for (SHOW_VAR *show_var= m_show_var_array.front();
> @@ -230,37 +236,26 @@ int PFS_system_variable_cache::do_materialize_all(THD *unsafe_thd)
> m_materialized= false;
> m_cache.clear();
>
> - /* Block plugins from unloading. */
> - mysql_mutex_lock(&LOCK_plugin_delete);
also please remove all references to LOCK_plugin_delete
> -
> /*
> Build array of SHOW_VARs from system variable hash. Do this within
> LOCK_plugin_delete to ensure that the hash table remains unchanged
> while this thread is materialized.
> */
> if (!m_external_init)
> + {
> + mysql_mutex_lock(&LOCK_plugin);
> init_show_var_array(OPT_SESSION, false);
> + mysql_mutex_unlock(&LOCK_plugin);
> + }
>
> /* Get and lock a validated THD from the thread manager. */
> if ((m_safe_thd= get_THD(unsafe_thd)) != NULL)
> {
> DEBUG_SYNC(m_current_thd, "materialize_session_variable_array_THD_locked");
> - for (SHOW_VAR *show_var= m_show_var_array.front();
> - show_var->value && (show_var != m_show_var_array.end()); show_var++)
> - {
> - /* Resolve value, convert to text, add to cache. */
> - System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
> - m_cache.push(system_var);
> - }
> -
> - /* Release lock taken in get_THD(). */
> - mysql_mutex_unlock(&m_safe_thd->LOCK_thd_data);
> + ret= make_call(&PFS_system_variable_cache::refresh_vars, 1);
>
> m_materialized= true;
> - ret= 0;
> }
> -
> - mysql_mutex_unlock(&LOCK_plugin_delete);
> return ret;
> }
>
> @@ -312,6 +307,105 @@ void PFS_system_variable_cache::free_mem_root(void)
> }
> }
> }
> +class PFS_system_variable_cache_apc: public Apc_target::Apc_call
> +{
> +public:
> + typedef PFS_system_variable_cache::Request_func Request;
> + PFS_system_variable_cache_apc(PFS_system_variable_cache *pfs, Request func,
> + uint param)
> + : m_pfs(pfs), m_func(func), m_param(param) {}
> +private:
> + PFS_system_variable_cache *m_pfs;
> + Request m_func;
> + uint m_param;
> +
> + void call_in_target_thread() override
> + {
> + call(m_pfs, m_func, m_param);
> + }
> +public:
> + static void call(PFS_system_variable_cache *pfs, Request func, uint param)
> + {
> + THD *safe_thd= pfs->safe_thd();
> +
> + DBUG_ASSERT(pfs->query_scope() == OPT_SESSION);
> +
> + mysql_mutex_lock(&LOCK_global_system_variables);
> + if (!safe_thd->variables.dynamic_variables_ptr ||
> + global_system_variables.dynamic_variables_head >
> + safe_thd->variables.dynamic_variables_head)
> + {
> + mysql_prlock_rdlock(&LOCK_system_variables_hash);
> + sync_dynamic_session_variables(safe_thd, false);
> + mysql_prlock_unlock(&LOCK_system_variables_hash);
> + }
> + mysql_mutex_unlock(&LOCK_global_system_variables);
> +
> + (pfs->*func)(param);
> + }
> +};
> +
> +void PFS_system_variable_cache::refresh_vars(uint all)
> +{
> + for (SHOW_VAR *show_var= m_show_var_array.front();
> + show_var->value && (show_var != m_show_var_array.end()); show_var++)
> + {
> + sys_var *value= (sys_var *)show_var->value;
> +
> + /* Match the system variable scope to the target scope. */
> + if (all || match_scope(value->scope()))
> + {
> + /* Resolve value, convert to text, add to cache. */
> + System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
> + m_cache.push(system_var);
> + }
> + }
> +}
> +void PFS_system_variable_cache::refresh_one_var(uint index)
> +{
> + SHOW_VAR *show_var= &m_show_var_array.at(index);
> +
> + if (show_var && show_var->value &&
> + (show_var != m_show_var_array.end()))
> + {
> + sys_var *value= (sys_var *)show_var->value;
> +
> + /* Match the system variable scope to the target scope. */
> + if (match_scope(value->scope()))
> + {
> + /* Resolve value, convert to text, add to cache. */
> + System_variable system_var(m_safe_thd, show_var, m_query_scope, false);
> + m_cache.push(system_var);
> + }
> + }
> +}
> +
> +
> +#define MAKE_CALL_MAX_RETRIES 3
unused
> +
> +int PFS_system_variable_cache::make_call(Request_func func, uint param)
> +{
> + int ret= 0;
> + THD *requestor_thd= m_current_thd;
> + if (requestor_thd == m_safe_thd)
> + {
> + mysql_mutex_unlock(&m_safe_thd->LOCK_thd_kill);
> + PFS_system_variable_cache_apc::call(this, func, param);
> + }
> + else
> + {
> + PFS_system_variable_cache_apc apc_call(this, func, param);
> + auto *request= new Apc_target::Call_request;
> + m_safe_thd->apc_target.enqueue_request(request, &apc_call);
> + m_safe_thd->abort_current_cond_wait(false, false);
> + m_safe_thd->scheduler->notify_apc(m_safe_thd);
> + DEBUG_SYNC(requestor_thd, "apc_after_notify");
> + ret= m_safe_thd->apc_target.wait_for_completion(requestor_thd, request, 10);
> + if (ret == 0)
> + delete request;
> + }
> + return ret;
> +}
>
> /**
> Build a SESSION system variable cache for a pfs_thread.
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org
1
0
Re: c46e7c5cb64: MDEV-12252 ROW data type for stored function return values
by Sergei Golubchik 05 Sep '24
by Sergei Golubchik 05 Sep '24
05 Sep '24
Hi, Alexander,
Looks good, thanks. Great commit comment too.
Just a few questions and test suggestions - see below.
Regards,
Sergei
Chief Architect, MariaDB Server
and security(a)mariadb.org
On Aug 23, Alexander Barkov wrote:
> revision-id: c46e7c5cb64 (mariadb-11.4.1-5-gc46e7c5cb64)
> parent(s): ae6684d79f4
> author: Alexander Barkov
> committer: Alexander Barkov
> timestamp: 2024-02-20 13:49:54 +0400
> message:
>
> MDEV-12252 ROW data type for stored function return values
> diff --git a/mysql-test/main/sp-anchor-row-type-table.test b/mysql-test/main/sp-anchor-row-type-table.test
> index 7123f9160db..2fbcfbffc1b 100644
> --- a/mysql-test/main/sp-anchor-row-type-table.test
> +++ b/mysql-test/main/sp-anchor-row-type-table.test
> @@ -939,3 +939,70 @@ DROP TABLE t1;
> --echo #
> --echo # End of 10.4 tests
> --echo #
> +
> +--echo #
> +--echo # Start of 11.3 tests
> +--echo #
> +
> +--echo #
> +--echo # MDEV-12252 ROW data type for stored function return values
> +--echo #
> +
> +--error ER_PARSE_ERROR
> +CREATE FUNCTION f1() RETURNS ROW TYPE OF step1.step2.step3 RETURN ROW(1,2);
> +
> +CREATE FUNCTION f1() RETURNS ROW TYPE OF t1 RETURN ROW(1,2);
you don't resolve t1 at CREATE FUNCTION time. why?
is it how TYPE OF works in DECLARE too?
> +SHOW CREATE FUNCTION f1;
> +--error ER_NO_SUCH_TABLE
> +SELECT f1();
> +DROP FUNCTION f1;
> +
> +CREATE FUNCTION f1() RETURNS ROW TYPE OF test.t1 RETURN ROW(1,2);
> +SHOW CREATE FUNCTION f1;
> +--error ER_NO_SUCH_TABLE
> +SELECT f1();
> +DROP FUNCTION f1;
> +
> +
> +CREATE TABLE t1 (a INT, b VARCHAR(32));
> +CREATE FUNCTION f1() RETURNS ROW TYPE OF test.t1 RETURN (SELECT * FROM t1);
> +SHOW CREATE FUNCTION f1;
> +DELIMITER $$;
> +CREATE PROCEDURE p1()
> +BEGIN
> + DECLARE r ROW TYPE OF t1 DEFAULT f1();
> + SELECT r.a, r.b;
> +END;
> +$$
> +DELIMITER ;$$
> +
> +# Testing with no rows
> +CALL p1();
> +--error ER_OPERAND_COLUMNS
> +SELECT f1();
> +SELECT f1()=ROW(1,'b1') AS c;
> +SELECT f1()=ROW(NULL,NULL) AS c;
> +
> +# Testing with one row
> +INSERT INTO t1 VALUES (1,'b1');
> +CALL p1();
> +--error ER_OPERAND_COLUMNS
> +SELECT f1();
> +SELECT f1()=ROW(1,'b1') AS c;
> +SELECT f1()=ROW(1,'') AS c;
> +SELECT f1()=ROW(2,'b1') AS c;
> +SELECT f1()=ROW(1,NULL) AS c;
> +SELECT f1()=ROW(NULL,'b1') AS c;
do one more SHOW CREATE FUNCTION here
> +
> +# Testing with two rows
> +INSERT INTO t1 VALUES (2,'b2');
> +--error ER_SUBQUERY_NO_1_ROW
> +CALL p1();
> +--error ER_OPERAND_COLUMNS
> +SELECT f1();
> +--error ER_SUBQUERY_NO_1_ROW
> +SELECT f1()=ROW(1,'b1') AS c;
> +
> +DROP PROCEDURE p1;
> +DROP FUNCTION f1;
> +DROP TABLE t1;
> diff --git a/mysql-test/main/sp-anchor-type.test b/mysql-test/main/sp-anchor-type.test
> index 0e24ef900d8..fed53fd011a 100644
> --- a/mysql-test/main/sp-anchor-type.test
> +++ b/mysql-test/main/sp-anchor-type.test
> @@ -767,3 +767,78 @@ BEGIN NOT ATOMIC
> END;
> $$
> DELIMITER ;$$
> +
> +
> +--echo #
> +--echo # Start of 11.3 tests
> +--echo #
> +
> +--echo #
> +--echo # MDEV-12252 ROW data type for stored function return values
> +--echo #
> +
> +--error ER_PARSE_ERROR
> +CREATE FUNCTION f1() RETURNS TYPE OF a RETURN 1;
> +
> +CREATE FUNCTION f1() RETURNS TYPE OF t1.a RETURN (SELECT min(a) FROM t1);
> +SHOW CREATE FUNCTION f1;
> +--error ER_NO_SUCH_TABLE
> +SELECT f1();
> +CREATE TABLE t1 (b INT);
> +--error ER_BAD_FIELD_ERROR
> +SELECT f1();
> +DROP TABLE t1;
> +DROP FUNCTION f1;
> +
> +CREATE DATABASE db1;
> +DELIMITER $$;
> +CREATE FUNCTION db1.f1() RETURNS TYPE OF db1.t1.a
> +BEGIN
> + RETURN (SELECT min(a) FROM t1);
> +END;
> +$$
> +DELIMITER ;$$
> +SHOW CREATE FUNCTION db1.f1;
> +--error ER_NO_SUCH_TABLE
> +SELECT db1.f1();
> +CREATE TABLE db1.t1 (b TIME);
> +--error ER_BAD_FIELD_ERROR
> +SELECT db1.f1();
> +DROP TABLE db1.t1;
> +CREATE TABLE db1.t1 (a TIME);
> +SELECT db1.f1();
> +INSERT INTO db1.t1 VALUES ('10:20:30');
> +SELECT db1.f1();
> +DROP TABLE db1.t1;
> +DROP FUNCTION db1.f1;
> +DROP DATABASE db1;
> +
> +CREATE TABLE t1 (a TIME);
> +CREATE FUNCTION f1() RETURNS TYPE OF test.t1.a RETURN (SELECT min(a) FROM t1);
> +SHOW CREATE FUNCTION f1;
> +SELECT f1();
> +DROP FUNCTION f1;
> +DROP TABLE t1;
> +
> +CREATE TABLE t1 (a TIME);
> +DELIMITER $$;
> +CREATE FUNCTION f1() RETURNS TYPE OF t1.a
> +BEGIN
> + RETURN (SELECT min(a) FROM t1);
> +END;
> +$$
> +DELIMITER ;$$
> +SHOW CREATE FUNCTION f1;
> +SELECT f1();
> +CREATE TABLE t2 AS SELECT f1();
> +SHOW CREATE TABLE t2;
> +SELECT * FROM t2;
> +DROP TABLE t2;
> +INSERT INTO t1 VALUES ('10:20:30');
> +SELECT f1();
> +CREATE TABLE t2 AS SELECT f1();
> +SHOW CREATE TABLE t2;
> +SELECT * FROM t2;
> +DROP TABLE t2;
please, also try here
DROP TABLE t1;
CREATE TABLE t1 (a INT);
INSERT INTO t1 VALUES (10);
CREATE TABLE t2 AS SELECT f1();
the point is to change TYPE OF t1.a
after the function has been successfully called,
and thus is already in the sp cache.
> +DROP FUNCTION f1;
> +DROP TABLE t1;
> diff --git a/mysql-test/suite/compat/oracle/t/sp-package.test b/mysql-test/suite/compat/oracle/t/sp-package.test
> index aefede41c8b..7c231d736c5 100644
> --- a/mysql-test/suite/compat/oracle/t/sp-package.test
> +++ b/mysql-test/suite/compat/oracle/t/sp-package.test
> @@ -3109,3 +3109,79 @@ DROP TABLE t1;
> DROP FUNCTION f1_deterministic;
> DROP FUNCTION f2_not_deterministic;
> DROP PACKAGE pkg1;
> +
> +--echo #
> +--echo # MDEV-12252 ROW data type for stored function return values
> +--echo #
> +
> +--echo #
> +--echo # Testing fixed ROW type with package routines
> +--echo #
> +
> +DELIMITER $$;
> +CREATE PACKAGE pkg
> +AS
> + FUNCTION f1() RETURN ROW(a INT, b VARCHAR(32));
> + PROCEDURE p1(r ROW(a INT, b VARCHAR(32)));
> + PROCEDURE p2();
> +END;
> +$$
> +CREATE PACKAGE BODY pkg
> +AS
> + FUNCTION f1() RETURN ROW(a INT, b VARCHAR(32)) AS
> + BEGIN
> + RETURN ROW(1,'b1');
> + END;
> + PROCEDURE p1(r ROW(a INT, b VARCHAR(32))) AS
> + BEGIN
> + SELECT r.a, r.b;
> + END;
> + PROCEDURE p2() AS
> + BEGIN
> + CALL p1(f1());
> + END;
> +END;
> +$$
> +DELIMITER ;$$
> +CALL pkg.p1(pkg.f1());
> +CALL pkg.p2;
> +DROP PACKAGE pkg;
> +
> +
> +--echo #
> +--echo # Testing table%ROWTYPE with package routines
> +--echo #
now you can also add package tests outside of oracle mode
> +
> +CREATE TABLE t1 (a INT, b VARCHAR(32));
> +INSERT INTO t1 VALUES (1,'b1');
> +DELIMITER /;
> +CREATE PACKAGE pkg
> +AS
> + FUNCTION f1 RETURN t1%ROWTYPE;
> + PROCEDURE p1(r t1%ROWTYPE);
> + PROCEDURE p2;
> +END;
> +/
> +CREATE PACKAGE BODY pkg
> +AS
> + FUNCTION f1 RETURN t1%ROWTYPE AS
> + r t1%ROWTYPE;
> + BEGIN
> + SELECT * INTO r FROM t1;
> + RETURN r;
> + END;
> + PROCEDURE p1(r t1%ROWTYPE) AS
> + BEGIN
> + SELECT r.a || ' ' || r.b;
> + END;
> + PROCEDURE p2 AS
> + BEGIN
> + p1(f1());
> + END;
> +END;
> +/
> +DELIMITER ;/
> +CALL pkg.p1(pkg.f1());
> +CALL pkg.p2;
> +DROP PACKAGE pkg;
> +DROP TABLE t1;
> diff --git a/sql/field.h b/sql/field.h
> index 7f1c243a180..7af069a9735 100644
> --- a/sql/field.h
> +++ b/sql/field.h
> @@ -1373,6 +1374,20 @@ class Field: public Value_source
> in str and restore it with set() if needed
> */
> virtual void sql_type(String &str) const =0;
> + void sql_type_for_sp_returns(String &str) const
This is oddly specific. Strange that we don't need to print the field
type with charset/collation anywhere else, but for sp returns we do.
> + {
> + sql_type(str);
> + if (has_charset())
> + {
> + str.append(STRING_WITH_LEN(" CHARSET "));
> + str.append(charset()->cs_name);
> + if (Charset(charset()).can_have_collate_clause())
> + {
> + str.append(STRING_WITH_LEN(" COLLATE "));
> + str.append(charset()->coll_name);
> + }
> + }
> + }
> virtual void sql_rpl_type(String *str) const { sql_type(*str); }
> virtual uint size_of() const =0; // For new field
> inline bool is_null(my_ptrdiff_t row_offset= 0) const
> diff --git a/sql/field.cc b/sql/field.cc
> index 35be045620d..23b050de221 100644
> --- a/sql/field.cc
> +++ b/sql/field.cc
> @@ -2768,6 +2768,32 @@ bool Field_row::sp_prepare_and_store_item(THD *thd, Item **value)
> }
>
>
> +uint Field_row::cols() const
> +{
> + // The table with ROW members must be already instantiated
> + DBUG_ASSERT(m_table);
> + return m_table->s->fields;
> +}
> +
> +
> +void Field_row::sql_type(String &res) const
would be more logical to call this Field_row::sql_type_for_sp_returns()
> +{
> + res.set_ascii(STRING_WITH_LEN("row("));
> + for (uint i= 0; i < m_table->s->fields; i++)
> + {
> + if (i > 0)
> + res.append(',');
> + Field *field= m_table->field[i];
> + res.append(field->field_name);
> + res.append(' ');
> + StringBuffer<64> col;
> + field->sql_type_for_sp_returns(col);
> + res.append(col.to_lex_cstring());
> + }
> + res.append(')');
> +}
> +
> +
> /****************************************************************************
> Functions for the Field_decimal class
> This is an number stored as a pre-space (or pre-zero) string
> diff --git a/sql/item.cc b/sql/item.cc
> index 54ad511cb6b..0a125752979 100644
> --- a/sql/item.cc
> +++ b/sql/item.cc
> @@ -3001,10 +3003,31 @@ Item_sp::init_result_field(THD *thd, uint max_length, uint maybe_null,
> dummy_table->s->table_name= empty_clex_str;
> dummy_table->maybe_null= maybe_null;
>
> + if (m_sp->m_return_field_def.is_column_type_ref() &&
> + m_sp->m_return_field_def.column_type_ref()->
> + resolve_type_ref(thd, &m_sp->m_return_field_def))
> + DBUG_RETURN(TRUE);
Here you overwrite old m_return_field_def value which was a column_type_ref
with its actual type, don't you?
What will happen if you change the column's type
after the sp was already executed once?
> +
> if (!(sp_result_field= m_sp->create_result_field(max_length, name,
> dummy_table)))
> DBUG_RETURN(TRUE);
>
> + /*
> + In case of a ROW return type we need to create Item_field_row
> + on top of Field_row, and remember it in sp_result_item_field_row.
> + ROW members are later accessed using sp_result_item_field_row->addr(i),
> + e.g. when copying the function return value to a local variable.
> + For scalar return types no Item_field is needed around sp_result_field,
> + as the value is fetched directly from sp_result_field,
> + inside Item_func_sp::val_xxx() methods.
Sorry, I don't understand that. Why cannot you use Field_row here?
> + */
> + if (Field_row *field_row= dynamic_cast<Field_row*>(sp_result_field))
> + {
> + if (!(sp_result_item_field_row=
> + m_sp->m_return_field_def.make_item_field_row(thd, field_row)))
> + DBUG_RETURN(true);
> + }
> +
> if (sp_result_field->pack_length() > sizeof(result_buf))
> {
> void *tmp;
> diff --git a/sql/sp.cc b/sql/sp.cc
> index ddaeeb8cb7a..7eab8304c31 100644
> --- a/sql/sp.cc
> +++ b/sql/sp.cc
> @@ -1091,18 +1136,19 @@ sp_returns_type(THD *thd, String &result, const sp_head *sp)
> bzero((char*) &share, sizeof(share));
> table.in_use= thd;
> table.s = &share;
> - field= sp->create_result_field(0, 0, &table);
> - field->sql_type(result);
> + field= create_result_field(0, 0, &table);
>
> - if (field->has_charset())
> + if (m_return_field_def.is_row())
> {
> - result.append(STRING_WITH_LEN(" CHARSET "));
> - result.append(field->charset()->cs_name);
> - if (Charset(field->charset()).can_have_collate_clause())
> - {
> - result.append(STRING_WITH_LEN(" COLLATE "));
> - result.append(field->charset()->coll_name);
> - }
> + Field_row *field_row= dynamic_cast<Field_row*>(field);
> + if (!field_row->row_create_fields(
> + thd, m_return_field_def.row_field_definitions()))
> + field->sql_type(result);
when is this used? you're creating a TABLE and Field's -
only to print types and charsets. This might be tolerable, but only if it's
not needed often
> + }
> + else
> + {
> + DBUG_ASSERT(m_return_field_def.type_handler()->is_scalar_type());
> + field->sql_type_for_sp_returns(result);
> }
>
> delete field;
> diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
> index 5977e309ae4..f11e4f89466 100644
> --- a/sql/sql_yacc.yy
> +++ b/sql/sql_yacc.yy
> @@ -18347,7 +18338,50 @@ sp_parameters:
> ;
>
> sf_returned_type_clause:
> - RETURNS_SYM sf_return_type
> + RETURNS_SYM
> + {
> + LEX *lex= Lex;
you still do this lex=Lex? :)
it became obsolete, like, 15 years ago. If not more.
> + lex->init_last_field(&lex->sphead->m_return_field_def,
> + &empty_clex_str);
> + }
> + sf_return_type
> + ;
> +
> +sf_return_type:
> + field_type
> + {
> + if (unlikely(Lex->sf_return_fill_definition($1)))
> + MYSQL_YYABORT;
> + }
> + | ROW_SYM row_type_body
> + {
> + if (Lex->sf_return_fill_definition_row($2))
> + MYSQL_YYABORT;
> + }
> + | ROW_SYM TYPE_SYM OF_SYM ident
> + {
> + if (Lex->sf_return_fill_definition_rowtype_of(
> + Qualified_column_ident(&$4)))
> + MYSQL_YYABORT;
> + }
> + | ROW_SYM TYPE_SYM OF_SYM ident '.' ident
> + {
> + if (Lex->sf_return_fill_definition_rowtype_of(
> + Qualified_column_ident(&$4, &$6)))
> + MYSQL_YYABORT;
> + }
> + | TYPE_SYM OF_SYM ident '.' ident
> + {
> + if (Lex->sf_return_fill_definition_type_of(
> + Qualified_column_ident(&$3, &$5)))
> + MYSQL_YYABORT;
> + }
> + | TYPE_SYM OF_SYM ident '.' ident '.' ident
> + {
> + if (Lex->sf_return_fill_definition_type_of(
> + Qualified_column_ident(thd, &$3, &$5, &$7)))
> + MYSQL_YYABORT;
> + }
> ;
>
> package_implementation_item_declaration:
2
1