Hi, Sujatha!
Note, it's a review of the combined `git diff e5fc78 21a84e`
On Mar 31, Sujatha wrote:
> revision-id: 21a84e23cf3 (mariadb-10.5.2-307-g21a84e23cf3)
> parent(s): 4e6de585ff7
> author: Sujatha <sujatha.sivakumar(a)mariadb.com>
> committer: Sujatha <sujatha.sivakumar(a)mariadb.com>
> timestamp: 2020-11-30 13:22:32 +0530
> message:
>
> MDEV-16437: merge 5.7 P_S replication instrumentation and tables
That was pretty good, thanks!
Just few small comments below, no big changes will be needed.
> diff --git a/mysql-test/suite/multi_source/simple.result b/mysql-test/suite/multi_source/simple.result
> index a66d49e88cb..b57e146feb5 100644
> --- a/mysql-test/suite/multi_source/simple.result
> +++ b/mysql-test/suite/multi_source/simple.result
> @@ -21,7 +21,21 @@ show all slaves status;
> Connection_name Slave_SQL_State Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master Master_SSL_Verify_Server_Cert Last_IO_Errno Last_IO_Error Last_SQL_Errno Last_SQL_Error Replicate_Ignore_Server_Ids Master_Server_Id Master_SSL_Crl Master_SSL_Crlpath Using_Gtid Gtid_IO_Pos Replicate_Do_Domain_Ids Replicate_Ignore_Domain_Ids Parallel_Mode SQL_Delay SQL_Remaining_Delay Slave_SQL_Running_State Slave_DDL_Groups Slave_Non_Transactional_Groups Slave_Transactional_Groups Retried_transactions Max_relay_log_size Executed_log_entries Slave_received_heartbeats Slave_heartbeat_period Gtid_Slave_Pos
> slave1 Slave has read all relay log; waiting for more updates Waiting for master to send event 127.0.0.1 root MYPORT_1 60 master-bin.000001 <read_master_log_pos> mysqld-relay-bin-slave1.000002 <relay_log_pos> master-bin.000001 Yes Yes 0 0 <read_master_log_pos> <relay_log_space1> None 0 No 0 No 0 0 1 No optimistic 0 NULL Slave has read all relay log; waiting for more updates 0 0 0 0 1073741824 7 0 60.000
> slave2 Slave has read all relay log; waiting for more updates Waiting for master to send event 127.0.0.1 root MYPORT_2 60 master-bin.000001 <read_master_log_pos> mysqld-relay-bin-slave2.000002 <relay_log_pos> master-bin.000001 Yes Yes 0 0 <read_master_log_pos> <relay_log_space1> None 0 No 0 No 0 0 2 No optimistic 0 NULL Slave has read all relay log; waiting for more updates 0 0 0 0 1073741824 7 0 60.000
> +#
> +# MDEV:16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +select * from performance_schema.replication_connection_configuration;
> +CONNECTION_NAME HOST PORT USER USING_GTID SSL_ALLOWED SSL_CA_FILE SSL_CA_PATH SSL_CERTIFICATE SSL_CIPHER SSL_KEY SSL_VERIFY_SERVER_CERTIFICATE SSL_CRL_FILE SSL_CRL_PATH CONNECTION_RETRY_INTERVAL CONNECTION_RETRY_COUNT HEARTBEAT_INTERVAL IGNORE_SERVER_IDS REPL_DO_DOMAIN_IDS REPL_IGNORE_DOMAIN_IDS
> +slave2 # # root NO NO NO 60 86400 60.000
> +slave1 # # root NO NO NO 60 86400 60.000
Isn't the host always 127.0.0.1 ? Why to mask it?
Also, may be it'd be easier to read the result, if you print it vertically?
It looks that CONNECTION_RETRY_COUNT is 86400.
And 86400 is clearly a timeout, not a retry count.
> start all slaves;
> +#
> +# MDEV:16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +select * from performance_schema.replication_applier_status_by_coordinator;
> +CONNECTION_NAME THREAD_ID SERVICE_STATE LAST_SEEN_TRANSACTION LAST_ERROR_NUMBER LAST_ERROR_MESSAGE LAST_ERROR_TIMESTAMP LAST_TRANS_RETRY_COUNT
> +slave2 # ON 0 0000-00-00 00:00:00 0
> +slave1 # ON 0 0000-00-00 00:00:00 0
> stop slave 'slave1';
> show slave 'slave1' status;
> Slave_IO_State
> diff --git a/mysql-test/suite/rpl/include/rpl_deadlock.test b/mysql-test/suite/rpl/include/rpl_deadlock.test
> index e9191d5fcd8..bccbe044a36 100644
> --- a/mysql-test/suite/rpl/include/rpl_deadlock.test
> +++ b/mysql-test/suite/rpl/include/rpl_deadlock.test
> @@ -59,6 +59,16 @@ let $status_var_comparsion= >;
> connection slave;
> SELECT COUNT(*) FROM t2;
> COMMIT;
> +
> +--echo
> +--echo # Test that the performance schema coulumn shows > 0 values.
> +--echo
> +
> +--let $assert_text= current number of retries should be more than the value saved before deadlock.
> +--let $assert_cond= [SELECT COUNT_TRANSACTIONS_RETRIES FROM performance_schema.replication_applier_status, COUNT_TRANSACTIONS_RETRIES, 1] > "$slave_retried_transactions"
> +--source include/assert.inc
what's wrong with simple
SELECT COUNT_TRANSACTIONS_RETRIES > $slave_retried_transactions FROM performance_schema.replication_applier_status
?
> +
> +source include/check_slave_is_running.inc;
> sync_with_master;
>
> # Check the data
> diff --git a/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result b/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result
> new file mode 100644
> index 00000000000..4ace84ffac4
> --- /dev/null
> +++ b/mysql-test/suite/rpl/r/rpl_perfschema_connect_config.result
> @@ -0,0 +1,124 @@
> +include/master-slave.inc
> +[connection master]
> +# Asserted this: On master, the table should return an empty set.
> +connection slave;
> +
> +# Verify that SELECT works for every field and produces an output
> +# similar to the corresponding field in SHOW SLAVE STATUS(SSS).
> +
> +include/assert.inc [Value returned by SSS and PS table for Host should be same.]
> +include/assert.inc [Value returned by SSS and PS table for Port should be same.]
> +include/assert.inc [Value returned by SSS and PS table for User should be same.]
> +include/assert.inc [Value returned by SSS and PS table for Using_Gtid should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Allowed should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_CA_File should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_CA_Path should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Certificate should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Cipher should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Key should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Verify_Server_Certificate should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Crl_File should be same.]
> +include/assert.inc [Value returned by SSS and PS table for SSL_Crl_Path should be same.]
> +include/assert.inc [Value returned by SSS and PS table for Connection_Retry_Interval should be same.]
> +include/assert.inc [Value returned by PS table for Connection_Retry_Count should be 10.]
this is just unreadable
> +
> +# Heartbeat_Interval is part of I_S and P_S. We will compare the
> +# two to make sure both match.
...
> diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test
> new file mode 100644
> index 00000000000..132d9912222
> --- /dev/null
> +++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_config.test
> @@ -0,0 +1,96 @@
> +# ==== Purpose ====
> +#
> +# This test script serves as the functionality testing for the table
> +# performance_schema.replication_applier_configuration. Test for ddl and dml
> +# operations is a part of the perfschema suite. The ddl/dml tests are named:
> +# 1) ddl_replication_applier_configuration.test and
> +# 2) dml_replication_applier_configuration.test.
> +#
> +# The follwing scenarios are tested in this script:
> +#
> +# - Verify that output is same as SSS on a fresh slave.
> +# - Verify that the value of this field is correct after STOP SLAVE.
> +# - Verify that, when desired delay is set, the value is shown corectly.
> +# - Verify that the value is preserved after STOP SLAVE.
> +# - Verify that, when desired delay is reset, the value is shown corectly.
> +#
> +# ==== Related Worklog ====
> +#
> +# MDEV-16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +
> +source include/master-slave.inc;
> +source include/have_binlog_format_mixed.inc;
master-slave should be included last
> +
> +let $assert_text= On master, the table should return an empty set.;
> +let $assert_cond= count(*) = 0 from performance_schema.replication_applier_configuration;
> +source include/assert.inc;
again, what's wrong with just
select * from performance_schema.replication_applier_configuration;
or, may be, if you want to be very explicit
select count(*) as 'must be 0' from performance_schema.replication_applier_configuration;
> +
> +--connection slave
> +
> +--echo
> +--echo # Verify that SELECT works and produces an output similar to
> +--echo # the corresponding field in SHOW SLAVE STATUS(SSS) in all scenarios.
> +--echo
> +
> +--echo
> +--echo # Verify that output is same as SSS on a fresh slave.
> +--echo
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
I'll stop commenting on every assert.inc, but please, please, stop overusing them.
> +
> +--echo
> +--echo # Verify that the value of this field is correct after STOP SLAVE.
> +--echo
> +
> +source include/stop_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that, when desired delay is set, the value is shown corectly.
> +--echo
> +
> +eval change master to master_delay= 2;
> +source include/start_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that the value is preserved after STOP SLAVE.
> +--echo
> +
> +source include/stop_slave.inc;
> +
> +let $ss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that, when desired delay is reset, the value is shown corectly.
> +--echo
> +
> +eval change master to master_delay= 0;
> +source include/start_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, SQL_Delay, 1);
> +let $ps_value= query_get_value(select Desired_Delay from performance_schema.replication_applier_configuration, Desired_Delay, 1);
> +let $assert_text= Value returned by SSS and PS table for Desired_Delay should be same.;
> +let $assert_cond= "$sss_value" = "$ps_value";
> +source include/assert.inc;
> +
> +source include/rpl_end.inc;
> diff --git a/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test
> new file mode 100644
> index 00000000000..52ee14cef2a
> --- /dev/null
> +++ b/mysql-test/suite/rpl/t/rpl_perfschema_applier_status.test
> @@ -0,0 +1,72 @@
> +# ==== Purpose ====
> +#
> +# This test script serves as the functionality testing for the table
> +# performance_schema.replication_applier_status. Test for ddl and dml
> +# operations is a part of the perfschema suite. The ddl/dml tests are named:
> +# 1) ddl_replication_applier_status.test and
> +# 2) dml_replication_applier_status.test.
> +#
> +# The follwing scenarios are tested in this script:
> +#
> +# - Verify that output is same as SSS on a fresh slave.
> +# - Verify that the value of this field is correct after STOP SLAVE.
> +# - Remaining delay is not tested.
> +# - Count_trnsaction is partially tested here making sure it can be queried.
> +# More testing in rpl/include/rpl_deadlock.test
> +#
> +# ==== Related Worklog ====
> +#
> +# MDEV-16437: merge 5.7 P_S replication instrumentation and tables
> +#
> +
> +source include/master-slave.inc;
> +source include/have_binlog_format_mixed.inc;
master-slave must always be last (I'll stop commenting on that too)
> +
> +let $assert_text= On master, the table should return an empty set.;
> +let $assert_cond= count(*) = 0 from performance_schema.replication_applier_status;
> +source include/assert.inc;
> +
> +--connection slave
> +
> +--echo
> +--echo # Verify that SELECT works and produces an output similar to
> +--echo # the corresponding field in SHOW SLAVE STATUS(SSS) in all scenarios.
> +--echo
> +
> +--echo
> +--echo # Verify that output is same as SSS on a fresh slave.
> +--echo
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
> +let $ps_value= query_get_value(select Service_State from performance_schema.replication_applier_status, Service_State, 1);
> +let $assert_text= SSS shows Slave_SQL_Running as "Yes". So, Service_State from this PS table should be "ON".;
> +let $assert_cond= "$sss_value" = "Yes" AND "$ps_value"= "ON";
> +source include/assert.inc;
> +
> +let $ss_value= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
> +let $ps_value= query_get_value(select count_transactions_retries from performance_schema.replication_applier_status, count_transactions_retries, 1);
> +let $assert_text= COUNT_TRANSACTION_RETRIES should be equal to Slave_retried_transactions.;
> +let $assert_cond= "$ps_value"= "$ss_value";
> +source include/assert.inc;
> +
> +--echo
> +--echo # Verify that the fields show the correct values after STOP SLAVE.
> +--echo
> +
> +source include/stop_slave.inc;
> +
> +let $sss_value= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
> +let $ps_value= query_get_value(select Service_State from performance_schema.replication_applier_status, Service_State, 1);
> +let $assert_text= SSS shows Slave_SQL_Running as "No". So, Service_State from this PS table should be "OFF".;
> +let $assert_cond= "$sss_value" = "No" AND "$ps_value"= "OFF";
> +source include/assert.inc;
> +
> +let $ss_value= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
> +let $ps_value= query_get_value(select count_transactions_retries from performance_schema.replication_applier_status, count_transactions_retries, 1);
> +let $assert_text= COUNT_TRANSACTION_RETRIES should be equal to Slave_retried_transactions.;
> +let $assert_cond= "$ps_value"= "$ss_value";
> +source include/assert.inc;
> +
> +source include/start_slave.inc;
> +source include/rpl_end.inc;
> +
> diff --git a/sql/rpl_mi.h b/sql/rpl_mi.h
> index 4d47689ac18..946d138d618 100644
> --- a/sql/rpl_mi.h
> +++ b/sql/rpl_mi.h
> @@ -50,13 +57,6 @@ class Domain_id_filter
> */
> DYNAMIC_ARRAY m_domain_ids[2];
>
> -public:
> - /* domain id list types */
> - enum enum_list_type {
> - DO_DOMAIN_IDS= 0,
> - IGNORE_DOMAIN_IDS
> - };
> -
Why did you move it up? Does it make any difference?
> Domain_id_filter();
>
> ~Domain_id_filter();
> diff --git a/storage/perfschema/table_replication_applier_status_by_coordinator.cc b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
> index beb8620b240..30cf56ce0c2 100644
> --- a/storage/perfschema/table_replication_applier_status_by_coordinator.cc
> +++ b/storage/perfschema/table_replication_applier_status_by_coordinator.cc
> @@ -55,12 +55,14 @@ table_replication_applier_status_by_coordinator::m_share=
> sizeof(pos_t), /* ref length */
> &m_table_lock,
> { C_STRING_WITH_LEN("CREATE TABLE replication_applier_status_by_coordinator("
> - "CHANNEL_NAME CHAR(64) collate utf8_general_ci not null,"
> + "CONNECTION_NAME VARCHAR(256) collate utf8_general_ci not null,"
please, don't rename existing columns, let's keep calling it CHANNEL_NAME everywhere.
> "THREAD_ID BIGINT UNSIGNED,"
> "SERVICE_STATE ENUM('ON','OFF') not null,"
> + "LAST_SEEN_TRANSACTION CHAR(57) not null,"
I think it's better to put new columns at the end.
> "LAST_ERROR_NUMBER INTEGER not null,"
> "LAST_ERROR_MESSAGE VARCHAR(1024) not null,"
> - "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null)") },
> + "LAST_ERROR_TIMESTAMP TIMESTAMP(0) not null,"
> + "LAST_TRANS_RETRY_COUNT INTEGER not null)") },
> false /* perpetual */
> };
>
> @@ -104,15 +106,7 @@ int table_replication_applier_status_by_coordinator::rnd_next(void)
> {
> mi= (Master_info *)my_hash_element(&master_info_index->master_info_hash, m_pos.m_index);
>
> - /*
> - Construct and display SQL Thread's (Coordinator) information in
> - 'replication_applier_status_by_coordinator' table only in the case of
> - multi threaded slave mode. Code should do nothing in the case of single
> - threaded slave mode. In case of single threaded slave mode SQL Thread's
> - status will be reported as part of
> - 'replication_applier_status_by_worker' table.
> - */
is this no longer true?
> - if (mi && mi->host[0] && /*mi->rli.get_worker_count() > */ 0)
> + if (mi && mi->host[0])
> {
> make_row(mi);
> m_next_pos.set_after(&m_pos);
> @@ -147,13 +141,20 @@ int table_replication_applier_status_by_coordinator::rnd_pos(const void *pos)
> void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
> {
> m_row_exists= false;
> + rpl_gtid gtid;
> + char buf[10+1+10+1+20+1];
> + String str(buf, sizeof(buf), system_charset_info);
There's a special template for it:
StringBuffer<10+1+10+1+20+1> str;
> + bool first= true;
> +
> + str.length(0);
not needed if you use StringBuffer<>
>
> DBUG_ASSERT(mi != NULL);
>
> mysql_mutex_lock(&mi->rli.data_lock);
>
> - m_row.channel_name_length= static_cast<uint>(mi->connection_name.length);
> - memcpy(m_row.channel_name, mi->connection_name.str, m_row.channel_name_length);
> + gtid= mi->rli.last_seen_gtid;
> + m_row.connection_name_length= static_cast<uint>(mi->connection_name.length);
> + memcpy(m_row.connection_name, mi->connection_name.str, m_row.connection_name_length);
>
> if (mi->rli.slave_running)
> {
> @@ -175,6 +176,18 @@ void table_replication_applier_status_by_coordinator::make_row(Master_info *mi)
> else
> m_row.service_state= PS_RPL_NO;
>
> + if ((gtid.seq_no > 0 &&
> + !rpl_slave_state_tostring_helper(&str, >id, &first)))
> + {
> + strmake(m_row.last_seen_transaction,str.ptr(), str.length());
> + m_row.last_seen_transaction_length= str.length();
> + }
> + else
> + {
> + m_row.last_seen_transaction_length= 0;
> + memcpy(m_row.last_seen_transaction, "", 1);
a strange way to write m_row.last_seen_transaction[0]= 0, but ok :)
> + }
> +
> mysql_mutex_lock(&mi->rli.err_lock);
>
> m_row.last_error_number= (long int) mi->rli.last_error().number;
Regards,
Sergei
VP of MariaDB Server Engineering
and security(a)mariadb.org