revision-id: be5698265a4195586142d1a34fdd1cce9d95d8a1 (mariadb-10.1.34-10-gbe5698265a4) parent(s): c6392d52ee2e918a65b05c275286ff4d450eef2c author: Jan Lindström committer: Jan Lindström timestamp: 2018-06-27 12:37:21 +0300 message: MDEV-15607: mysqld crashed few after node is being joined with sst This is a typical systemd response where it tries to shutdown the joiner (due to "timeout") before the joiner manages to complete SST. wsrep_sst_wait wsrep_SE_init_wait While waiting the operation to finish use mysql_cond_timedwait instead of mysql_cond_wait and if operation is not finished extend systemd timeout (if needed). --- sql/wsrep_sst.cc | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/sql/wsrep_sst.cc b/sql/wsrep_sst.cc index 4df969496bc..60683bf740c 100644 --- a/sql/wsrep_sst.cc +++ b/sql/wsrep_sst.cc @@ -30,6 +30,10 @@ #include <cstdio> #include <cstdlib> +#if MYSQL_VERSION_ID < 100200 +# include <my_service_manager.h> +#endif + static char wsrep_defaults_file[FN_REFLEN * 2 + 10 + 30 + sizeof(WSREP_SST_OPT_CONF) + sizeof(WSREP_SST_OPT_CONF_SUFFIX) + @@ -186,6 +190,9 @@ bool wsrep_before_SE() static bool sst_complete = false; static bool sst_needed = false; +#define WSREP_EXTEND_TIMEOUT_INTERVAL 30 +#define WSREP_TIMEDWAIT_SECONDS 10 + void wsrep_sst_grab () { WSREP_INFO("wsrep_sst_grab()"); @@ -197,11 +204,25 @@ void wsrep_sst_grab () // Wait for end of SST bool wsrep_sst_wait () { - if (mysql_mutex_lock (&LOCK_wsrep_sst)) abort(); + struct timespec wtime = {WSREP_TIMEDWAIT_SECONDS, 0}; + uint32 total_wtime = 0; + + if (mysql_mutex_lock (&LOCK_wsrep_sst)) + abort(); + + WSREP_INFO("Waiting for SST to complete."); + while (!sst_complete) { - WSREP_INFO("Waiting for SST to complete."); - mysql_cond_wait (&COND_wsrep_sst, &LOCK_wsrep_sst); + mysql_cond_timedwait (&COND_wsrep_sst, &LOCK_wsrep_sst, &wtime); + + if (!sst_complete) + { + total_wtime += wtime.tv_sec; + WSREP_DEBUG("Waiting for SST to complete. waited %u secs.", total_wtime); + service_manager_extend_timeout(WSREP_EXTEND_TIMEOUT_INTERVAL, + "WSREP state transfer ongoing, current seqno: %ld", local_seqno); + } } if (local_seqno >= 0) @@ -1298,10 +1319,22 @@ void wsrep_SE_init_grab() void wsrep_SE_init_wait() { + struct timespec wtime = {WSREP_TIMEDWAIT_SECONDS, 0}; + uint32 total_wtime=0; + while (SE_initialized == false) { - mysql_cond_wait (&COND_wsrep_sst_init, &LOCK_wsrep_sst_init); + mysql_cond_timedwait (&COND_wsrep_sst_init, &LOCK_wsrep_sst_init, &wtime); + + if (!SE_initialized) + { + total_wtime += wtime.tv_sec; + WSREP_DEBUG("Waiting for SST to complete. waited %u secs.", total_wtime); + service_manager_extend_timeout(WSREP_EXTEND_TIMEOUT_INTERVAL, + "WSREP SE initialization ongoing."); + } } + mysql_mutex_unlock (&LOCK_wsrep_sst_init); }