February 2010: 24 participants, 291 discussions
[Maria-developers] Rev 2741: Group commit for maria engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit2/
by sanja(a)askmonty.org, 12 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit2/
------------------------------------------------------------
revno: 2741
revision-id: sanja(a)askmonty.org-20100212131228-bgxli0wfybhjkvg9
parent: sergii(a)pisem.net-20100212084731-b5jst7oxhzp251pg
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit2
timestamp: Fri 2010-02-12 15:12:28 +0200
message:
Group commit for maria engine.
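For readers who just want to try the patch, here is a minimal usage sketch of the two new settings it introduces (values and semantics are taken from the system-variable help strings and the test file in the diff below; nothing here is required by the patch itself):

# Hard group commit: the committing thread does the sync() and, with a
# non-zero interval, waits up to that many microseconds for other commits
# to join the same sync().
SET GLOBAL maria_group_commit="HARD";
SET GLOBAL maria_group_commit_interval= 100;

# Soft group commit: commits do not wait for sync(); a background service
# thread sync()s the log at the given interval (DANGEROUS: the last
# transactions can be lost on a crash).
SET GLOBAL maria_group_commit="SOFT";
SET GLOBAL maria_group_commit_interval= 100;

# Default behaviour: no group commit.
SET GLOBAL maria_group_commit="NONE";
SET GLOBAL maria_group_commit_interval= 0;

# The patch also adds a counter of transaction log sync() calls:
SHOW GLOBAL STATUS LIKE "Maria_transaction_log_syncs";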
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-12 13:12:28 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 13:12:28 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-12 13:12:28 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== added directory 'randgen'
=== added directory 'randgen/conf'
=== added file 'randgen/conf/maria_group_commit.yy'
--- a/randgen/conf/maria_group_commit.yy 1970-01-01 00:00:00 +0000
+++ b/randgen/conf/maria_group_commit.yy 2010-02-12 13:12:28 +0000
@@ -0,0 +1,181 @@
+# test of group commit switching
+
+query:
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ change_group_commit | change_interval;
+
+
+select:
+ SELECT select_item FROM join where order_by limit;
+
+select_item:
+ * | X . _field ;
+
+join:
+ _table AS X |
+ _table AS X LEFT JOIN _table AS Y ON ( X . _field = Y . _field ) ;
+
+where:
+ |
+ WHERE X . _field < value |
+ WHERE X . _field > value |
+ WHERE X . _field = value ;
+
+where_delete:
+ |
+ WHERE _field < value |
+ WHERE _field > value |
+ WHERE _field = value ;
+
+order_by:
+ | ORDER BY X . _field ;
+
+limit:
+ | LIMIT _digit ;
+
+insert:
+ INSERT INTO _table ( _field , _field ) VALUES ( value , value ) ;
+
+update:
+ UPDATE _table AS X SET _field = value where order_by limit ;
+
+delete:
+ DELETE FROM _table where_delete LIMIT _digit ;
+
+value:
+ ' _letter ' | _digit | _date | _datetime | _time | _english ;
+
+change_group_commit:
+ SET GLOBAL MARIA_GROUP_COMMIT=none_soft_hard;
+
+none_soft_hard:
+ NONE | SOFT | HARD;
+
+change_interval:
+ set_interval | set_interval | set_interval | set_interval |
+ drop_interval;
+
+set_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=_tinyint_unsigned;
+
+drop_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=0;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2010-02-10 19:06:24 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 13:12:28 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (wait for the actual commit to disk), "
+ "\"soft\" (no wait for commit; DANGEROUS!!!)",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commits in microseconds (1/1000000 sec)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. The option only has an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3278,6 +3314,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3309,6 +3347,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3330,6 +3454,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-12 13:12:28 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-12 13:12:28 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief mutex for the soft sync service thread */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief condition variable for the soft sync service thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for the soft sync background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+ If the buffer was forced to close, this saves the value of its horizon,
+ otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+ When moving from one log buffer to another, we write the last of the
+ previous buffer to file and then move to start using the new log
+ buffer. In the case of a partly filled last page, this page is not moved
+ to the start of the new buffer but instead we set the 'skipped_data'
+ variable to tell us how much data at the beginning of the buffer is not
+ relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ Interval of file numbers which have to be sync()ed
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+ We should not increase counter in case of error above, but it is so
+ unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+ We read page part of which is not present in buffer,
+ so we should read absent part from file (page cache actually)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's ok to not lock the page because:
+ - The log handler has its own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+ skipped_data= 0; /* Read after skipped in buffer data */
+ /*
+ Now we have correct data in buffer up to 'skipped_data'. The
+ following memcpy() will move the data from the internal buffer
+ that was not yet on disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can use copy then in translog_page_validator() because it
do not put it permanently somewhere.
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3483,7 +3600,10 @@
my_bool version_changed= 0;
DBUG_ENTER("translog_init_with_table");
+ translog_syncs= 0;
+ flush_start= 0;
id_to_share= NULL;
+
log_descriptor.directory_fd= -1;
log_descriptor.is_everything_flushed= 1;
log_descriptor.flush_in_progress= 0;
@@ -3511,6 +3631,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4033,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4115,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4194,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4284,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7013,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7484,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7515,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7639,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change if we add more data to the page, so
+ we do not need to copy it and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7701,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7713,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() range of files (inclusive) and directory (by request)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir whether to sync the directory as well
+
+ @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ @brief Flushes buffers whose LSNs are less than or equal to address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed,
+ can be reset to the real last LSN address
+ @param sent_to_disk returns 'sent to disk' position
+ @param flush_horizon returns horizon of the flush
+
+ @note About terminology see comment to translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck information when will lock buffers one by
@@ -7656,15 +7809,15 @@
/*
if LSN up to which we have to flush bigger then maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7829,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7852,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits are made in passes. The thread that
+ started the flush first performs the actual flush; other threads set a
+ new goal (LSN) for the next pass (if it is the maximum) and wait for the
+ pass to end, or just wait for the pass to end.
+
+ - If hard group commit enabled and rate set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as there are new LSNs added. The process can not loop
+ forever because we have limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit is switched ON and less than rate microseconds have
+ passed since the last sync, then after buffers have been sent to disk,
+ wait until rate microseconds have passed since the last sync, do the sync
+ and return. This ensures that if we call sync infrequently we don't wait.
+
+ - If soft group commit is enabled, everything works as with 'non group
+ commit' but the thread doesn't do any real sync(). If rate is not zero
+ the sync() will be performed by a service thread with the given rate
+ when needed (i.e. when a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() releases log_flush_lock while it is
+ waiting, then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check time here because pthread_mutex_lock rarely takes
+ a lot of time so we can sacrifice a bit of precision for performance
+ (taking into account that my_micro_time() might be an expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other thread from continue */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() files from previous flush till current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep values for soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
@@ -8113,6 +8453,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8462,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
@@ -8356,6 +8712,159 @@
}
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("ma_translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function with an additional variable because we have to
+ restart the service thread with the new value, which we can't do inside
+ the variable update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-12 13:12:28 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND
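A rough way to observe the effect of the modes above once the patch is applied (an illustrative sketch, not part of the patch; the table name t_gc is made up, and it assumes several connections committing concurrently to a maria table):

# Snapshot the sync counter, run a batch of autocommit inserts from several
# connections, then check the counter again.
CREATE TABLE t_gc (a INT) ENGINE=MARIA;
SET GLOBAL maria_group_commit="HARD";
SET GLOBAL maria_group_commit_interval= 100;
SHOW GLOBAL STATUS LIKE "Maria_transaction_log_syncs";
# ... run the concurrent inserts here ...
SHOW GLOBAL STATUS LIKE "Maria_transaction_log_syncs";
# With "HARD" or "SOFT" the counter should grow much more slowly than the
# number of committed transactions; with "NONE" it should track the number
# of commits much more closely.
DROP TABLE t_gc;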
[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja(a)askmonty.org, 12 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100212091325-sluwoeo04cvmjewk
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Fri 2010-02-12 11:13:25 +0200
message:
Group commit for maria storage engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-12 09:13:25 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 09:13:25 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-12 09:13:25 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== added directory 'randgen'
=== added directory 'randgen/conf'
=== added file 'randgen/conf/maria_group_commit.yy'
--- a/randgen/conf/maria_group_commit.yy 1970-01-01 00:00:00 +0000
+++ b/randgen/conf/maria_group_commit.yy 2010-02-12 09:13:25 +0000
@@ -0,0 +1,181 @@
+# test of group commit switching
+
+query:
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ select | insert | update| delete |
+ change_group_commit | change_interval;
+
+
+select:
+ SELECT select_item FROM join where order_by limit;
+
+select_item:
+ * | X . _field ;
+
+join:
+ _table AS X |
+ _table AS X LEFT JOIN _table AS Y ON ( X . _field = Y . _field ) ;
+
+where:
+ |
+ WHERE X . _field < value |
+ WHERE X . _field > value |
+ WHERE X . _field = value ;
+
+where_delete:
+ |
+ WHERE _field < value |
+ WHERE _field > value |
+ WHERE _field = value ;
+
+order_by:
+ | ORDER BY X . _field ;
+
+limit:
+ | LIMIT _digit ;
+
+insert:
+ INSERT INTO _table ( _field , _field ) VALUES ( value , value ) ;
+
+update:
+ UPDATE _table AS X SET _field = value where order_by limit ;
+
+delete:
+ DELETE FROM _table where_delete LIMIT _digit ;
+
+value:
+ ' _letter ' | _digit | _date | _datetime | _time | _english ;
+
+change_group_commit:
+ SET GLOBAL MARIA_GROUP_COMMIT=none_soft_hard;
+
+none_soft_hard:
+ NONE | SOFT | HARD;
+
+change_interval:
+ set_interval | set_interval | set_interval | set_interval |
+ drop_interval;
+
+set_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=_tinyint_unsigned;
+
+drop_interval:
+ SET GLOBAL MARIA_GROUP_COMMIT_INTERVAL=0;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 09:13:25 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
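The help texts above define the whole user-visible interface: a mode ("none", "hard", "soft") plus an interval in microseconds. A standalone sketch of what the combinations are meant to do, going only by these descriptions and the translog_flush() notes further down (plain C, not engine code; the GC_* names are invented):

/*
  Illustration only -- not part of the patch.  Summarizes the intended
  behaviour of maria_group_commit / maria_group_commit_interval.
*/
#include <stdio.h>

enum gc_mode { GC_NONE, GC_HARD, GC_SOFT };  /* invented names */

static void describe(enum gc_mode mode, unsigned long interval_us)
{
  switch (mode) {
  case GC_NONE:
    printf("none: each flush pass ends with its own sync(), "
           "no extra waiting to gather commits\n");
    break;
  case GC_HARD:
    if (interval_us)
      printf("hard: a pass waits until %lu us have passed since the "
             "previous sync(), so concurrent commits share one sync()\n",
             interval_us);
    else
      printf("hard: no timed wait, but commits arriving during the pass "
             "are folded into the same sync()\n");
    break;
  case GC_SOFT:
    if (interval_us)
      printf("soft: commits return without sync(); a service thread "
             "sync()s roughly every %lu us\n", interval_us);
    else
      printf("soft: commits return without sync() and nothing syncs "
             "periodically -- data since the last sync can be lost\n");
    break;
  }
}

int main(void)
{
  describe(GC_HARD, 100);
  describe(GC_SOFT, 100);
  return 0;
}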
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3275,6 +3311,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3306,6 +3344,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3327,6 +3451,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-12 09:13:25 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-12 09:13:25 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+ If the buffer was forced to close it save value of its horizon
+ otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+ When moving from one log buffer to another, we write the last of the
+ previous buffer to file and then move to start using the new log
+ buffer. In the case of a part filed last page, this page is not moved
+ to the start of the new buffer but instead we set the 'skip_data'
+ variable to tell us how much data at the beginning of the buffer is not
+ relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
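The skipped_data comment above is the core of the optimization: when the last, partly filled page of the previous buffer carries over into a new buffer, the new buffer does not rewrite those bytes on flush. A standalone sketch of the resulting write window, with invented numbers (not engine code):

/*
  Illustration only -- not part of the patch.  Shows which byte range a
  buffer flush writes once 'skipped_data' bytes at its start are known
  to be covered by the previous buffer's flush.
*/
#include <stdio.h>

int main(void)
{
  unsigned long page_size=     8192;  /* TRANSLOG_PAGE_SIZE              */
  unsigned long buffer_offset= 24576; /* file offset the buffer maps to  */
  unsigned long buffer_size=   20480; /* bytes accumulated in the buffer */
  unsigned long skipped_data=  3000;  /* start of first page, already on
                                         disk via the previous buffer    */

  printf("pwrite %lu bytes at file offset %lu\n",
         buffer_size - skipped_data, buffer_offset + skipped_data);
  printf("skipped_data stays below one page: %s\n",
         skipped_data < page_size ? "yes" : "no");
  return 0;
}

This matches the my_pwrite() call later in the patch, which writes buffer->buffer + skipped_data for buffer->size - skipped_data bytes at offset plus skipped_data.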
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+ Switch for "soft" sync (no real sync() but periodical sync by service
+ thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+ File numbers interval which have to be sync()
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+ stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+ We should not increase counter in case of error above, but it is so
+ unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+ We read page part of which is not present in buffer,
+ so we should read absent part from file (page cache actually)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's ok to not lock the page because:
+ - The log handler has it's own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+ skipped_data= 0; /* Read after skipped in buffer data */
+ /*
+ Now we have correct data in buffer up to 'skipped_data'. The
+ following memcpy() will move the data from the internal buffer
+ that was not yet on disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can use copy then in translog_page_validator() because it
do not put it permanently somewhere.
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3483,7 +3600,10 @@
my_bool version_changed= 0;
DBUG_ENTER("translog_init_with_table");
+ translog_syncs= 0;
+ flush_start= 0;
id_to_share= NULL;
+
log_descriptor.directory_fd= -1;
log_descriptor.is_everything_flushed= 1;
log_descriptor.flush_in_progress= 0;
@@ -3511,6 +3631,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4033,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4115,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4194,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4284,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7013,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7484,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7515,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7639,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+ This page header does not change if we add more data to the page so
+ we can not copy it and will not overwrite later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7701,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7713,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+ @brief sync() range of files (inclusive) and directory (by request)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+ @param sync_dir need sync directory
+
+ return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+ @brief Flushes buffers with LSNs in them less or equal address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed,
+ can be reset to real last LSN address
+ @parem sent_to_disk returns 'sent to disk' position
+ @param flush_horizon returns horizon of the flush
+
+ @note About terminology see comment to translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck information when will lock buffers one by
@@ -7656,15 +7809,15 @@
/*
if LSN up to which we have to flush bigger then maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7829,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7852,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+ - Non group commit logic: Commits made in passes. Thread which started
+ flush first is performing actual flush, other threads sets new goal (LSN)
+ of the next pass (if it is maximum) and waits for the pass end or just
+ wait for the pass end.
+
+ - If hard group commit enabled and rate set to zero:
+ The first thread sends all changed buffers to disk. This is repeated
+ as long as there are new LSNs added. The process can not loop
+ forever because we have limited number of threads and they will wait
+ for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+ - If hard group commit switched ON and less than rate microseconds has
+ passed from last sync, then after buffers have been sent to disk
+ wait until rate microseconds has passed since last sync, do sync and return.
+ This ensures that if we call sync infrequently we don't do any waits.
+
+ - If soft group commit enabled everything works as with 'non group commit'
+ but the thread doesn't do any real sync(). If rate is not zero the
+ sync() will be performed by a service thread with the given rate
+ when needed (new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+ 'flushed' mean sent to disk and synced().
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+ translog_flush_wait_for_end() release log_flush_lock while is
+ waiting then acquire it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+ We do not check time here because pthread_mutex_lock rarely takes
+ a lot of time so we can sacrifice a bit precision to performance
+ (taking into account that my_micro_time() might be expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ /* prevent other thread from continue */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() files from previous flush till current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+ /* keep values for soft sync() and forced sync() actual */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
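As a worked example of the "hard" mode wait described in the comment above translog_flush() (invented times, not taken from the patch):

/*
  Illustration only -- not part of the patch.  The decision a "hard"
  group commit pass makes after buffers were sent to disk: sync at once,
  or wait for more commits until the interval since the last sync is up.
  Times are in microseconds and invented.
*/
#include <stdio.h>

int main(void)
{
  unsigned long long flush_interval= 100;  /* maria_group_commit_interval */
  unsigned long long flush_start=   5000;  /* when the last sync started  */
  unsigned long long now=           5040;  /* current time                */
  unsigned long long spent= now - flush_start;

  if (flush_interval == 0 || spent >= flush_interval)
    printf("sync now\n");
  else
    printf("wait up to %llu us for another commit to set a new goal, "
           "then sync\n", flush_interval - spent);
  return 0;
}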
@@ -8113,6 +8453,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8462,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
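Purging now has to respect soft sync: a log file that the service thread has not yet sync()ed must not be removed, so the purge bound is capped by soft_sync_min. A small sketch with invented file numbers (not engine code):

/*
  Illustration only -- not part of the patch.  Caps the highest log file
  eligible for purging at the lowest file soft sync has not confirmed yet.
*/
#include <stdio.h>

static unsigned long purge_bound(int soft_sync_on,
                                 unsigned long last_need_file,
                                 unsigned long soft_sync_min)
{
  if (soft_sync_on && soft_sync_min < last_need_file)
    return soft_sync_min;
  return last_need_file;
}

int main(void)
{
  printf("soft sync on:  purge files below %lu\n", purge_bound(1, 12, 9));
  printf("soft sync off: purge files below %lu\n", purge_bound(0, 12, 9));
  return 0;
}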
@@ -8356,6 +8712,159 @@
}
+
+/**
+ Sets soft sync mode
+
+ @param mode TRUE if we need switch soft sync on else off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+ @param mode TRUE if we need switch hard group commit on else off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+ DBUG_ENTER("ma_translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+ @note We use this function with additional variable because have to
+ restart service thread with new value which we can't make inside changing
+ variable routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
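The service thread above paces itself: it subtracts the time a sync pass took from the configured interval so that passes start roughly every group_commit_wait microseconds. A minimal sketch of that compensation, with invented numbers (not engine code):

/*
  Illustration only -- not part of the patch.  Sleep compensation used by
  the soft-sync service thread.
*/
#include <stdio.h>

int main(void)
{
  unsigned long long interval=  100;  /* group_commit_wait, microseconds   */
  unsigned long long loop_cost=  30;  /* time this iteration spent syncing */
  unsigned long long sleep_us= (loop_cost > interval) ? 0
                                                      : interval - loop_cost;

  printf("sleep %llu us before the next sync pass\n", sleep_us);
  return 0;
}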
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-12 09:13:25 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND
[Maria-developers] Rev 2740: Group commit for maria storage engine. in file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
by sanja@askmonty.org 12 Feb '10
12 Feb '10
At file:///Users/bell/maria/bzr/work-maria-5.2-groupcommit/
------------------------------------------------------------
revno: 2740
revision-id: sanja(a)askmonty.org-20100212065247-vnhehxm6snm32c1j
parent: knielsen(a)knielsen-hq.org-20100201190519-b9uktnn90rwwiile
committer: sanja(a)askmonty.org
branch nick: work-maria-5.2-groupcommit
timestamp: Fri 2010-02-12 08:52:47 +0200
message:
Group commit for maria storage engine.
=== added file 'mysql-test/suite/maria/r/group_commit.result'
--- a/mysql-test/suite/maria/r/group_commit.result 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/r/group_commit.result 2010-02-12 06:52:47 +0000
@@ -0,0 +1,17 @@
+drop table if exists t1;
+create table t1 (a int);
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'mysql-test/suite/maria/r/maria3.result'
--- a/mysql-test/suite/maria/r/maria3.result 2009-09-18 01:04:43 +0000
+++ b/mysql-test/suite/maria/r/maria3.result 2010-02-12 06:52:47 +0000
@@ -306,6 +306,8 @@
maria_block_size 8192
maria_checkpoint_interval 30
maria_force_start_after_recovery_failures 0
+maria_group_commit none
+maria_group_commit_interval 0
maria_log_file_size 4294959104
maria_log_purge_type immediate
maria_max_sort_file_size 9223372036853727232
@@ -328,6 +330,7 @@
Maria_pagecache_reads #
Maria_pagecache_write_requests #
Maria_pagecache_writes #
+Maria_transaction_log_syncs #
create table t1 (b char(0));
insert into t1 values(NULL),("");
select length(b) from t1;
=== added file 'mysql-test/suite/maria/t/group_commit.test'
--- a/mysql-test/suite/maria/t/group_commit.test 1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/maria/t/group_commit.test 2010-02-12 06:52:47 +0000
@@ -0,0 +1,71 @@
+# Test different ways of syncing (mostly syntax)
+
+--disable_warnings
+drop table if exists t1;
+--enable_warnings
+
+create table t1 (a int);
+
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="HARD";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 0;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="SOFT";
+SET GLOBAL maria_group_commit_interval= 100;
+--disable_query_log
+let $num = 5000;
+while ($num)
+{
+ insert into t1 values (1);
+ dec $num;
+}
+--enable_query_log
+SET GLOBAL maria_group_commit="NONE";
+SET GLOBAL maria_group_commit_interval= 0;
+drop table t1;
=== modified file 'storage/maria/ha_maria.cc'
--- a/storage/maria/ha_maria.cc 2009-12-03 11:34:11 +0000
+++ b/storage/maria/ha_maria.cc 2010-02-12 06:52:47 +0000
@@ -102,22 +102,40 @@
array_elements(maria_translog_purge_type_names) - 1, "",
maria_translog_purge_type_names, NULL
};
+
+/* transactional log directory sync */
const char *maria_sync_log_dir_names[]=
{
"NEVER", "NEWFILE", "ALWAYS", NullS
};
-
TYPELIB maria_sync_log_dir_typelib=
{
array_elements(maria_sync_log_dir_names) - 1, "",
maria_sync_log_dir_names, NULL
};
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+ "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+ array_elements(maria_group_commit_names) - 1, "",
+ maria_group_commit_names, NULL
+};
+
/** Interval between background checkpoints in seconds */
static ulong checkpoint_interval;
static void update_checkpoint_interval(MYSQL_THD thd,
struct st_mysql_sys_var *var,
void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save);
/** After that many consecutive recovery failures, remove logs */
static ulong force_start_after_recovery_failures;
static void update_log_file_size(MYSQL_THD thd,
@@ -164,6 +182,24 @@
NULL, update_log_file_size, TRANSLOG_FILE_SIZE,
TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies maria group commit mode. "
+ "Possible values are \"none\" (no group commit), "
+ "\"hard\" (with waiting to actual commit), "
+ "\"soft\" (no wait for commit (DANGEROUS!!!))",
+ NULL, update_maria_group_commit,
+ TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+ PLUGIN_VAR_RQCMDARG,
+ "Interval between commite in microseconds (1/1000000c)."
+ " 0 stands for no waiting"
+ " for other threads to come and do a commit in \"hard\" mode and no"
+ " sync()/commit at all in \"soft\" mode. Option has only an effect"
+ " if maria_group_commit is used",
+ NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);
+
static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
PLUGIN_VAR_RQCMDARG,
"Specifies how maria transactional log will be purged. "
@@ -3275,6 +3311,8 @@
MYSQL_SYSVAR(block_size),
MYSQL_SYSVAR(checkpoint_interval),
MYSQL_SYSVAR(force_start_after_recovery_failures),
+ MYSQL_SYSVAR(group_commit),
+ MYSQL_SYSVAR(group_commit_interval),
MYSQL_SYSVAR(page_checksum),
MYSQL_SYSVAR(log_dir_path),
MYSQL_SYSVAR(log_file_size),
@@ -3306,6 +3344,92 @@
}
/**
+ @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong value= (ulong)*((long *)var_ptr);
+ DBUG_ENTER("update_maria_group_commit");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu rate %lu",
+ value, (ulong)(*(long *)save),
+ maria_group_commit_interval));
+ /* old value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(FALSE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(FALSE);
+ if (maria_group_commit_interval)
+ translog_soft_sync_end();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ value= *(ulong *)var_ptr= (ulong)(*(long *)save);
+ translog_sync();
+ /* new value */
+ switch (value) {
+ case TRANSLOG_GCOMMIT_NONE:
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ translog_hard_group_commit(TRUE);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ translog_soft_sync(TRUE);
+ /* variable change made under global lock so we can just read it */
+ if (maria_group_commit_interval)
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+ struct st_mysql_sys_var *var,
+ void *var_ptr, const void *save)
+{
+ ulong new_value= (ulong)*((long *)save);
+ ulong *value_ptr= (ulong*) var_ptr;
+ DBUG_ENTER("update_maria_group_commit_interval");
+ DBUG_PRINT("enter", ("old value: %lu new value %lu group commit %lu",
+ *value_ptr, new_value, maria_group_commit));
+
+ /* variable change made under global lock so we can just read it */
+ switch (maria_group_commit) {
+ case TRANSLOG_GCOMMIT_NONE:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_HARD:
+ *value_ptr= new_value;
+ translog_set_group_commit_interval(new_value);
+ break;
+ case TRANSLOG_GCOMMIT_SOFT:
+ if (*value_ptr)
+ translog_soft_sync_end();
+ translog_set_group_commit_interval(new_value);
+ if ((*value_ptr= new_value))
+ translog_soft_sync_start();
+ break;
+ default:
+ DBUG_ASSERT(0); /* impossible */
+ }
+ DBUG_VOID_RETURN;
+}
+
+/**
@brief Updates the transaction log file limit.
*/
@@ -3327,6 +3451,7 @@
{"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
{"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
{"Maria_pagecache_writes", (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+ {"Maria_transaction_log_syncs", (char*) &translog_syncs, SHOW_LONGLONG},
{NullS, NullS, SHOW_LONG}
};
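
To make the semantics of the two new variables concrete, here is a minimal standalone
sketch of how a committing thread could decide what to do with the log sync. This is an
illustration only, not the engine's code; the enum and function names (commit_action,
decide_commit_sync) are made up for this example and only restate the behaviour described
in the option help texts above.

#include <stdint.h>

enum group_commit_mode { GC_NONE, GC_HARD, GC_SOFT };

enum commit_action
{
  ACTION_SYNC_NOW,        /* sync() in the committing thread right away        */
  ACTION_WAIT_THEN_SYNC,  /* wait out the rest of the interval, then sync()    */
  ACTION_DEFER_TO_THREAD, /* leave the sync() to the soft-sync service thread  */
  ACTION_NO_SYNC          /* "soft" with interval 0: no sync() at all          */
};

static enum commit_action
decide_commit_sync(enum group_commit_mode mode, uint64_t interval_us,
                   uint64_t now_us, uint64_t last_sync_us)
{
  switch (mode)
  {
  case GC_HARD:
    /* rate-limited group commit: wait only if the interval has not yet elapsed */
    if (interval_us && now_us - last_sync_us < interval_us)
      return ACTION_WAIT_THEN_SYNC;
    return ACTION_SYNC_NOW;
  case GC_SOFT:
    /* no sync() in the committing thread; a background thread syncs every interval_us */
    return interval_us ? ACTION_DEFER_TO_THREAD : ACTION_NO_SYNC;
  case GC_NONE:
  default:
    return ACTION_SYNC_NOW;  /* classic per-commit sync() */
  }
}

Waiting in "hard" mode pays off when many transactions commit concurrently: the commits
that arrive during the wait are all covered by the single sync().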
=== modified file 'storage/maria/ma_init.c'
--- a/storage/maria/ma_init.c 2008-10-09 20:03:54 +0000
+++ b/storage/maria/ma_init.c 2010-02-12 06:52:47 +0000
@@ -82,6 +82,11 @@
maria_inited= maria_multi_threaded= FALSE;
ft_free_stopwords();
ma_checkpoint_end();
+ if (translog_status == TRANSLOG_OK)
+ {
+ translog_soft_sync_end();
+ translog_sync();
+ }
if ((trid= trnman_get_max_trid()) > max_trid_in_control_file)
{
/*
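
The hunk above makes shutdown stop the soft sync machinery first and then force one last
sync. A generic sketch of that shutdown ordering, using plain POSIX primitives rather than
Maria's service-thread API (the syncer_t type and its fields are hypothetical):

#include <pthread.h>
#include <unistd.h>

typedef struct
{
  pthread_t       thread;
  pthread_mutex_t lock;
  pthread_cond_t  cond;
  int             stop;    /* set under lock, checked by the background thread  */
  int             log_fd;  /* file the background thread periodically fsync()s  */
} syncer_t;

static void syncer_shutdown(syncer_t *s)
{
  pthread_mutex_lock(&s->lock);
  s->stop= 1;                      /* ask the thread to exit ...          */
  pthread_cond_signal(&s->cond);   /* ... and wake it up if it is asleep  */
  pthread_mutex_unlock(&s->lock);
  pthread_join(s->thread, NULL);   /* wait for its last pass to finish    */
  (void) fsync(s->log_fd);         /* final sync once the thread is gone  */
}

Doing the final sync after the thread has stopped matters: anything written after the
thread's last pass would otherwise stay unsynced at shutdown.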
=== modified file 'storage/maria/ma_loghandler.c'
--- a/storage/maria/ma_loghandler.c 2010-01-06 21:27:53 +0000
+++ b/storage/maria/ma_loghandler.c 2010-02-12 06:52:47 +0000
@@ -18,6 +18,7 @@
#include "ma_blockrec.h" /* for some constants and in-write hooks */
#include "ma_key_recover.h" /* For some in-write hooks */
#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
/*
On Windows, neither my_open() nor my_sync() work for directories.
@@ -47,6 +48,15 @@
#include <m_ctype.h>
#endif
+/** @brief mutex for the soft sync service thread control */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for waking up or killing the background soft sync thread */
+static pthread_cond_t COND_soft_sync;
+/** @brief control structure for the soft sync background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+ {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
/* transaction log file descriptor */
typedef struct st_translog_file
{
@@ -124,10 +134,24 @@
/* Previous buffer offset to detect it flush finish */
TRANSLOG_ADDRESS prev_buffer_offset;
/*
+    If the buffer was forced to be closed, this saves the value of its
+    horizon; otherwise LSN_IMPOSSIBLE
+ */
+ TRANSLOG_ADDRESS pre_force_close_horizon;
+ /*
How much is written (or will be written when copy_to_buffer_in_progress
become 0) to this buffer
*/
translog_size_t size;
+ /*
+    When moving from one log buffer to another, we write the rest of the
+    previous buffer to file and then start using the new log buffer.
+    In the case of a partly filled last page, this page is not moved to
+    the start of the new buffer; instead we set the 'skipped_data'
+    variable to tell us how much data at the beginning of the buffer is
+    not relevant.
+ */
+ uint skipped_data;
/* File handler for this buffer */
TRANSLOG_FILE *file;
/* Threads which are waiting for buffer filling/freeing */
@@ -304,6 +328,7 @@
*/
pthread_mutex_t log_flush_lock;
pthread_cond_t log_flush_cond;
+ pthread_cond_t new_goal_cond;
/* Protects changing of headers of finished files (max_lsn) */
pthread_mutex_t file_header_lock;
@@ -344,13 +369,39 @@
ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* Mode of sync() for the log files directory */
ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
/* Marker for end of log */
static uchar end_of_log= 0;
#define END_OF_LOG &end_of_log
+/**
+  Switch for "soft" sync (no real sync(), but periodic sync by a service
+  thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+ Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+  Interval of file numbers which have to be sync()ed
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+  Group commit interval, stored in microseconds
+*/
+static uint32 group_commit_wait= 0;
enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
/* chunk types */
#define TRANSLOG_CHUNK_LSN 0x00 /* 0 chunk refer as LSN (head or tail */
@@ -980,12 +1031,17 @@
static TRANSLOG_FILE *get_current_logfile()
{
TRANSLOG_FILE *file;
+ DBUG_ENTER("get_current_logfile");
rw_rdlock(&log_descriptor.open_files_lock);
+ DBUG_PRINT("info", ("max_file: %lu min_file: %lu open_files: %lu",
+ (ulong) log_descriptor.max_file,
+ (ulong) log_descriptor.min_file,
+ (ulong) log_descriptor.open_files.elements));
DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
log_descriptor.open_files.elements);
file= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
rw_unlock(&log_descriptor.open_files_lock);
- return (file);
+ DBUG_RETURN(file);
}
uchar NEAR maria_trans_file_magic[]=
@@ -1069,6 +1125,7 @@
static my_bool translog_max_lsn_to_header(File file, LSN lsn)
{
uchar lsn_buff[LSN_STORE_SIZE];
+ my_bool rc;
DBUG_ENTER("translog_max_lsn_to_header");
DBUG_PRINT("enter", ("File descriptor: %ld "
"lsn: (%lu,0x%lx)",
@@ -1077,11 +1134,17 @@
lsn_store(lsn_buff, lsn);
- DBUG_RETURN(my_pwrite(file, lsn_buff,
- LSN_STORE_SIZE,
- (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
- log_write_flags) != 0 ||
- my_sync(file, MYF(MY_WME)) != 0);
+ rc= (my_pwrite(file, lsn_buff,
+ LSN_STORE_SIZE,
+ (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+ log_write_flags) != 0 ||
+ my_sync(file, MYF(MY_WME)) != 0);
+ /*
+ We should not increase counter in case of error above, but it is so
+ unlikely that we can ignore this case
+ */
+ translog_syncs++;
+ DBUG_RETURN(rc);
}
@@ -1423,7 +1486,9 @@
static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
{
DBUG_ENTER("translog_buffer_init");
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn=
+ LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
@@ -1435,6 +1500,7 @@
memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
/* Buffer size */
buffer->size= 0;
+ buffer->skipped_data= 0;
/* cond of thread which is waiting for buffer filling */
if (pthread_cond_init(&buffer->waiting_filling_buffer, 0))
DBUG_RETURN(1);
@@ -1489,7 +1555,10 @@
TODO: sync only we have changed the log
*/
if (!file->is_sync)
+ {
rc= my_sync(file->handler.file, MYF(MY_WME));
+ translog_syncs++;
+ }
rc|= my_close(file->handler.file, MYF(MY_WME));
my_free(file, MYF(0));
return test(rc);
@@ -2044,7 +2113,8 @@
(ulong) LSN_OFFSET(log_descriptor.horizon),
(ulong) LSN_OFFSET(log_descriptor.horizon)));
DBUG_ASSERT(buffer_no == buffer->buffer_no);
- buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
+ buffer->pre_force_close_horizon=
+ buffer->prev_last_lsn= buffer->last_lsn= LSN_IMPOSSIBLE;
DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0 buffer: 0x%lx",
(ulong) buffer));
buffer->offset= log_descriptor.horizon;
@@ -2052,6 +2122,7 @@
buffer->file= get_current_logfile();
buffer->overlay= 0;
buffer->size= 0;
+ buffer->skipped_data= 0;
translog_cursor_init(cursor, buffer, buffer_no);
DBUG_PRINT("info", ("file: #%ld (%d) init cursor #%u: 0x%lx "
"chaser: %d Size: %lu (%lu)",
@@ -2523,6 +2594,7 @@
TRANSLOG_ADDRESS offset= buffer->offset;
TRANSLOG_FILE *file= buffer->file;
uint8 ver= buffer->ver;
+ uint skipped_data;
DBUG_ENTER("translog_buffer_flush");
DBUG_PRINT("enter",
("Buffer: #%u 0x%lx file: %d offset: (%lu,0x%lx) size: %lu",
@@ -2557,6 +2629,8 @@
disk
*/
file= buffer->file;
+ skipped_data= buffer->skipped_data;
+ DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
i < buffer->size;
i+= TRANSLOG_PAGE_SIZE, pg++)
@@ -2573,13 +2647,16 @@
DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
DBUG_RETURN(1);
- if (pagecache_inject(log_descriptor.pagecache,
+ if (pagecache_write_part(log_descriptor.pagecache,
&file->handler, pg, 3,
buffer->buffer + i,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
- PAGECACHE_PIN_LEFT_UNPINNED, 0,
- LSN_IMPOSSIBLE))
+ PAGECACHE_PIN_LEFT_UNPINNED,
+ PAGECACHE_WRITE_DONE, 0,
+ LSN_IMPOSSIBLE,
+ skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data))
{
DBUG_PRINT("error",
("Can't write page (%lu,0x%lx) to pagecache, error: %d",
@@ -2589,10 +2666,12 @@
translog_stop_writing();
DBUG_RETURN(1);
}
+ skipped_data= 0;
}
file->is_sync= 0;
- if (my_pwrite(file->handler.file, buffer->buffer,
- buffer->size, LSN_OFFSET(buffer->offset),
+ if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+ buffer->size - buffer->skipped_data,
+ LSN_OFFSET(buffer->offset) + buffer->skipped_data,
log_write_flags))
{
DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
@@ -2985,6 +3064,7 @@
uchar *from, *table= NULL;
int is_last_unfinished_page;
uint last_protected_sector= 0;
+ uint skipped_data= curr_buffer->skipped_data;
TRANSLOG_FILE file_copy;
uint8 ver= curr_buffer->ver;
translog_wait_for_writers(curr_buffer);
@@ -2997,7 +3077,38 @@
}
DBUG_ASSERT(LSN_FILE_NO(addr) == LSN_FILE_NO(curr_buffer->offset));
from= curr_buffer->buffer + (addr - curr_buffer->offset);
- memcpy(buffer, from, TRANSLOG_PAGE_SIZE);
+ if (skipped_data && addr == curr_buffer->offset)
+ {
+ /*
+      We are reading a page, part of which is not present in the buffer,
+      so we have to read the absent part from the file (page cache, actually)
+ */
+ file= get_logfile_by_number(file_no);
+ DBUG_ASSERT(file != NULL);
+ /*
+ it's ok to not lock the page because:
+         - The log handler has its own page cache.
+ - There is only one thread that can access the log
+ cache at a time
+ */
+ if (!(buffer= pagecache_read(log_descriptor.pagecache,
+ &file->handler,
+ LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+ 3, buffer,
+ PAGECACHE_PLAIN_PAGE,
+ PAGECACHE_LOCK_LEFT_UNLOCKED,
+ NULL)))
+ DBUG_RETURN(NULL);
+ }
+ else
+    skipped_data= 0; /* we are reading past the skipped part of the buffer */
+ /*
+    Now the page buffer contains correct data up to 'skipped_data'. The
+    following memcpy() copies the rest from the internal log buffer,
+    i.e. the data that has not yet been written to disk.
+ */
+ memcpy(buffer + skipped_data, from + skipped_data,
+ TRANSLOG_PAGE_SIZE - skipped_data);
/*
We can use copy then in translog_page_validator() because it
do not put it permanently somewhere.
@@ -3291,6 +3402,7 @@
uint32 next_page_offset, page_rest;
uint32 i;
File fd;
+ int rc;
TRANSLOG_VALIDATOR_DATA data;
char path[FN_REFLEN];
uchar page_buff[TRANSLOG_PAGE_SIZE];
@@ -3316,14 +3428,19 @@
TRANSLOG_PAGE_SIZE);
page_rest= next_page_offset - LSN_OFFSET(addr);
memset(page_buff, TRANSLOG_FILLER, page_rest);
- if ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
- ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
- (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
- log_write_flags)) ||
- my_sync(fd, MYF(MY_WME))) |
- my_close(fd, MYF(MY_WME))) ||
- (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD))))
+ rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+ ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+ (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+ log_write_flags)) ||
+ my_sync(fd, MYF(MY_WME)))));
+ translog_syncs++;
+ rc|= (fd > 0 && my_close(fd, MYF(MY_WME)));
+ if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+ {
+ rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ translog_syncs++;
+ }
+ if (rc)
DBUG_RETURN(1);
/* fix the horizon */
@@ -3483,7 +3600,10 @@
my_bool version_changed= 0;
DBUG_ENTER("translog_init_with_table");
+ translog_syncs= 0;
+ flush_start= 0;
id_to_share= NULL;
+
log_descriptor.directory_fd= -1;
log_descriptor.is_everything_flushed= 1;
log_descriptor.flush_in_progress= 0;
@@ -3511,6 +3631,7 @@
pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
MY_MUTEX_INIT_FAST) ||
pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+ pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
my_rwlock_init(&log_descriptor.open_files_lock,
NULL) ||
my_init_dynamic_array(&log_descriptor.open_files,
@@ -3912,7 +4033,6 @@
log_descriptor.flushed= log_descriptor.horizon;
log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
- log_descriptor.previous_flush_horizon= log_descriptor.horizon;
/*
Now 'flushed' is set to 'horizon' value, but 'horizon' is (potentially)
address of the next LSN and we want indicate that all LSNs that are
@@ -3995,6 +4115,10 @@
It is beginning of the log => there is no LSNs in the log =>
There is no harm in leaving it "as-is".
*/
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.
+ previous_flush_horizon)));
DBUG_RETURN(0);
}
file_no--;
@@ -4070,6 +4194,9 @@
translog_free_record_header(&rec);
}
}
+ log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+ DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
DBUG_RETURN(0);
err:
ma_message_no_user(0, "log initialization failed");
@@ -4157,6 +4284,7 @@
pthread_mutex_destroy(&log_descriptor.log_flush_lock);
pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
pthread_cond_destroy(&log_descriptor.log_flush_cond);
+ pthread_cond_destroy(&log_descriptor.new_goal_cond);
rwlock_destroy(&log_descriptor.open_files_lock);
delete_dynamic(&log_descriptor.open_files);
delete_dynamic(&log_descriptor.unfinished_files);
@@ -6885,11 +7013,11 @@
{
translog_size_t res;
DBUG_ENTER("translog_read_record_header_from_buffer");
- DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
DBUG_PRINT("info", ("page byte: 0x%x offset: %u",
(uint) page[page_offset], (uint) page_offset));
+ DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
buff->short_trid= uint2korr(page + page_offset + 1);
DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
@@ -7356,27 +7484,27 @@
"Buffer addr: (%lu,0x%lx) "
"Page addr: (%lu,0x%lx) "
"size: %lu (%lu) Pg: %u left: %u in progress %u",
- (uint) log_descriptor.bc.buffer_no,
- (ulong) log_descriptor.bc.buffer,
- LSN_IN_PARTS(log_descriptor.bc.buffer->offset),
+ (uint) old_buffer_no,
+ (ulong) old_buffer,
+ LSN_IN_PARTS(old_buffer->offset),
(ulong) LSN_FILE_NO(log_descriptor.horizon),
(ulong) (LSN_OFFSET(log_descriptor.horizon) -
log_descriptor.bc.current_page_fill),
- (ulong) log_descriptor.bc.buffer->size,
+ (ulong) old_buffer->size,
(ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
buffer->buffer),
(uint) log_descriptor.bc.current_page_fill,
(uint) left,
- (uint) log_descriptor.bc.buffer->
+ (uint) old_buffer->
copy_to_buffer_in_progress));
translog_lock_assert_owner();
LINT_INIT(current_page_fill);
- new_buff_beginning= log_descriptor.bc.buffer->offset;
- new_buff_beginning+= log_descriptor.bc.buffer->size; /* increase offset */
+ new_buff_beginning= old_buffer->offset;
+ new_buff_beginning+= old_buffer->size; /* increase offset */
DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
- LSN_FILE_NO(log_descriptor.bc.buffer->offset));
+ LSN_FILE_NO(old_buffer->offset));
translog_check_cursor(&log_descriptor.bc);
DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
if (left)
@@ -7387,18 +7515,20 @@
*/
DBUG_PRINT("info", ("left: %u", (uint) left));
+ old_buffer->pre_force_close_horizon=
+ old_buffer->offset + old_buffer->size;
/* decrease offset */
new_buff_beginning-= log_descriptor.bc.current_page_fill;
current_page_fill= log_descriptor.bc.current_page_fill;
memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
- log_descriptor.bc.buffer->size+= left;
+ old_buffer->size+= left;
DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx "
"Size: %lu",
- (uint) log_descriptor.bc.buffer->buffer_no,
- (ulong) log_descriptor.bc.buffer,
- (ulong) log_descriptor.bc.buffer->size));
- DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+ (uint) old_buffer->buffer_no,
+ (ulong) old_buffer,
+ (ulong) old_buffer->size));
+ DBUG_ASSERT(old_buffer->buffer_no ==
log_descriptor.bc.buffer_no);
}
else
@@ -7509,11 +7639,21 @@
if (left)
{
- /*
- TODO: do not copy beginning of the page if we have no CRC or sector
- checks on
- */
- memcpy(new_buffer->buffer, data, current_page_fill);
+ if (log_descriptor.flags &
+ (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+ memcpy(new_buffer->buffer, data, current_page_fill);
+ else
+ {
+ /*
+        This page header does not change when we add more data to the page,
+        so we need not copy it here and will not overwrite it later
+ */
+ new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+ memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+ DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+ }
}
old_buffer->next_buffer_offset= new_buffer->offset;
translog_buffer_lock(new_buffer);
@@ -7561,6 +7701,7 @@
{
log_descriptor.next_pass_max_lsn= lsn;
log_descriptor.max_lsn_requester= pthread_self();
+ pthread_cond_broadcast(&log_descriptor.new_goal_cond);
}
while (flush_no == log_descriptor.flush_no)
{
@@ -7572,66 +7713,78 @@
/**
- @brief Flush the log up to given LSN (included)
-
- @param lsn log record serial number up to which (inclusive)
- the log has to be flushed
-
- @return Operation status
+  @brief sync() a range of files (inclusive) and the directory (if requested)
+
+ @param min min internal file number to flush
+ @param max max internal file number to flush
+  @param sync_dir   TRUE if the log directory has to be sync()ed as well
+
+  @return Operation status
@retval 0 OK
@retval 1 Error
-
-*/
-
-my_bool translog_flush(TRANSLOG_ADDRESS lsn)
-{
- LSN sent_to_disk= LSN_IMPOSSIBLE;
- TRANSLOG_ADDRESS flush_horizon;
- uint fn, i;
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+ my_bool sync_dir)
+{
+ uint fn;
+ my_bool rc= 0;
+ ulonglong flush_interval;
+ DBUG_ENTER("translog_sync_files");
+ DBUG_PRINT("info", ("min: %lu max: %lu sync dir: %d",
+ (ulong) min, (ulong) max, (int) sync_dir));
+ DBUG_ASSERT(min <= max);
+
+ flush_interval= group_commit_wait;
+ if (flush_interval)
+ flush_start= my_micro_time();
+ for (fn= min; fn <= max; fn++)
+ {
+ TRANSLOG_FILE *file= get_logfile_by_number(fn);
+ DBUG_ASSERT(file != NULL);
+ if (!file->is_sync)
+ {
+ if (my_sync(file->handler.file, MYF(MY_WME)))
+ {
+ rc= 1;
+ translog_stop_writing();
+ DBUG_RETURN(rc);
+ }
+ translog_syncs++;
+ file->is_sync= 1;
+ }
+ }
+
+ if (sync_dir)
+ {
+ if (!(rc= sync_dir(log_descriptor.directory_fd,
+ MYF(MY_WME | MY_IGNORE_BADFD))))
+ translog_syncs++;
+ }
+
+ DBUG_RETURN(rc);
+}
+
+
+/*
+  @brief Flushes buffers with LSNs in them less than or equal to address <lsn>
+
+ @param lsn address up to which all LSNs should be flushed,
+ can be reset to real last LSN address
+  @param sent_to_disk    returns 'sent to disk' position
+ @param flush_horizon returns horizon of the flush
+
+  @note For terminology, see the comment to translog_flush().
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+ TRANSLOG_ADDRESS *sent_to_disk,
+ TRANSLOG_ADDRESS *flush_horizon)
+{
dirty_buffer_mask_t dirty_buffer_mask;
+ uint i;
uint8 last_buffer_no, start_buffer_no;
- my_bool rc= 0;
- DBUG_ENTER("translog_flush");
- DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
- DBUG_ASSERT(translog_status == TRANSLOG_OK ||
- translog_status == TRANSLOG_READONLY);
- LINT_INIT(sent_to_disk);
-
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
- DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.flushed)));
- if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
- {
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- if (log_descriptor.flush_in_progress)
- {
- translog_flush_set_new_goal_and_wait(lsn);
- if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
- {
- /* fix lsn if it was horizon */
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
- lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
- translog_flush_wait_for_end(lsn);
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
- DBUG_RETURN(0);
- }
- log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
- }
- log_descriptor.flush_in_progress= 1;
- flush_horizon= log_descriptor.previous_flush_horizon;
- DBUG_PRINT("info", ("flush_in_progress is set"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);
-
- translog_lock();
- if (log_descriptor.is_everything_flushed)
- {
- DBUG_PRINT("info", ("everything is flushed"));
- rc= (translog_status == TRANSLOG_READONLY);
- translog_unlock();
- goto out;
- }
+ DBUG_ENTER("translog_flush_buffers");
/*
We will recheck information when will lock buffers one by
@@ -7656,15 +7809,15 @@
/*
if LSN up to which we have to flush bigger then maximum LSN of previous
buffer and at least one LSN was saved in the current buffer (last_lsn !=
- LSN_IMPOSSIBLE) then we better finish the current buffer.
+ LSN_IMPOSSIBLE) then we have to close the current buffer.
*/
- if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+ if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
{
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
- lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+ *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
- LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+ LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
last_buffer_no= log_descriptor.bc.buffer_no;
log_descriptor.is_everything_flushed= 1;
translog_force_current_buffer_to_finish();
@@ -7676,8 +7829,10 @@
TRANSLOG_BUFFERS_NO);
translog_unlock();
}
- sent_to_disk= translog_get_sent_to_disk();
- if (cmp_translog_addr(lsn, sent_to_disk) > 0)
+
+ /* flush buffers */
+ *sent_to_disk= translog_get_sent_to_disk();
+ if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
{
DBUG_PRINT("info", ("Start buffer #: %u last buffer #: %u",
@@ -7697,53 +7852,238 @@
LSN_IN_PARTS(buffer->last_lsn),
(buffer->file ?
"dirty" : "closed")));
- if (buffer->prev_last_lsn <= lsn &&
+ if (buffer->prev_last_lsn <= *lsn &&
buffer->file != NULL)
{
- DBUG_ASSERT(flush_horizon <= buffer->offset + buffer->size);
- flush_horizon= buffer->offset + buffer->size;
+ DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+ *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+ buffer->pre_force_close_horizon :
+ buffer->offset + buffer->size);
+ /* pre_force_close_horizon is reset during new buffer start */
+ DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(*flush_horizon)));
+ DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
translog_buffer_flush(buffer);
}
translog_buffer_unlock(buffer);
i= (i + 1) % TRANSLOG_BUFFERS_NO;
} while (i != last_buffer_no);
- sent_to_disk= translog_get_sent_to_disk();
- }
-
- /* sync files from previous flush till current one */
- for (fn= LSN_FILE_NO(log_descriptor.flushed); fn <= LSN_FILE_NO(lsn); fn++)
- {
- TRANSLOG_FILE *file= get_logfile_by_number(fn);
- DBUG_ASSERT(file != NULL);
- if (!file->is_sync)
- {
- if (my_sync(file->handler.file, MYF(MY_WME)))
+ *sent_to_disk= translog_get_sent_to_disk();
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/**
+ @brief Flush the log up to given LSN (included)
+
+ @param lsn log record serial number up to which (inclusive)
+ the log has to be flushed
+
+ @return Operation status
+ @retval 0 OK
+ @retval 1 Error
+
+ @note
+
+    - Non group commit logic: commits are made in passes. The thread that
+    started the flush first performs the actual flush; other threads either
+    set a new goal (LSN) for the next pass (if it is the maximum so far) and
+    wait for the pass to end, or just wait for the pass to end.
+
+    - If hard group commit is enabled and the rate is set to zero:
+    The first thread sends all changed buffers to disk. This is repeated
+    as long as new LSNs keep being added. The process can not loop
+    forever because we have a limited number of threads and they will wait
+    for the data to be synced.
+ Pseudo code:
+
+ do
+ send changed buffers to disk
+ while new_goal
+ sync
+
+    - If hard group commit is switched ON and less than 'rate' microseconds
+    have passed since the last sync, then after the buffers have been sent
+    to disk, wait until 'rate' microseconds have passed since the last sync,
+    do the sync and return. This ensures that if sync is called infrequently
+    we don't do any waits.
+
+    - If soft group commit is enabled, everything works as with 'non group
+    commit', but the thread doesn't do any real sync(). If the rate is not
+    zero, the sync() will be performed by a service thread at the given rate
+    when needed (i.e. when a new LSN appears).
+
+ @note Terminology:
+ 'sent to disk' means written to disk but not sync()ed,
+  'flushed' means sent to disk and sync()ed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+ struct timespec abstime;
+ ulonglong flush_interval;
+ ulonglong time_spent;
+ LSN sent_to_disk= LSN_IMPOSSIBLE;
+ TRANSLOG_ADDRESS flush_horizon;
+ my_bool rc= 0;
+ my_bool hgroup_commit_at_start;
+ DBUG_ENTER("translog_flush");
+ DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+ DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+ translog_status == TRANSLOG_READONLY);
+ LINT_INIT(sent_to_disk);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+ LSN_IN_PARTS(log_descriptor.flushed)));
+ if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+ {
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ if (log_descriptor.flush_in_progress)
+ {
+ translog_lock();
+ /* fix lsn if it was horizon */
+ if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+ lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+ translog_unlock();
+ translog_flush_set_new_goal_and_wait(lsn);
+ if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+ {
+ /*
+        translog_flush_wait_for_end() releases log_flush_lock while it is
+        waiting, then acquires it again
+ */
+ translog_flush_wait_for_end(lsn);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_RETURN(0);
+ }
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+ }
+ log_descriptor.flush_in_progress= 1;
+ flush_horizon= log_descriptor.previous_flush_horizon;
+ DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+ LSN_IN_PARTS(flush_horizon)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ hgroup_commit_at_start= hard_group_commit;
+ if (hgroup_commit_at_start)
+ flush_interval= group_commit_wait;
+
+ translog_lock();
+ if (log_descriptor.is_everything_flushed)
+ {
+ DBUG_PRINT("info", ("everything is flushed"));
+ translog_unlock();
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+
+ for (;;)
+ {
+ /* Following function flushes buffers and makes translog_unlock() */
+ translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+ if (!hgroup_commit_at_start)
+ break; /* flush pass is ended */
+
+retest:
+ /*
+      We do not check the time here because pthread_mutex_lock rarely takes
+      a lot of time, so we can sacrifice a bit of precision for performance
+      (taking into account that my_micro_time() might be an expensive call).
+ */
+ if (flush_interval == 0)
+ break; /* flush pass is ended */
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+ {
+ if (flush_interval == 0 ||
+ (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
{
- rc= 1;
- translog_stop_writing();
- sent_to_disk= LSN_IMPOSSIBLE;
- goto out;
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ break;
}
- file->is_sync= 1;
- }
- }
-
- if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
- (LSN_FILE_NO(log_descriptor.previous_flush_horizon) !=
- LSN_FILE_NO(flush_horizon) ||
- ((LSN_OFFSET(log_descriptor.previous_flush_horizon) - 1) /
- TRANSLOG_PAGE_SIZE) !=
- ((LSN_OFFSET(flush_horizon) - 1) / TRANSLOG_PAGE_SIZE)))
- rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+ DBUG_PRINT("info", ("flush waits: %llu interval: %llu spent: %llu",
+ flush_interval - time_spent,
+ flush_interval, time_spent));
+ /* wait time or next goal */
+ set_timespec_nsec(abstime, flush_interval - time_spent);
+ pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+ &log_descriptor.log_flush_lock,
+ &abstime);
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+ DBUG_PRINT("info", ("retest conditions"));
+ goto retest;
+ }
+
+ /* take next goal */
+ lsn= log_descriptor.next_pass_max_lsn;
+ log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+    /* prevent other threads from continuing */
+ log_descriptor.max_lsn_requester= pthread_self();
+ DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+ LSN_IN_PARTS(lsn)));
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+ /* next flush pass */
+ DBUG_PRINT("info", ("next flush pass"));
+ translog_lock();
+ }
+
+ /*
+ sync() files from previous flush till current one
+ */
+ if (!soft_sync || hgroup_commit_at_start)
+ {
+ if ((rc=
+ translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+ LSN_FILE_NO(lsn),
+ sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+ (LSN_FILE_NO(log_descriptor.
+ previous_flush_horizon) !=
+ LSN_FILE_NO(flush_horizon) ||
+ (LSN_OFFSET(log_descriptor.
+ previous_flush_horizon) /
+ TRANSLOG_PAGE_SIZE) !=
+ (LSN_OFFSET(flush_horizon) /
+ TRANSLOG_PAGE_SIZE)))))
+ {
+ sent_to_disk= LSN_IMPOSSIBLE;
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
+ goto out;
+ }
+    /* keep the values used by soft sync() and forced sync() up to date */
+ {
+ uint32 fileno= LSN_FILE_NO(lsn);
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_min, fileno);
+ my_atomic_store32(&soft_sync_max, fileno);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+ }
+ else
+ {
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ my_atomic_store32(&soft_sync_max, LSN_FILE_NO(lsn));
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ }
+
+ DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+ pthread_mutex_lock(&log_descriptor.log_flush_lock);
log_descriptor.previous_flush_horizon= flush_horizon;
out:
- pthread_mutex_lock(&log_descriptor.log_flush_lock);
if (sent_to_disk != LSN_IMPOSSIBLE)
log_descriptor.flushed= sent_to_disk;
log_descriptor.flush_in_progress= 0;
log_descriptor.flush_no++;
DBUG_PRINT("info", ("flush_in_progress is dropped"));
- pthread_mutex_unlock(&log_descriptor.log_flush_lock);\
+ pthread_mutex_unlock(&log_descriptor.log_flush_lock);
pthread_cond_broadcast(&log_descriptor.log_flush_cond);
DBUG_RETURN(rc);
}
@@ -8113,6 +8453,8 @@
my_bool translog_purge(TRANSLOG_ADDRESS low)
{
uint32 last_need_file= LSN_FILE_NO(low);
+ uint32 min_unsync;
+ int soft;
TRANSLOG_ADDRESS horizon= translog_get_horizon();
int rc= 0;
DBUG_ENTER("translog_purge");
@@ -8120,12 +8462,26 @@
DBUG_ASSERT(translog_status == TRANSLOG_OK ||
translog_status == TRANSLOG_READONLY);
+ soft= soft_sync;
+ my_atomic_rwlock_wrlock(&soft_sync_rwl);
+ min_unsync= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_wrunlock(&soft_sync_rwl);
+ DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+ if (soft && min_unsync < last_need_file)
+ {
+ last_need_file= min_unsync;
+ DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+ }
+
pthread_mutex_lock(&log_descriptor.purger_lock);
+ DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+ (ulong) log_descriptor.last_lsn_checked));
if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
{
uint32 i;
uint32 min_file= translog_first_file(horizon, 1);
DBUG_ASSERT(min_file != 0); /* log is already started */
+ DBUG_PRINT("info", ("min_file: %lu:",(ulong) min_file));
for(i= min_file; i < last_need_file && rc == 0; i++)
{
LSN lsn= translog_get_file_max_lsn_stored(i);
@@ -8356,6 +8712,159 @@
}
+
+/**
+ Sets soft sync mode
+
+   @param mode            TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+ soft_sync= mode;
+}
+
+
+/**
+ Sets hard group commit
+
+   @param mode            TRUE to switch hard group commit on, FALSE to switch it off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+ hard_group_commit= mode;
+}
+
+
+/**
+ @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+ uint32 max= get_current_logfile()->number;
+ uint32 min;
+  DBUG_ENTER("translog_sync");
+
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+ if (!min)
+ min= max;
+
+ translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief set rate for group commit
+
+ @param interval interval to set.
+
+  @note We use this function together with an additional variable because we
+  have to restart the service thread with the new value, which we cannot do
+  inside the variable update routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+ DBUG_ENTER("translog_set_group_commit_interval");
+ group_commit_wait= interval;
+ DBUG_PRINT("info", ("wait: %llu",
+ (ulonglong)group_commit_wait));
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ @brief syncing service thread
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+ my_thread_init();
+ {
+ DBUG_ENTER("ma_soft_sync_background");
+ for(;;)
+ {
+ ulonglong prev_loop= my_micro_time();
+ ulonglong time, sleep;
+ uint32 min, max, sync_request;
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ sync_request= my_atomic_load32(&soft_need_sync);
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 0);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ sleep= group_commit_wait;
+ if (sync_request)
+ translog_sync_files(min, max, FALSE);
+ time= my_micro_time() - prev_loop;
+ if (time > sleep)
+ sleep= 0;
+ else
+ sleep-= time;
+ if (my_service_thread_sleep(&soft_sync_control, sleep))
+ break;
+ }
+ my_service_thread_signal_end(&soft_sync_control);
+ my_thread_end();
+ DBUG_RETURN(0);
+ }
+}
+
+
+/**
+ @brief Starts syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+ pthread_t th;
+ int res= 0;
+ uint32 min, max;
+ DBUG_ENTER("translog_soft_sync_start");
+
+ /* check and init variables */
+ my_atomic_rwlock_rdlock(&soft_sync_rwl);
+ min= my_atomic_load32(&soft_sync_min);
+ max= my_atomic_load32(&soft_sync_max);
+ if (!max)
+ my_atomic_store32(&soft_sync_max, (max= get_current_logfile()->number));
+ if (!min)
+ my_atomic_store32(&soft_sync_min, max);
+ my_atomic_store32(&soft_need_sync, 1);
+ my_atomic_rwlock_rdunlock(&soft_sync_rwl);
+
+ if (!(res= ma_service_thread_control_init(&soft_sync_control)))
+ if (!(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+ soft_sync_control.status= THREAD_RUNNING;
+ DBUG_RETURN(res);
+}
+
+
+/**
+ @brief Stops syncing thread
+*/
+
+void translog_soft_sync_end(void)
+{
+ DBUG_ENTER("translog_soft_sync_end");
+ if (soft_sync_control.inited)
+ {
+ ma_service_thread_control_end(&soft_sync_control);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
#ifdef MARIA_DUMP_LOG
#include <my_getopt.h>
extern void translog_example_table_init();
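
Most of the subtlety in this file is in the skipped_data handling of
translog_buffer_flush(): when a partly filled last page is carried over into the next
buffer, its head is already on disk and must not be written again. A standalone sketch of
that offset arithmetic (the helper name is hypothetical, not part of the loghandler's API):

#include <sys/types.h>
#include <unistd.h>

/* Write only the part of the buffer past 'skipped' bytes; the skipped head
   repeats data that is already on disk at the same file position. */
static ssize_t write_buffer_tail(int fd, const unsigned char *buf,
                                 size_t size, off_t file_offset,
                                 size_t skipped)
{
  return pwrite(fd, buf + skipped, size - skipped,
                file_offset + (off_t) skipped);
}

This mirrors the my_pwrite() call in the patch that adds buffer->skipped_data to the
buffer pointer, subtracts it from the size and adds it to the file offset.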
=== modified file 'storage/maria/ma_loghandler.h'
--- a/storage/maria/ma_loghandler.h 2009-01-15 22:25:53 +0000
+++ b/storage/maria/ma_loghandler.h 2010-02-12 06:52:47 +0000
@@ -342,6 +342,14 @@
TRANSLOG_SHUTDOWN /* going to shutdown the loghandler */
};
extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
/*
all the rest added because of recovery; should we make
@@ -441,6 +449,14 @@
typedef enum
{
+ TRANSLOG_GCOMMIT_NONE,
+ TRANSLOG_GCOMMIT_HARD,
+ TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
TRANSLOG_PURGE_IMMIDIATE,
TRANSLOG_PURGE_EXTERNAL,
TRANSLOG_PURGE_ONDEMAND
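
The note in translog_flush() above describes the "hard" group commit wait in pseudo code.
Below is a simplified, self-contained version of that wait (plain POSIX, hypothetical
names; the real code uses my_micro_time(), set_timespec_nsec() and new_goal_cond, and
re-checks for a new flush goal after waking up):

#include <pthread.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
  struct timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  return (uint64_t) ts.tv_sec * 1000000ULL + (uint64_t) ts.tv_nsec / 1000;
}

/*
  Wait out the remainder of 'interval_us' since 'last_sync_us' before the
  caller does its sync(). A broadcast on 'new_goal' means another commit has
  arrived; the caller then re-checks and may run one more flush pass first.
*/
static void wait_group_commit_interval(pthread_mutex_t *lock,
                                       pthread_cond_t *new_goal,
                                       uint64_t last_sync_us,
                                       uint64_t interval_us)
{
  uint64_t elapsed= now_us() - last_sync_us;
  uint64_t deadline_us;
  struct timespec abstime;

  if (elapsed >= interval_us)
    return;                             /* interval already passed: no wait */

  deadline_us= now_us() + (interval_us - elapsed);
  abstime.tv_sec=  (time_t) (deadline_us / 1000000ULL);
  abstime.tv_nsec= (long) ((deadline_us % 1000000ULL) * 1000);

  pthread_mutex_lock(lock);
  pthread_cond_timedwait(new_goal, lock, &abstime);
  pthread_mutex_unlock(lock);
}

If commits are rare, the interval has usually already elapsed, so the function returns
immediately and no latency is added; under load, the wait lets more commits pile up
behind a single sync().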
[Maria-developers] Rev 2756: BUG#31480: Incorrect result for nested subquery when executed via semi join in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 12 Feb '10
12 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2756
revision-id: psergey(a)askmonty.org-20100211235958-p11o4e80dlrn2bsq
parent: psergey(a)askmonty.org-20100211223118-5fzuidow1pkubpzl
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 02:59:58 +0300
message:
BUG#31480: Incorrect result for nested subquery when executed via semi join
- Variant #3 of the fix. It also
= Unifies code with table elimination's
= is able to handle FROM-subquery pullout.
=== modified file 'mysql-test/r/subselect_sj.result'
--- a/mysql-test/r/subselect_sj.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect_sj.result 2010-02-11 23:59:58 +0000
@@ -779,3 +779,48 @@
1 PRIMARY it2 ALL NULL NULL NULL NULL 20 Using where; End temporary
DROP TABLE ot1, it1, it2;
# End of BUG#38075
+#
+# BUG#31480: Incorrect result for nested subquery when executed via semi join
+#
+create table t1 (a int not null, b int not null);
+create table t2 (c int not null, d int not null);
+create table t3 (e int not null);
+insert into t1 values (1,10);
+insert into t1 values (2,10);
+insert into t1 values (1,20);
+insert into t1 values (2,20);
+insert into t1 values (3,20);
+insert into t1 values (2,30);
+insert into t1 values (4,40);
+insert into t2 values (2,10);
+insert into t2 values (2,20);
+insert into t2 values (4,10);
+insert into t2 values (5,10);
+insert into t2 values (3,20);
+insert into t2 values (2,40);
+insert into t3 values (10);
+insert into t3 values (30);
+insert into t3 values (10);
+insert into t3 values (20);
+explain extended
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY t2 ALL NULL NULL NULL NULL 6 100.00 Start temporary
+1 PRIMARY t1 ALL NULL NULL NULL NULL 7 100.00 Using where; End temporary; Using join buffer
+3 DEPENDENT SUBQUERY t3 ALL NULL NULL NULL NULL 4 100.00 Using where
+Warnings:
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+show warnings;
+Level Code Message
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+a
+2
+2
+3
+2
+drop table t1, t2, t3;
=== modified file 'mysql-test/r/subselect_sj_jcl6.result'
--- a/mysql-test/r/subselect_sj_jcl6.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect_sj_jcl6.result 2010-02-11 23:59:58 +0000
@@ -783,6 +783,51 @@
1 PRIMARY it2 ALL NULL NULL NULL NULL 20 Using where; End temporary; Using join buffer
DROP TABLE ot1, it1, it2;
# End of BUG#38075
+#
+# BUG#31480: Incorrect result for nested subquery when executed via semi join
+#
+create table t1 (a int not null, b int not null);
+create table t2 (c int not null, d int not null);
+create table t3 (e int not null);
+insert into t1 values (1,10);
+insert into t1 values (2,10);
+insert into t1 values (1,20);
+insert into t1 values (2,20);
+insert into t1 values (3,20);
+insert into t1 values (2,30);
+insert into t1 values (4,40);
+insert into t2 values (2,10);
+insert into t2 values (2,20);
+insert into t2 values (4,10);
+insert into t2 values (5,10);
+insert into t2 values (3,20);
+insert into t2 values (2,40);
+insert into t3 values (10);
+insert into t3 values (30);
+insert into t3 values (10);
+insert into t3 values (20);
+explain extended
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 PRIMARY t2 ALL NULL NULL NULL NULL 6 100.00 Start temporary
+1 PRIMARY t1 ALL NULL NULL NULL NULL 7 100.00 Using where; End temporary; Using join buffer
+3 DEPENDENT SUBQUERY t3 ALL NULL NULL NULL NULL 4 100.00 Using where
+Warnings:
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+show warnings;
+Level Code Message
+Note 1276 Field or reference 'test.t1.b' of SELECT #3 was resolved in SELECT #1
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` semi join (`test`.`t2`) where ((`test`.`t1`.`a` = `test`.`t2`.`c`) and <nop>(<in_optimizer>(`test`.`t2`.`d`,<exists>(select 1 AS `Not_used` from `test`.`t3` where ((`test`.`t1`.`b` = `test`.`t3`.`e`) and (<cache>(`test`.`t2`.`d`) >= `test`.`t3`.`e`))))))
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+a
+2
+2
+3
+2
+drop table t1, t2, t3;
set join_cache_level=default;
show variables like 'join_cache_level';
Variable_name Value
=== modified file 'mysql-test/t/subselect_sj.test'
--- a/mysql-test/t/subselect_sj.test 2010-01-17 14:51:10 +0000
+++ b/mysql-test/t/subselect_sj.test 2010-02-11 23:59:58 +0000
@@ -681,3 +681,41 @@
DROP TABLE ot1, it1, it2;
--echo # End of BUG#38075
+
+--echo #
+--echo # BUG#31480: Incorrect result for nested subquery when executed via semi join
+--echo #
+create table t1 (a int not null, b int not null);
+create table t2 (c int not null, d int not null);
+create table t3 (e int not null);
+
+insert into t1 values (1,10);
+insert into t1 values (2,10);
+insert into t1 values (1,20);
+insert into t1 values (2,20);
+insert into t1 values (3,20);
+insert into t1 values (2,30);
+insert into t1 values (4,40);
+
+insert into t2 values (2,10);
+insert into t2 values (2,20);
+insert into t2 values (4,10);
+insert into t2 values (5,10);
+insert into t2 values (3,20);
+insert into t2 values (2,40);
+
+insert into t3 values (10);
+insert into t3 values (30);
+insert into t3 values (10);
+insert into t3 values (20);
+
+explain extended
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+show warnings;
+
+select a from t1
+where a in (select c from t2 where d >= some(select e from t3 where b=e));
+
+drop table t1, t2, t3;
+
=== modified file 'sql/item.cc'
--- a/sql/item.cc 2010-02-11 22:00:36 +0000
+++ b/sql/item.cc 2010-02-11 23:59:58 +0000
@@ -3647,7 +3647,7 @@
substitution)
*/
-static void mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
+static bool mark_as_dependent(THD *thd, SELECT_LEX *last, SELECT_LEX *current,
Item_ident *resolved_item,
Item_ident *mark_item)
{
@@ -3658,7 +3658,9 @@
/* store pointer on SELECT_LEX from which item is dependent */
if (mark_item)
mark_item->depended_from= last;
- current->mark_as_dependent(last, resolved_item);
+ if (current->mark_as_dependent(thd, last, /** resolved_item psergey-thu
+ **/mark_item))
+ return TRUE;
if (thd->lex->describe & DESCRIBE_EXTENDED)
{
push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
@@ -3668,6 +3670,7 @@
resolved_item->field_name,
current->select_number, last->select_number);
}
+ return FALSE;
}
@@ -4119,6 +4122,7 @@
((ref_type == REF_ITEM || ref_type == FIELD_ITEM) ?
(Item_ident*) (*reference) :
0));
+
/*
A reference to a view field had been found and we
substituted it instead of this Item (find_field_in_tables
@@ -4218,7 +4222,7 @@
return -1;
mark_as_dependent(thd, last_checked_context->select_lex,
- context->select_lex, this,
+ context->select_lex, rf,
rf);
return 0;
}
@@ -5998,7 +6002,7 @@
goto error;
thd->change_item_tree(reference, fld);
mark_as_dependent(thd, last_checked_context->select_lex,
- thd->lex->current_select, this, fld);
+ thd->lex->current_select, fld, fld);
/*
A reference is resolved to a nest level that's outer or the same as
the nest level of the enclosing set function : adjust the value of
@@ -6438,7 +6442,7 @@
if (depended_from == new_parent)
{
*ref= outer_ref;
- outer_ref->fix_after_pullout(new_parent, ref);
+ (*ref)->fix_after_pullout(new_parent, ref);
}
}
=== modified file 'sql/item.h'
--- a/sql/item.h 2010-02-11 22:00:36 +0000
+++ b/sql/item.h 2010-02-11 23:59:58 +0000
@@ -1115,7 +1115,9 @@
/*
- Class to be used to enumerate all field references in an item tree.
+ Class to be used to enumerate all field references in an item tree. This
+  includes references to outer selects, but not fields of the tables within a
+ subquery.
Suggested usage:
class My_enumerator : public Field_enumerator
@@ -2377,6 +2379,8 @@
}
bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
{ return (*ref)->walk(processor, walk_subquery, arg); }
+ bool enumerate_field_refs_processor(uchar *arg)
+ { return (*ref)->enumerate_field_refs_processor(arg); }
virtual void print(String *str, enum_query_type query_type);
bool result_as_longlong()
{
=== modified file 'sql/item_subselect.cc'
--- a/sql/item_subselect.cc 2010-01-28 13:48:33 +0000
+++ b/sql/item_subselect.cc 2010-02-11 23:59:58 +0000
@@ -39,8 +39,8 @@
Item_subselect::Item_subselect():
Item_result_field(), value_assigned(0), thd(0), substitution(0),
engine(0), old_engine(0), used_tables_cache(0), have_to_be_excluded(0),
- const_item_cache(1), in_fix_fields(0), engine_changed(0), changed(0),
- is_correlated(FALSE)
+ const_item_cache(1), inside_first_fix_fields(0), done_first_fix_fields(FALSE),
+ engine_changed(0), changed(0), is_correlated(FALSE)
{
with_subselect= 1;
reset();
@@ -167,18 +167,23 @@
DBUG_ASSERT(fixed == 0);
engine->set_thd((thd= thd_param));
- if (!in_fix_fields)
- refers_to.empty();
+ if (!done_first_fix_fields)
+ {
+ done_first_fix_fields= TRUE;
+ inside_first_fix_fields= TRUE;
+ }
+
eliminated= FALSE;
+ parent_select= thd_param->lex->current_select;
if (check_stack_overrun(thd, STACK_MIN_SIZE, (uchar*)&res))
return TRUE;
- in_fix_fields++;
res= engine->prepare();
// all transformation is done (used by prepared statements)
changed= 1;
+ inside_first_fix_fields= FALSE;
if (!res)
{
@@ -210,14 +215,12 @@
if (!(*ref)->fixed)
ret= (*ref)->fix_fields(thd, ref);
thd->where= save_where;
- in_fix_fields--;
return ret;
}
// Is it one field subselect?
if (engine->cols() > max_columns)
{
my_error(ER_OPERAND_COLUMNS, MYF(0), 1);
- in_fix_fields--;
return TRUE;
}
fix_length_and_dec();
@@ -234,7 +237,6 @@
fixed= 1;
err:
- in_fix_fields--;
thd->where= save_where;
return res;
}
@@ -242,11 +244,12 @@
bool Item_subselect::enumerate_field_refs_processor(uchar *arg)
{
- List_iterator<Item> it(refers_to);
- Item *item;
- while ((item= it++))
+ List_iterator<Ref_to_outside> it(upper_refs);
+ Ref_to_outside *upper;
+
+ while ((upper= it++))
{
- if (item->walk(&Item::enumerate_field_refs_processor, FALSE, arg))
+ if (upper->item->walk(&Item::enumerate_field_refs_processor, FALSE, arg))
return TRUE;
}
return FALSE;
@@ -258,6 +261,142 @@
return FALSE;
}
+
+bool Item_subselect::mark_as_dependent(THD *thd, st_select_lex *select,
+ Item *item)
+{
+ if (inside_first_fix_fields)
+ {
+ is_correlated= TRUE;
+ Ref_to_outside *upper;
+ if (!(upper= new (thd->stmt_arena->mem_root) Ref_to_outside()))
+ return TRUE;
+ upper->select= select;
+ upper->item= item;
+ if (upper_refs.push_back(upper, thd->stmt_arena->mem_root))
+ return TRUE;
+ }
+ return FALSE;
+}
+
+/*
+ Adjust attributes after our parent select has been merged into grandparent
+
+ DESCRIPTION
+    A subquery is a composite object which may be correlated, that is, it
+    may have
+ 1. references to tables of the parent select (i.e. one that has the clause
+ with the subquery predicate)
+ 2. references to tables of the grandparent select
+ 3. references to tables of further ancestors.
+
+ Before the pullout, this item indicates:
+ - #1 with table bits in used_tables()
+ - #2 and #3 with OUTER_REF_TABLE_BIT.
+
+ After parent has been merged with grandparent:
+ - references to parent and grandparent tables should be indicated with
+ table bits.
+ - references to greatgrandparent and further ancestors - with
+ OUTER_REF_TABLE_BIT.
+*/
+
+void Item_subselect::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+ recalc_used_tables(new_parent, TRUE);
+ parent_select= new_parent;
+}
+
+class Field_fixer: public Field_enumerator
+{
+public:
+ table_map used_tables; /* Collect used_tables here */
+ st_select_lex *new_parent; /* Select we're in */
+ virtual void visit_field(Field *field)
+ {
+ //for (TABLE_LIST *tbl= new_parent->leaf_tables; tbl; tbl= tbl->next_local)
+ //{
+ // if (tbl->table == field->table)
+ // {
+ used_tables|= field->table->map;
+ // return;
+ // }
+ //}
+ //used_tables |= OUTER_REF_TABLE_BIT;
+ }
+};
+
+
+/*
+ Recalculate used_tables_cache
+*/
+
+void Item_subselect::recalc_used_tables(st_select_lex *new_parent,
+ bool after_pullout)
+{
+ List_iterator<Ref_to_outside> it(upper_refs);
+ Ref_to_outside *upper;
+
+ used_tables_cache= 0;
+ while ((upper= it++))
+ {
+ bool found= FALSE;
+ /*
+ Check if
+ 1. the upper reference refers to the new immediate parent select, or
+ 2. one of the further ancestors.
+
+ We rely on the fact that the tree of selects is modified by some kind of
+ 'flattening', i.e. a process where child selects are merged into their
+ parents.
+ The merged selects are removed from the select tree but keep pointers to
+ their parents.
+ */
+ for (st_select_lex *sel= upper->select; sel; sel= sel->outer_select())
+ {
+ /*
+ If we've reached the new parent select by walking upwards from
+ reference's original select, this means that the reference is now
+ referring to the direct parent:
+ */
+ if (sel == new_parent)
+ {
+ found= TRUE;
+ /*
+          in which case we don't care about what its table_map really is,
+ in which case we don't care about what it's table_map really is,
+ because item->with_sum_func==1 will ensure correct placement of the
+ item.
+ */
+ if (upper->item)
+ {
+ // Now, iterate over fields and collect used_tables() attribute:
+ Field_fixer fixer;
+ fixer.used_tables= 0;
+ fixer.new_parent= new_parent;
+ upper->item->walk(&Item::enumerate_field_refs_processor, FALSE,
+ (uchar*)&fixer);
+ used_tables_cache |= fixer.used_tables;
+ /*
+ if (after_pullout)
+ upper->item->fix_after_pullout(new_parent, &(upper->item));
+ upper->item->update_used_tables();
+ used_tables_cache |= upper->item->used_tables();
+ */
+ }
+ }
+ }
+ if (!found)
+ used_tables_cache|= OUTER_REF_TABLE_BIT;
+ }
+ /*
+ Don't update const_tables_cache yet as we don't yet know which of the
+    parent's tables are constant. The parent will call update_used_tables()
+    after it has done const table detection, and that will be our chance to
+    update const_tables_cache.
+ */
+}
+
bool Item_subselect::walk(Item_processor processor, bool walk_subquery,
uchar *argument)
{
@@ -397,6 +536,7 @@
void Item_subselect::update_used_tables()
{
+ recalc_used_tables(parent_select, FALSE);
if (!engine->uncacheable())
{
// did all used tables become static?
@@ -1843,6 +1983,18 @@
return result || Item_subselect::fix_fields(thd_arg, ref);
}
+void Item_in_subselect::fix_after_pullout(st_select_lex *new_parent, Item **ref)
+{
+ left_expr->fix_after_pullout(new_parent, &left_expr);
+ Item_subselect::fix_after_pullout(new_parent, ref);
+}
+
+void Item_in_subselect::update_used_tables()
+{
+ Item_subselect::update_used_tables();
+ left_expr->update_used_tables();
+ used_tables_cache |= left_expr->used_tables();
+}
/**
Try to create an engine to compute the subselect via materialization,
=== modified file 'sql/item_subselect.h'
--- a/sql/item_subselect.h 2010-01-28 13:48:33 +0000
+++ b/sql/item_subselect.h 2010-02-11 23:59:58 +0000
@@ -67,14 +67,32 @@
bool have_to_be_excluded;
/* cache of constant state */
bool const_item_cache;
-
+
+ bool inside_first_fix_fields;
+ bool done_first_fix_fields;
public:
- /*
- References from inside the subquery to the select that this predicate is
- in. References to parent selects not included.
+ /* A reference from inside subquery predicate to somewhere outside of it */
+ class Ref_to_outside : public Sql_alloc
+ {
+ public:
+ st_select_lex *select; /* Select where the reference is pointing to */
+ /*
+      What is being referred to. This may be NULL when we're referring to an
+ aggregate function.
+ */
+ Item *item;
+ };
+ /*
+ References from within this subquery to somewhere outside of it (i.e. to
+ parent select, grandparent select, etc)
*/
- List<Item> refers_to;
- int in_fix_fields;
+ List<Ref_to_outside> upper_refs;
+ st_select_lex *parent_select;
+
+ /*
+ TRUE<=>Table Elimination has made it redundant to evaluate this select
+ (and so it is not part of QEP, etc)
+ */
bool eliminated;
/* changed engine indicator */
@@ -117,6 +135,9 @@
return null_value;
}
bool fix_fields(THD *thd, Item **ref);
+ bool mark_as_dependent(THD *thd, st_select_lex *select, Item *item);
+ void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+ void recalc_used_tables(st_select_lex *new_parent, bool after_pullout);
virtual bool exec();
virtual void fix_length_and_dec();
table_map used_tables() const;
@@ -396,6 +417,8 @@
bool test_limit(st_select_lex_unit *unit);
virtual void print(String *str, enum_query_type query_type);
bool fix_fields(THD *thd, Item **ref);
+ void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+ void update_used_tables();
bool setup_engine();
bool init_left_expr_cache();
bool is_expensive_processor(uchar *arg);
=== modified file 'sql/item_sum.cc'
--- a/sql/item_sum.cc 2009-10-15 21:38:29 +0000
+++ b/sql/item_sum.cc 2010-02-11 23:59:58 +0000
@@ -350,7 +350,7 @@
sl= sl->master_unit()->outer_select() )
sl->master_unit()->item->with_sum_func= 1;
}
- thd->lex->current_select->mark_as_dependent(aggr_sel, NULL);
+ thd->lex->current_select->mark_as_dependent(thd, aggr_sel, NULL);
return FALSE;
}
=== modified file 'sql/sql_lex.cc'
--- a/sql/sql_lex.cc 2010-01-28 13:48:33 +0000
+++ b/sql/sql_lex.cc 2010-02-11 23:59:58 +0000
@@ -1841,9 +1841,8 @@
'last' should be reachable from this st_select_lex_node
*/
-void st_select_lex::mark_as_dependent(st_select_lex *last, Item *dependency)
+bool st_select_lex::mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency)
{
- SELECT_LEX *next_to_last;
/*
Mark all selects from resolved to 1 before select where was
found table as depended (of select where was found table)
@@ -1867,12 +1866,15 @@
sl->uncacheable|= UNCACHEABLE_UNITED;
}
}
- next_to_last= s;
+
+ Item_subselect *subquery_expr= s->master_unit()->item;
+ if (subquery_expr && subquery_expr->mark_as_dependent(thd, last,
+ dependency))
+ return TRUE;
}
is_correlated= TRUE;
this->master_unit()->item->is_correlated= TRUE;
- if (dependency)
- next_to_last->master_unit()->item->refers_to.push_back(dependency);
+ return FALSE;
}
bool st_select_lex_node::set_braces(bool value) { return 1; }
=== modified file 'sql/sql_lex.h'
--- a/sql/sql_lex.h 2010-01-28 13:48:33 +0000
+++ b/sql/sql_lex.h 2010-02-11 23:59:58 +0000
@@ -747,7 +747,7 @@
return master_unit()->return_after_parsing();
}
- void mark_as_dependent(st_select_lex *last, Item *dependency);
+ bool mark_as_dependent(THD *thd, st_select_lex *last, Item *dependency);
bool set_braces(bool value);
bool inc_in_sum_expr();
[Maria-developers] Rev 2755: Subquery optimizations: backport: enable disabled subquery code in BKA in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2755
revision-id: psergey(a)askmonty.org-20100211223118-5fzuidow1pkubpzl
parent: psergey(a)askmonty.org-20100211220036-qh3iw743tbgwpzax
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 01:31:18 +0300
message:
Subquery optimizations: backport: enable disabled subquery code in BKA
=== modified file 'sql/sql_select.h'
--- a/sql/sql_select.h 2010-01-28 13:48:33 +0000
+++ b/sql/sql_select.h 2010-02-11 22:31:18 +0000
@@ -282,13 +282,11 @@
}
bool check_rowid_field()
{
-/* !!!NB igor: enable the code in this comment after backporting the SJ code
if (keep_current_rowid && !used_rowid_fields)
{
used_rowid_fields= 1;
used_fieldlength+= table->file->ref_length;
}
-*/
return test(used_rowid_fields);
}
bool is_inner_table_of_semi_join_with_first_match()
[Maria-developers] Rev 2754: Subquery optimizations: backport in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2754
revision-id: psergey(a)askmonty.org-20100211220036-qh3iw743tbgwpzax
parent: psergey(a)askmonty.org-20100211215932-qi36vl0i3zkl86bv
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 01:00:36 +0300
message:
Subquery optimizations: backport
- Fix valgrind failure: do initialize Item::is_expensive_cache.
=== modified file 'sql/item.cc'
--- a/sql/item.cc 2010-01-17 14:55:08 +0000
+++ b/sql/item.cc 2010-02-11 22:00:36 +0000
@@ -373,8 +373,8 @@
Item::Item():
- rsize(0), name(0), orig_name(0), name_length(0), fixed(0),
- is_autogenerated_name(TRUE),
+ is_expensive_cache(-1), rsize(0), name(0), orig_name(0), name_length(0),
+ fixed(0), is_autogenerated_name(TRUE),
collation(&my_charset_bin, DERIVATION_COERCIBLE)
{
marker= 0;
@@ -410,6 +410,7 @@
tables.
*/
Item::Item(THD *thd, Item *item):
+ is_expensive_cache(-1),
rsize(0),
str_value(item->str_value),
name(item->name),
=== modified file 'sql/item.h'
--- a/sql/item.h 2010-01-28 13:48:33 +0000
+++ b/sql/item.h 2010-02-11 22:00:36 +0000
@@ -513,6 +513,9 @@
enum traverse_order { POSTFIX, PREFIX };
+ /* Cache of the result of is_expensive(). */
+ int8 is_expensive_cache;
+
/* Reuse size, only used by SP local variable assignment, otherwize 0 */
uint rsize;
@@ -878,9 +881,6 @@
static CHARSET_INFO *default_charset();
virtual CHARSET_INFO *compare_collation() { return NULL; }
- /* Cache of the result of is_expensive(). */
- int8 is_expensive_cache;
-
virtual bool walk(Item_processor processor, bool walk_subquery, uchar *arg)
{
return (this->*processor)(arg);
[Maria-developers] Rev 2753: Subquery optimizations backport: Update test results (checked) in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2753
revision-id: psergey(a)askmonty.org-20100211215932-qi36vl0i3zkl86bv
parent: psergey(a)askmonty.org-20100211215823-63ikirl70ztmlk05
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:59:32 +0300
message:
Subquery optimizations backport: Update test results (checked)
=== modified file 'mysql-test/r/join_cache.result'
--- a/mysql-test/r/join_cache.result 2009-12-21 02:26:15 +0000
+++ b/mysql-test/r/join_cache.result 2010-02-11 21:59:32 +0000
@@ -1028,8 +1028,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1343,8 +1343,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1658,8 +1658,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -1973,8 +1973,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2292,8 +2292,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2514,8 +2514,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2736,8 +2736,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
@@ -2958,8 +2958,8 @@
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY City ALL Population NULL NULL NULL 4079 Using where
-2 DEPENDENT SUBQUERY Country unique_subquery PRIMARY,Name PRIMARY 3 func 1 Using where
+1 PRIMARY Country range PRIMARY,Name Name 52 NULL 10 Using index condition; Using MRR
+1 PRIMARY City ref Population,Country Country 3 world.Country.Code 18 Using where; Using join buffer
SELECT Name FROM City
WHERE City.Country IN (SELECT Code FROM Country WHERE Country.Name LIKE 'L%') AND
City.Population > 100000;
=== modified file 'mysql-test/r/type_datetime.result'
--- a/mysql-test/r/type_datetime.result 2009-02-13 18:07:03 +0000
+++ b/mysql-test/r/type_datetime.result 2010-02-11 21:59:32 +0000
@@ -514,10 +514,9 @@
where id in (select id from t1 as x1 where (t1.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
Warnings:
Note 1276 Field or reference 'test.t1.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select '1' AS `id`,'2007-04-25 18:30:22' AS `cur_date` from `test`.`t1` where <in_optimizer>('1',<exists>(select 1 AS `Not_used` from `test`.`t1` `x1` where 0))
+Note 1003 select '1' AS `id`,'2007-04-25 18:30:22' AS `cur_date` from `test`.`t1` `x1` join `test`.`t1` where (('2007-04-25 18:30:22' = 0))
select * from t1
where id in (select id from t1 as x1 where (t1.cur_date is null));
id cur_date
@@ -526,10 +525,9 @@
where id in (select id from t2 as x1 where (t2.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
-2 DEPENDENT SUBQUERY NULL NULL NULL NULL NULL NULL NULL NULL Impossible WHERE
Warnings:
Note 1276 Field or reference 'test.t2.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select '1' AS `id`,'2007-04-25' AS `cur_date` from `test`.`t2` where <in_optimizer>('1',<exists>(select 1 AS `Not_used` from `test`.`t2` `x1` where 0))
+Note 1003 select '1' AS `id`,'2007-04-25' AS `cur_date` from `test`.`t2` `x1` join `test`.`t2` where (('2007-04-25' = 0))
select * from t2
where id in (select id from t2 as x1 where (t2.cur_date is null));
id cur_date
@@ -540,10 +538,10 @@
where id in (select id from t1 as x1 where (t1.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t1 ALL NULL NULL NULL NULL 2 100.00 Using where
-2 DEPENDENT SUBQUERY x1 ALL NULL NULL NULL NULL 2 100.00 Using where
+1 PRIMARY x1 ALL NULL NULL NULL NULL 2 100.00 Using where; FirstMatch(t1)
Warnings:
Note 1276 Field or reference 'test.t1.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t1`.`id` AS `id`,`test`.`t1`.`cur_date` AS `cur_date` from `test`.`t1` where <in_optimizer>(`test`.`t1`.`id`,<exists>(select 1 AS `Not_used` from `test`.`t1` `x1` where ((`test`.`t1`.`cur_date` = 0) and (<cache>(`test`.`t1`.`id`) = `test`.`x1`.`id`))))
+Note 1003 select `test`.`t1`.`id` AS `id`,`test`.`t1`.`cur_date` AS `cur_date` from `test`.`t1` semi join (`test`.`t1` `x1`) where ((`test`.`x1`.`id` = `test`.`t1`.`id`) and (`test`.`t1`.`cur_date` = 0))
select * from t1
where id in (select id from t1 as x1 where (t1.cur_date is null));
id cur_date
@@ -552,10 +550,10 @@
where id in (select id from t2 as x1 where (t2.cur_date is null));
id select_type table type possible_keys key key_len ref rows filtered Extra
1 PRIMARY t2 ALL NULL NULL NULL NULL 2 100.00 Using where
-2 DEPENDENT SUBQUERY x1 ALL NULL NULL NULL NULL 2 100.00 Using where
+1 PRIMARY x1 ALL NULL NULL NULL NULL 2 100.00 Using where; FirstMatch(t2)
Warnings:
Note 1276 Field or reference 'test.t2.cur_date' of SELECT #2 was resolved in SELECT #1
-Note 1003 select `test`.`t2`.`id` AS `id`,`test`.`t2`.`cur_date` AS `cur_date` from `test`.`t2` where <in_optimizer>(`test`.`t2`.`id`,<exists>(select 1 AS `Not_used` from `test`.`t2` `x1` where ((`test`.`t2`.`cur_date` = 0) and (<cache>(`test`.`t2`.`id`) = `test`.`x1`.`id`))))
+Note 1003 select `test`.`t2`.`id` AS `id`,`test`.`t2`.`cur_date` AS `cur_date` from `test`.`t2` semi join (`test`.`t2` `x1`) where ((`test`.`x1`.`id` = `test`.`t2`.`id`) and (`test`.`t2`.`cur_date` = 0))
select * from t2
where id in (select id from t2 as x1 where (t2.cur_date is null));
id cur_date
[Maria-developers] Rev 2752: Apply Jorgen Loland's fix: Bug#45221: Query "SELECT pk FROM C WHERE pk IN (SELECT int_key)" failing in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2752
revision-id: psergey(a)askmonty.org-20100211215823-63ikirl70ztmlk05
parent: psergey(a)askmonty.org-20100211215602-irdyu314ddwew1xd
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:58:23 +0300
message:
Apply Jorgen Loland's fix: Bug#45221: Query "SELECT pk FROM C WHERE pk IN (SELECT int_key)" failing
XOR conditions are not optimized, so Item_cond_xor
acts as a FUNC_ITEM even though it inherits from Item_cond.
A subtle difference between Item_func and Item_cond is that
you get the children Items from the former by calling
arguments(), and from the latter by calling argument_list().
However, since Item_cond_xor inherits from Item_cond,
arguments() did not return any Items.
The fact that Item_cond_xor::arguments() did not return its
child Items led to a problem for make_cond_for_index():
the method allowed XOR items on unindexed columns to be
pushed down using ICP. ICP evaluation of non-indexed columns
does not (and should not) work.
The fix for this bug is to make Item_cond_xor return its
child Items when the arguments() method is used. This makes
Item_cond_xor behave more like Item_func and in turn allows
make_cond_for_index() to discover any conflicting child
Items.
This is a temporary fix and should be removed once Item_cond_xor
is optimized.
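[Editorial note] To make the arguments()/argument_list() asymmetry described above concrete, here is a minimal, self-contained C++ sketch. It is not the real server class hierarchy; the class and member names only echo the ones in the patch, and the layout of the helpers is an assumption made purely for illustration.

    // Simplified mock (not the actual MySQL classes) of why a condition that
    // reports itself as a function item must also expose its children via the
    // function-style accessor.
    #include <cassert>
    #include <list>

    struct Item { virtual ~Item() = default; };

    // "Function"-style items keep children in an array reachable via arguments().
    struct Item_func : Item {
      Item *tmp_arg[2];
      Item **args= nullptr;
      unsigned arg_count= 0;
      Item **arguments() { return args; }
    };

    // "Condition"-style items keep children in a list reachable via argument_list().
    struct Item_cond : Item_func {
      std::list<Item*> children;
      Item_cond(Item *i1, Item *i2) { children.push_back(i1); children.push_back(i2); }
      std::list<Item*> *argument_list() { return &children; }
    };

    // XOR is parsed as a condition but reports itself as a function item, so
    // callers such as make_cond_for_index() walk arguments(); without the
    // mirroring below they would see an empty args[] -- the situation the patch fixes.
    struct Item_cond_xor : Item_cond {
      Item_cond_xor(Item *i1, Item *i2) : Item_cond(i1, i2)
      {
        arg_count= 2;
        args= tmp_arg;      // store the children in args[] as well
        args[0]= i1;
        args[1]= i2;
      }
    };

    int main()
    {
      Item a, b;
      Item_cond_xor x(&a, &b);
      assert(x.argument_list()->size() == 2);                   // cond-style traversal
      assert(x.arguments()[0] == &a && x.arguments()[1] == &b); // func-style traversal now works too
      return 0;
    }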
=== modified file 'mysql-test/r/group_by.result'
--- a/mysql-test/r/group_by.result 2009-02-26 17:17:06 +0000
+++ b/mysql-test/r/group_by.result 2010-02-11 21:58:23 +0000
@@ -1542,8 +1542,8 @@
EXPLAIN SELECT 1 FROM t1 WHERE a IN
(SELECT a FROM t1 USE INDEX (i2) IGNORE INDEX (i2));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t1 index NULL PRIMARY 4 NULL 144 Using where; Using index
-2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 144 Using where
+1 PRIMARY t1 index PRIMARY,i2 PRIMARY 4 NULL 144 Using index
+1 PRIMARY t1 ALL NULL NULL NULL NULL 144 Using where; FirstMatch(t1)
CREATE TABLE t2 (a INT, b INT, KEY(a));
INSERT INTO t2 VALUES (1, 1), (2, 2), (3,3), (4,4);
EXPLAIN SELECT a, SUM(b) FROM t2 GROUP BY a LIMIT 2;
@@ -1555,8 +1555,8 @@
EXPLAIN SELECT 1 FROM t2 WHERE a IN
(SELECT a FROM t1 USE INDEX (i2) IGNORE INDEX (i2));
id select_type table type possible_keys key key_len ref rows Extra
-1 PRIMARY t2 index NULL a 5 NULL 4 Using where; Using index
-2 DEPENDENT SUBQUERY t1 ALL NULL NULL NULL NULL 144 Using where
+1 PRIMARY t2 index a a 5 NULL 4 Using index
+1 PRIMARY t1 ALL NULL NULL NULL NULL 144 Using where; FirstMatch(t2)
SHOW VARIABLES LIKE 'old';
Variable_name Value
old OFF
=== modified file 'sql/item_cmpfunc.h'
--- a/sql/item_cmpfunc.h 2010-01-17 14:55:08 +0000
+++ b/sql/item_cmpfunc.h 2010-02-11 21:58:23 +0000
@@ -1715,14 +1715,34 @@
class Item_cond_xor :public Item_cond
{
public:
- Item_cond_xor() :Item_cond() {}
- Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2) {}
+ Item_cond_xor(Item *i1,Item *i2) :Item_cond(i1,i2)
+ {
+ /*
+ Items must be stored in args[] as well because this Item_cond is
+ treated as a FUNC_ITEM (see type()). I.e., users of it will get
+ it's children by calling arguments(), not argument_list(). This
+ is a temporary solution until XOR is optimized and treated like
+ a full Item_cond citizen.
+ */
+ arg_count= 2;
+ args= tmp_arg;
+ args[0]= i1;
+ args[1]= i2;
+ }
enum Functype functype() const { return COND_XOR_FUNC; }
/* TODO: remove the next line when implementing XOR optimization */
enum Type type() const { return FUNC_ITEM; }
longlong val_int();
const char *func_name() const { return "xor"; }
void top_level_item() {}
+ /* Since child Items are stored in args[], Items cannot be added.
+ However, since Item_cond_xor is treated as a FUNC_ITEM (see
+ type()), the methods below should never be called.
+ */
+ bool add(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+ bool add_at_head(Item *item) { DBUG_ASSERT(FALSE); return FALSE; }
+ bool add_at_head(List<Item> *nlist) { DBUG_ASSERT(FALSE); return FALSE; }
+ void copy_andor_arguments(THD *thd, Item_cond *item) { DBUG_ASSERT(FALSE); }
};
[Maria-developers] Rev 2751: Subquery backport: Update test results (checked) in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2751
revision-id: psergey(a)askmonty.org-20100211215602-irdyu314ddwew1xd
parent: psergey(a)askmonty.org-20100211215456-u85owf67gwqkkss5
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:56:02 +0300
message:
Subquery backport: Update test results (checked)
=== modified file 'mysql-test/r/explain.result'
--- a/mysql-test/r/explain.result 2009-12-15 07:16:46 +0000
+++ b/mysql-test/r/explain.result 2010-02-11 21:56:02 +0000
@@ -171,7 +171,7 @@
EXPLAIN SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN (SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.dt IS NULL );
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY OUTR ALL NULL NULL NULL NULL 2 Using where
-2 DEPENDENT SUBQUERY INNR ALL NULL NULL NULL NULL 2 Using where
+1 PRIMARY INNR ALL NULL NULL NULL NULL 2 Using where; FirstMatch(OUTR)
flush tables;
SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN (SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.dt IS NULL );
dt
@@ -179,7 +179,7 @@
EXPLAIN SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN ( SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.t < '2005-11-13 7:41:31' );
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY OUTR ALL NULL NULL NULL NULL 2 Using where
-2 DEPENDENT SUBQUERY INNR ALL NULL NULL NULL NULL 2 Using where
+1 PRIMARY INNR ALL NULL NULL NULL NULL 2 Using where; FirstMatch(OUTR)
flush tables;
SELECT OUTR.dt FROM t1 AS OUTR WHERE OUTR.dt IN ( SELECT INNR.dt FROM t2 AS INNR WHERE OUTR.t < '2005-11-13 7:41:31' );
dt
=== modified file 'mysql-test/r/group_min_max.result'
--- a/mysql-test/r/group_min_max.result 2009-08-30 07:03:37 +0000
+++ b/mysql-test/r/group_min_max.result 2010-02-11 21:56:02 +0000
@@ -2256,7 +2256,7 @@
a IN (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
1 PRIMARY t1_outer index NULL a 10 NULL 15 Using where; Using index
-2 DEPENDENT SUBQUERY t1 index NULL a 10 NULL 1 Using index
+2 SUBQUERY t1 range NULL a 5 NULL 8 Using index for group-by
EXPLAIN SELECT 1 FROM t1 AS t1_outer GROUP BY a HAVING
a > (SELECT max(b) FROM t1 GROUP BY a HAVING a < 2);
id select_type table type possible_keys key key_len ref rows Extra
=== modified file 'mysql-test/r/subselect3_jcl6.result'
--- a/mysql-test/r/subselect3_jcl6.result 2010-01-17 14:51:10 +0000
+++ b/mysql-test/r/subselect3_jcl6.result 2010-02-11 21:56:02 +0000
@@ -1140,7 +1140,7 @@
flush status;
select count(*) from t0 A, t0 B, t0 C, t0 D where D.a in (select a from t1 E);
count(*)
-4999
+5000
show status like 'Created_tmp_disk_tables';
Variable_name Value
Created_tmp_disk_tables 1
[Maria-developers] Rev 2750: Subquery optimization backport: Duplicate Elimination: in file:///home/psergey/dev/maria-5.3-subqueries-r6/
by Sergey Petrunya 11 Feb '10
At file:///home/psergey/dev/maria-5.3-subqueries-r6/
------------------------------------------------------------
revno: 2750
revision-id: psergey(a)askmonty.org-20100211215456-u85owf67gwqkkss5
parent: psergey(a)askmonty.org-20100128134833-9000udjp5wa3tsff
committer: Sergey Petrunya <psergey(a)askmonty.org>
branch nick: maria-5.3-subqueries-r6
timestamp: Fri 2010-02-12 00:54:56 +0300
message:
Subquery optimization backport: Duplicate Elimination:
handle temporary table overflow correctly.
=== modified file 'sql/sql_select.cc'
--- a/sql/sql_select.cc 2010-01-28 13:48:33 +0000
+++ b/sql/sql_select.cc 2010-02-11 21:54:56 +0000
@@ -16250,12 +16250,12 @@
if (error)
{
/* create_internal_tmp_table_from_heap will generate error if needed */
- if (sjtbl->tmp_table->file->is_fatal_error(error, HA_CHECK_DUP) &&
- create_internal_tmp_table_from_heap(thd, sjtbl->tmp_table,
+ if (!sjtbl->tmp_table->file->is_fatal_error(error, HA_CHECK_DUP))
+ DBUG_RETURN(1); /* Duplicate */
+ if (create_internal_tmp_table_from_heap(thd, sjtbl->tmp_table,
sjtbl->start_recinfo,
&sjtbl->recinfo, error, 1))
DBUG_RETURN(-1);
- DBUG_RETURN(1);
}
DBUG_RETURN(0);
}
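[Editorial note] After this patch, a write into the duplicate-weedout temporary table is triaged as follows: a non-fatal duplicate-key error means the row combination was already seen (skip it), while a fatal error (typically the in-memory HEAP table filling up) triggers conversion to an on-disk table and the row is kept. The sketch below illustrates that decision logic only; WeedoutTable, handle_weedout_write and the numeric error codes are hypothetical stand-ins, not the real server API.

    #include <cassert>

    enum class WeedoutResult { kKept, kDuplicate, kError };

    struct WeedoutTable {
      bool in_memory= true;               // starts out as an in-memory (HEAP) table
    };

    // Stubbed engine outcome for the sketch: 0 = row stored, 1 = duplicate key
    // (non-fatal), 2 = table full (fatal for the in-memory engine).
    WeedoutResult handle_weedout_write(WeedoutTable &tbl, int engine_error)
    {
      if (engine_error == 0)
        return WeedoutResult::kKept;      // first occurrence: keep the row
      if (engine_error == 1)
        return WeedoutResult::kDuplicate; // non-fatal duplicate key: skip the row
      /*
        Fatal error (e.g. HEAP table overflow): convert the temporary table to
        an on-disk one and keep the row instead of treating it as a duplicate,
        which is what the pre-patch code effectively did.
      */
      tbl.in_memory= false;               // stands in for create_internal_tmp_table_from_heap()
      return WeedoutResult::kKept;
    }

    int main()
    {
      WeedoutTable tbl;
      assert(handle_weedout_write(tbl, 1) == WeedoutResult::kDuplicate);
      assert(handle_weedout_write(tbl, 2) == WeedoutResult::kKept);
      assert(!tbl.in_memory);             // the overflow forced a spill to disk
      return 0;
    }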