[Commits] 03edf2e: Undo the incorrect part of commit 7e700bd2a81ae4b37145f1c32bb0902c72856d2d
by psergey@askmonty.org 07 May '18
by psergey@askmonty.org 07 May '18
07 May '18
revision-id: 03edf2ed04dbffe8c413fe0dd2715684e1627371
parent(s): e3661b9f7c60aa471aaa79e597723e897caf320c
committer: Sergei Petrunia
branch nick: 10.2-r10
timestamp: 2018-05-07 20:33:14 +0300
message:
Undo the incorrect part of commit 7e700bd2a81ae4b37145f1c32bb0902c72856d2d
Restore the old code in opt_range.cc
---
sql/opt_range.cc | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index bc047b2..cc57469 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -11677,8 +11677,8 @@ int QUICK_SELECT_DESC::get_next()
end_key.flag= (last_range->flag & NEAR_MAX ? HA_READ_BEFORE_KEY :
HA_READ_AFTER_KEY);
end_key.keypart_map= last_range->max_keypart_map;
- result= file->prepare_range_scan((last_range->flag & NO_MIN_RANGE) ? NULL : &end_key,
- (last_range->flag & NO_MAX_RANGE) ? NULL : &start_key);
+ result= file->prepare_range_scan((last_range->flag & NO_MIN_RANGE) ? NULL : &start_key,
+ (last_range->flag & NO_MAX_RANGE) ? NULL : &end_key);
if (result)
{
DBUG_RETURN(result);
1
0
[Commits] e3661b9: Cherry-picked from MyRocks upstream: Issue #809: Wrong query result with bloom filters
by psergey@askmonty.org 07 May '18
by psergey@askmonty.org 07 May '18
07 May '18
revision-id: e3661b9f7c60aa471aaa79e597723e897caf320c
parent(s): 39fbafbcc22cd51c1cbca8a06320394e94a9cd50
committer: Sergei Petrunia
branch nick: 10.2-r10
timestamp: 2018-05-07 20:21:35 +0300
message:
Cherry-picked from MyRocks upstream: Issue #809: Wrong query result with bloom filters
In reverse-ordered column families, if one wants to start reading at the
logical end of the index, they should Seek() to a key value that is not
covered by the index. This may (and typically does) prevent use of a bloom
filter.
The calls to setup_scan_iterator() that are made for index and table scan
didn't take this into account and passed eq_cond_len=INDEX_NUMBER_SIZE.
Fixed them to compute and pass the correct eq_cond_len.
Also, removed an incorrect assert in ha_rocksdb::setup_iterator_bounds.
---
storage/rocksdb/ha_rocksdb.cc | 9 ++--
.../mysql-test/rocksdb/r/bloomfilter5.result | 62 ++++++++++++++++++++++
.../mysql-test/rocksdb/t/bloomfilter5-master.opt | 1 +
.../rocksdb/mysql-test/rocksdb/t/bloomfilter5.test | 61 +++++++++++++++++++++
storage/rocksdb/rdb_datadic.h | 26 +++++++--
5 files changed, 149 insertions(+), 10 deletions(-)
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 6e9e7ef..e7abac7 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -8526,7 +8526,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) {
DBUG_ASSERT(key != nullptr);
const Rdb_key_def &kd = *m_key_descr_arr[active_index];
- kd.get_first_key(key, &key_size);
+ int key_start_matching_bytes = kd.get_first_key(key, &key_size);
rocksdb::Slice index_key((const char *)key, key_size);
@@ -8537,7 +8537,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) {
// Loop as long as we get a deadlock error AND we end up creating the
// snapshot here (i.e. it did not exist prior to this)
for (;;) {
- setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE);
+ setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
m_scan_it->Seek(index_key);
m_skip_scan_it_next_call = true;
@@ -9539,7 +9539,6 @@ int ha_rocksdb::update_write_row(const uchar *const old_data,
void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd,
const rocksdb::Slice &eq_cond) {
uint eq_cond_len = eq_cond.size();
- DBUG_ASSERT(eq_cond_len >= Rdb_key_def::INDEX_NUMBER_SIZE);
memcpy(m_eq_cond_upper_bound, eq_cond.data(), eq_cond_len);
kd.successor(m_eq_cond_upper_bound, eq_cond_len);
memcpy(m_eq_cond_lower_bound, eq_cond.data(), eq_cond_len);
@@ -9635,12 +9634,12 @@ void ha_rocksdb::release_scan_iterator() {
void ha_rocksdb::setup_iterator_for_rnd_scan() {
uint key_size;
- m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
+ int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
setup_scan_iterator(*m_pk_descr, &table_key, false,
- Rdb_key_def::INDEX_NUMBER_SIZE);
+ key_start_matching_bytes);
m_scan_it->Seek(table_key);
m_skip_scan_it_next_call = true;
}
diff --git a/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result
new file mode 100644
index 0000000..4f6702b
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/r/bloomfilter5.result
@@ -0,0 +1,62 @@
+#
+# Issue #809: Wrong query result with bloom filters
+#
+create table t1 (
+id1 bigint not null,
+id2 bigint not null,
+id3 varchar(100) not null,
+id4 int not null,
+id5 int not null,
+value bigint,
+value2 varchar(100),
+primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1'
+) engine=ROCKSDB;
+create table t2(a int);
+insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t3(seq int);
+insert into t3
+select
+1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000
+from t2 A, t2 B, t2 C, t2 D;
+insert t1
+select
+(seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc"
+from t3;
+set global rocksdb_force_flush_memtable_now=1;
+# Full table scan
+explain
+select * from t1 limit 10;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000
+select * from t1 limit 10;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+1000 2000 2000 9999 9999 1000 aaabbbccc
+1000 2000 2000 9998 9998 1000 aaabbbccc
+1000 2000 2000 9997 9997 1000 aaabbbccc
+1000 2000 2000 9996 9996 1000 aaabbbccc
+1000 1999 1999 9995 9995 1000 aaabbbccc
+1000 1999 1999 9994 9994 1000 aaabbbccc
+1000 1999 1999 9993 9993 1000 aaabbbccc
+1000 1999 1999 9992 9992 1000 aaabbbccc
+1000 1999 1999 9991 9991 1000 aaabbbccc
+# An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 122 NULL 1
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+create table t4 (
+pk int unsigned not null primary key,
+kp1 int unsigned not null,
+kp2 int unsigned not null,
+col1 int unsigned,
+key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+# This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+pk kp1 kp2 col1
+drop table t1,t2,t3,t4;
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5-master.opt b/storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5-master.opt
new file mode 100644
index 0000000..7d63dc7
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5-master.opt
@@ -0,0 +1 @@
+--rocksdb_override_cf_options=rev:bf5_1={prefix_extractor=capped:4;block_based_table_factory={filter_policy=bloomfilter:10:false;whole_key_filtering=0;}};
diff --git a/storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5.test b/storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5.test
new file mode 100644
index 0000000..00968aeb
--- /dev/null
+++ b/storage/rocksdb/mysql-test/rocksdb/t/bloomfilter5.test
@@ -0,0 +1,61 @@
+
+--echo #
+--echo # Issue #809: Wrong query result with bloom filters
+--echo #
+
+create table t1 (
+ id1 bigint not null,
+ id2 bigint not null,
+ id3 varchar(100) not null,
+ id4 int not null,
+ id5 int not null,
+ value bigint,
+ value2 varchar(100),
+ primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1'
+) engine=ROCKSDB;
+
+
+create table t2(a int);
+insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t3(seq int);
+insert into t3
+select
+ 1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000
+from t2 A, t2 B, t2 C, t2 D;
+
+insert t1
+select
+ (seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc"
+from t3;
+
+set global rocksdb_force_flush_memtable_now=1;
+
+--echo # Full table scan
+explain
+select * from t1 limit 10;
+select * from t1 limit 10;
+
+--echo # An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+
+# A testcase for an assertion that the fix is removing
+# The only requirement for the used column family is that it is reverse-ordered
+create table t4 (
+ pk int unsigned not null primary key,
+ kp1 int unsigned not null,
+ kp2 int unsigned not null,
+ col1 int unsigned,
+ key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+
+--echo # This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+
+drop table t1,t2,t3,t4;
+
+
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 326570c..f97c0d0 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -238,12 +238,28 @@ class Rdb_key_def {
*size = INDEX_NUMBER_SIZE;
}
- /* Get the first key that you need to position at to start iterating.
- Returns a "supremum" or "infimum" for this index based on collation order
+ /*
+ Get the first key that you need to position at to start iterating.
+
+ Stores into *key a "supremum" or "infimum" key value for the index.
+
+ @return Number of bytes in the key that are usable for bloom filter use.
*/
- inline void get_first_key(uchar *const key, uint *const size) const {
- return m_is_reverse_cf ? get_supremum_key(key, size)
- : get_infimum_key(key, size);
+ inline int get_first_key(uchar *const key, uint *const size) const {
+ if (m_is_reverse_cf)
+ get_supremum_key(key, size);
+ else
+ get_infimum_key(key, size);
+
+ /* Find out how many bytes of infimum are the same as m_index_number */
+ uchar unmodified_key[INDEX_NUMBER_SIZE];
+ rdb_netbuf_store_index(unmodified_key, m_index_number);
+ int i;
+ for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
+ if (key[i] != unmodified_key[i])
+ break;
+ }
+ return i;
}
/* Make a key that is right after the given key. */
1
0
[Commits] 85e6bba1d6f: MDEV-15347: Valgrind or ASAN errors in mysql_make_view on query from information_schema
by Oleksandr Byelkin 07 May '18
by Oleksandr Byelkin 07 May '18
07 May '18
revision-id: 85e6bba1d6fb5cb2487c760d9e73812d4fb17f32 (mariadb-10.1.32-81-g85e6bba1d6f)
parent(s): 0db66ab18ffef6d8920e2e6ff66e99516a458a4d
author: Oleksandr Byelkin
committer: Oleksandr Byelkin
timestamp: 2018-05-07 19:14:02 +0200
message:
MDEV-15347: Valgrind or ASAN errors in mysql_make_view on query from information_schema
Make each lex point to the statement lex instead of a global pointer in THD (no
need to store and restore the global pointer and put it on the SP stack).
---
mysql-test/r/sp.result | 11 +++++++++++
mysql-test/t/sp.test | 15 +++++++++++++++
sql/sp_head.cc | 10 ++++------
sql/sql_class.cc | 2 +-
sql/sql_class.h | 15 ---------------
sql/sql_lex.cc | 5 +++--
sql/sql_lex.h | 17 ++++++++++++++++-
sql/sql_parse.cc | 7 ++++---
sql/sql_prepare.cc | 2 +-
sql/sql_trigger.cc | 6 ++----
sql/sql_view.cc | 4 +++-
11 files changed, 60 insertions(+), 34 deletions(-)
diff --git a/mysql-test/r/sp.result b/mysql-test/r/sp.result
index ad5bddda035..bc33c08d9d8 100644
--- a/mysql-test/r/sp.result
+++ b/mysql-test/r/sp.result
@@ -8242,4 +8242,15 @@ DROP PROCEDURE proc_13;
DROP PROCEDURE proc_select;
DROP TABLE t1, t2;
SET max_sp_recursion_depth=default;
+#
+# MDEV-15347: Valgrind or ASAN errors in mysql_make_view on query
+# from information_schema
+#
+CREATE VIEW v AS SELECT 1;
+CREATE FUNCTION f() RETURNS INT RETURN 1;
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS
+UNION
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS;
+DROP FUNCTION f;
+DROP VIEW v;
#End of 10.1 tests
diff --git a/mysql-test/t/sp.test b/mysql-test/t/sp.test
index eeabb0486ca..467d3b5a7d4 100644
--- a/mysql-test/t/sp.test
+++ b/mysql-test/t/sp.test
@@ -9754,4 +9754,19 @@ DROP TABLE t1, t2;
SET max_sp_recursion_depth=default;
+--echo #
+--echo # MDEV-15347: Valgrind or ASAN errors in mysql_make_view on query
+--echo # from information_schema
+--echo #
+
+CREATE VIEW v AS SELECT 1;
+CREATE FUNCTION f() RETURNS INT RETURN 1;
+--disable_result_log
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS
+UNION
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS;
+--enable_result_log
+DROP FUNCTION f;
+DROP VIEW v;
+
--echo #End of 10.1 tests
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index 8bf78d97670..0d24ed04eae 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -840,7 +840,7 @@ sp_head::~sp_head()
thd->lex->sphead= NULL;
lex_end(thd->lex);
delete thd->lex;
- thd->lex= thd->stmt_lex= lex;
+ thd->lex= lex;
}
my_hash_free(&m_sptabs);
@@ -1121,7 +1121,7 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
backup_arena;
query_id_t old_query_id;
TABLE *old_derived_tables;
- LEX *old_lex, *old_stmt_lex;
+ LEX *old_lex;
Item_change_list old_change_list;
String old_packet;
uint old_server_status;
@@ -1224,7 +1224,6 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
do it in each instruction
*/
old_lex= thd->lex;
- old_stmt_lex= thd->stmt_lex;
/*
We should also save Item tree change list to avoid rollback something
too early in the calling query.
@@ -1372,7 +1371,6 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
DBUG_ASSERT(thd->change_list.is_empty());
old_change_list.move_elements_to(&thd->change_list);
thd->lex= old_lex;
- thd->stmt_lex= old_stmt_lex;
thd->set_query_id(old_query_id);
DBUG_ASSERT(!thd->derived_tables);
thd->derived_tables= old_derived_tables;
@@ -2207,7 +2205,7 @@ sp_head::reset_lex(THD *thd)
if (sublex == 0)
DBUG_RETURN(TRUE);
- thd->lex= thd->stmt_lex= sublex;
+ thd->lex= sublex;
(void)m_lex.push_front(oldlex);
/* Reset most stuff. */
@@ -2953,7 +2951,7 @@ sp_lex_keeper::reset_lex_and_exec_core(THD *thd, uint *nextp,
We should not save old value since it is saved/restored in
sp_head::execute() when we are entering/leaving routine.
*/
- thd->lex= thd->stmt_lex= m_lex;
+ thd->lex= m_lex;
thd->set_query_id(next_query_id());
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 2ab1cd3a61a..24140246b96 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -3654,7 +3654,7 @@ void Statement::set_statement(Statement *stmt)
{
id= stmt->id;
mark_used_columns= stmt->mark_used_columns;
- stmt_lex= lex= stmt->lex;
+ lex= stmt->lex;
query_string= stmt->query_string;
}
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 341f2e571d8..ca6155ec93f 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1027,21 +1027,6 @@ class Statement: public ilink, public Query_arena
LEX_STRING name; /* name for named prepared statements */
LEX *lex; // parse tree descriptor
- /*
- LEX which represents current statement (conventional, SP or PS)
-
- For example during view parsing THD::lex will point to the views LEX and
- THD::stmt_lex will point to LEX of the statement where the view will be
- included
-
- Currently it is used to have always correct select numbering inside
- statement (LEX::current_select_number) without storing and restoring a
- global counter which was THD::select_number.
-
- TODO: make some unified statement representation (now SP has different)
- to store such data like LEX::current_select_number.
- */
- LEX *stmt_lex;
/*
Points to the query associated with this statement. It's const, but
we need to declare it char * because all table handlers are written
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 3fa5ec71aeb..085ad1a4b3b 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -657,10 +657,11 @@ void lex_start(THD *thd)
{
LEX *lex= thd->lex;
DBUG_ENTER("lex_start");
- DBUG_PRINT("info", ("Lex %p stmt_lex: %p", thd->lex, thd->stmt_lex));
+ DBUG_PRINT("info", ("Lex %p", thd->lex));
lex->thd= lex->unit.thd= thd;
-
+
+ lex->stmt_lex= lex; // default, should be rewritten for VIEWs And CTEs
DBUG_ASSERT(!lex->explain);
lex->context_stack.empty();
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 4fcd090e1f5..3b47b1d25c9 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -730,7 +730,7 @@ class st_select_lex: public st_select_lex_node
/*
Point to the LEX in which it was created, used in view subquery detection.
- TODO: make also st_select_lex::parent_stmt_lex (see THD::stmt_lex)
+ TODO: make also st_select_lex::parent_stmt_lex (see LEX::stmt_lex)
and use st_select_lex::parent_lex & st_select_lex::parent_stmt_lex
instead of global (from THD) references where it is possible.
*/
@@ -2435,6 +2435,21 @@ struct LEX: public Query_tables_list
// type information
char *length,*dec;
CHARSET_INFO *charset;
+ /*
+ LEX which represents current statement (conventional, SP or PS)
+
+ For example during view parsing THD::lex will point to the views LEX and
+ lex::stmt_lex will point to LEX of the statement where the view will be
+ included
+
+ Currently it is used to have always correct select numbering inside
+ statement (LEX::current_select_number) without storing and restoring a
+ global counter which was THD::select_number.
+
+ TODO: make some unified statement representation (now SP has different)
+ to store such data like LEX::current_select_number.
+ */
+ LEX *stmt_lex;
LEX_STRING name;
char *help_arg;
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 3d40f8e4245..10038ad8711 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -6927,8 +6927,9 @@ void THD::reset_for_next_command(bool do_clear_error)
We also assign thd->stmt_lex in lex_start(), but during bootstrap this
code is executed first.
*/
- thd->stmt_lex= &main_lex; thd->stmt_lex->current_select_number= 1;
- DBUG_PRINT("info", ("Lex %p stmt_lex: %p", thd->lex, thd->stmt_lex));
+ DBUG_ASSERT(lex == &main_lex);
+ main_lex.stmt_lex= &main_lex; main_lex.current_select_number= 1;
+ DBUG_PRINT("info", ("Lex and stmt_lex: %p", &main_lex));
/*
Those two lines below are theoretically unneeded as
THD::cleanup_after_query() should take care of this already.
@@ -7046,7 +7047,7 @@ mysql_new_select(LEX *lex, bool move_down)
if (!(select_lex= new (thd->mem_root) SELECT_LEX()))
DBUG_RETURN(1);
- select_lex->select_number= ++thd->stmt_lex->current_select_number;
+ select_lex->select_number= ++thd->lex->stmt_lex->current_select_number;
select_lex->parent_lex= lex; /* Used in init_query. */
select_lex->init_query();
select_lex->init_select();
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index 6e14ddc2afb..d39ed6aa637 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -3635,7 +3635,7 @@ bool Prepared_statement::prepare(const char *packet, uint packet_len)
if (! (lex= new (mem_root) st_lex_local))
DBUG_RETURN(TRUE);
- stmt_lex= lex;
+ lex->stmt_lex= lex;
if (set_db(thd->db, thd->db_length))
DBUG_RETURN(TRUE);
diff --git a/sql/sql_trigger.cc b/sql/sql_trigger.cc
index e14d19c7369..293a4c17156 100644
--- a/sql/sql_trigger.cc
+++ b/sql/sql_trigger.cc
@@ -1374,13 +1374,13 @@ bool Table_triggers_list::check_n_load(THD *thd, const char *db,
List_iterator_fast<LEX_STRING> it_client_cs_name(triggers->client_cs_names);
List_iterator_fast<LEX_STRING> it_connection_cl_name(triggers->connection_cl_names);
List_iterator_fast<LEX_STRING> it_db_cl_name(triggers->db_cl_names);
- LEX *old_lex= thd->lex, *old_stmt_lex= thd->stmt_lex;
+ LEX *old_lex= thd->lex;
LEX lex;
sp_rcontext *save_spcont= thd->spcont;
ulonglong save_sql_mode= thd->variables.sql_mode;
LEX_STRING *on_table_name;
- thd->lex= thd->stmt_lex= &lex;
+ thd->lex= &lex;
save_db.str= thd->db;
save_db.length= thd->db_length;
@@ -1579,7 +1579,6 @@ bool Table_triggers_list::check_n_load(THD *thd, const char *db,
}
thd->reset_db(save_db.str, save_db.length);
thd->lex= old_lex;
- thd->stmt_lex= old_stmt_lex;
thd->spcont= save_spcont;
thd->variables.sql_mode= save_sql_mode;
@@ -1592,7 +1591,6 @@ bool Table_triggers_list::check_n_load(THD *thd, const char *db,
// QQ: anything else ?
lex_end(&lex);
thd->lex= old_lex;
- thd->stmt_lex= old_stmt_lex;
thd->spcont= save_spcont;
thd->variables.sql_mode= save_sql_mode;
thd->reset_db(save_db.str, save_db.length);
diff --git a/sql/sql_view.cc b/sql/sql_view.cc
index 1bdc76a66ea..6d7a8e1cc9d 100644
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@ -1315,6 +1315,7 @@ bool mysql_make_view(THD *thd, TABLE_SHARE *share, TABLE_LIST *table,
now Lex placed in statement memory
*/
+
table->view= lex= thd->lex= (LEX*) new(thd->mem_root) st_lex_local;
if (!table->view)
{
@@ -1340,8 +1341,9 @@ bool mysql_make_view(THD *thd, TABLE_SHARE *share, TABLE_LIST *table,
goto end;
lex_start(thd);
+ lex->stmt_lex= old_lex;
view_select= &lex->select_lex;
- view_select->select_number= ++thd->stmt_lex->current_select_number;
+ view_select->select_number= ++thd->lex->stmt_lex->current_select_number;
ulonglong saved_mode= thd->variables.sql_mode;
/* switch off modes which can prevent normal parsing of VIEW
1
0
[Commits] 6a82972: Issue #809: Wrong query result with bloom filters
by psergey@askmonty.org 07 May '18
by psergey@askmonty.org 07 May '18
07 May '18
revision-id: 6a82972dfc23405406be511161ddd71e58c30eaa
parent(s): fc7bcca40f4ce83327745e9edf3b9bf84b1516af
committer: Sergei Petrunia
branch nick: mysql-5.6-rocksdb-look800
timestamp: 2018-05-07 20:03:54 +0300
message:
Issue #809: Wrong query result with bloom filters
In reverse-ordered column families, if one wants to start reading at the
logical end of the index, they should Seek() to a key value that is not
covered by the index. This may (and typically does) prevent use of a bloom
filter.
The calls to setup_scan_iterator() that are made for index and table scan
didn't take this into account and passed eq_cond_len=INDEX_NUMBER_SIZE.
Fixed them to compute and pass the correct eq_cond_len.
Also, removed an incorrect assert in ha_rocksdb::setup_iterator_bounds.
---
mysql-test/suite/rocksdb/r/bloomfilter5.result | 62 ++++++++++++++++++++++
mysql-test/suite/rocksdb/t/bloomfilter5-master.opt | 3 ++
mysql-test/suite/rocksdb/t/bloomfilter5.test | 61 +++++++++++++++++++++
storage/rocksdb/ha_rocksdb.cc | 9 ++--
storage/rocksdb/rdb_datadic.h | 26 +++++++--
5 files changed, 151 insertions(+), 10 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/bloomfilter5.result b/mysql-test/suite/rocksdb/r/bloomfilter5.result
new file mode 100644
index 0000000..058d360
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/bloomfilter5.result
@@ -0,0 +1,62 @@
+#
+# Issue #809: Wrong query result with bloom filters
+#
+create table t1 (
+id1 bigint not null,
+id2 bigint not null,
+id3 varchar(100) not null,
+id4 int not null,
+id5 int not null,
+value bigint,
+value2 varchar(100),
+primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1'
+) engine=ROCKSDB;
+create table t2(a int);
+insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t3(seq int);
+insert into t3
+select
+1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000
+from t2 A, t2 B, t2 C, t2 D;
+insert t1
+select
+(seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc"
+from t3;
+set global rocksdb_force_flush_memtable_now=1;
+# Full table scan
+explain
+select * from t1 limit 10;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 NULL
+select * from t1 limit 10;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+1000 2000 2000 9999 9999 1000 aaabbbccc
+1000 2000 2000 9998 9998 1000 aaabbbccc
+1000 2000 2000 9997 9997 1000 aaabbbccc
+1000 2000 2000 9996 9996 1000 aaabbbccc
+1000 1999 1999 9995 9995 1000 aaabbbccc
+1000 1999 1999 9994 9994 1000 aaabbbccc
+1000 1999 1999 9993 9993 1000 aaabbbccc
+1000 1999 1999 9992 9992 1000 aaabbbccc
+1000 1999 1999 9991 9991 1000 aaabbbccc
+# An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 122 NULL 1 NULL
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+create table t4 (
+pk int unsigned not null primary key,
+kp1 int unsigned not null,
+kp2 int unsigned not null,
+col1 int unsigned,
+key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+# This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+pk kp1 kp2 col1
+drop table t1,t2,t3,t4;
diff --git a/mysql-test/suite/rocksdb/t/bloomfilter5-master.opt b/mysql-test/suite/rocksdb/t/bloomfilter5-master.opt
new file mode 100644
index 0000000..efcd69b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/bloomfilter5-master.opt
@@ -0,0 +1,3 @@
+--rocksdb_default_cf_options=write_buffer_size=256k;block_based_table_factory={filter_policy=bloomfilter:10:false;whole_key_filtering=0;}
+--rocksdb_override_cf_options=rev:bf5_1={prefix_extractor=capped:4};
+
diff --git a/mysql-test/suite/rocksdb/t/bloomfilter5.test b/mysql-test/suite/rocksdb/t/bloomfilter5.test
new file mode 100644
index 0000000..00968ae
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/bloomfilter5.test
@@ -0,0 +1,61 @@
+
+--echo #
+--echo # Issue #809: Wrong query result with bloom filters
+--echo #
+
+create table t1 (
+ id1 bigint not null,
+ id2 bigint not null,
+ id3 varchar(100) not null,
+ id4 int not null,
+ id5 int not null,
+ value bigint,
+ value2 varchar(100),
+ primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1'
+) engine=ROCKSDB;
+
+
+create table t2(a int);
+insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t3(seq int);
+insert into t3
+select
+ 1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000
+from t2 A, t2 B, t2 C, t2 D;
+
+insert t1
+select
+ (seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc"
+from t3;
+
+set global rocksdb_force_flush_memtable_now=1;
+
+--echo # Full table scan
+explain
+select * from t1 limit 10;
+select * from t1 limit 10;
+
+--echo # An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+
+# A testcase for an assertion that the fix is removing
+# The only requirement for the used column family is that it is reverse-ordered
+create table t4 (
+ pk int unsigned not null primary key,
+ kp1 int unsigned not null,
+ kp2 int unsigned not null,
+ col1 int unsigned,
+ key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+
+--echo # This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+
+drop table t1,t2,t3,t4;
+
+
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 455b55d..3eedf4c 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -8582,7 +8582,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) {
DBUG_ASSERT(key != nullptr);
const Rdb_key_def &kd = *m_key_descr_arr[active_index];
- kd.get_first_key(key, &key_size);
+ int key_start_matching_bytes = kd.get_first_key(key, &key_size);
rocksdb::Slice index_key((const char *)key, key_size);
@@ -8593,7 +8593,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) {
// Loop as long as we get a deadlock error AND we end up creating the
// snapshot here (i.e. it did not exist prior to this)
for (;;) {
- setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE);
+ setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
m_scan_it->Seek(index_key);
m_skip_scan_it_next_call = true;
@@ -9600,7 +9600,6 @@ int ha_rocksdb::update_write_row(const uchar *const old_data,
void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd,
const rocksdb::Slice &eq_cond) {
uint eq_cond_len = eq_cond.size();
- DBUG_ASSERT(eq_cond_len >= Rdb_key_def::INDEX_NUMBER_SIZE);
memcpy(m_eq_cond_upper_bound, eq_cond.data(), eq_cond_len);
kd.successor(m_eq_cond_upper_bound, eq_cond_len);
memcpy(m_eq_cond_lower_bound, eq_cond.data(), eq_cond_len);
@@ -9696,12 +9695,12 @@ void ha_rocksdb::release_scan_iterator() {
void ha_rocksdb::setup_iterator_for_rnd_scan() {
uint key_size;
- m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
+ int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
setup_scan_iterator(*m_pk_descr, &table_key, false,
- Rdb_key_def::INDEX_NUMBER_SIZE);
+ key_start_matching_bytes);
m_scan_it->Seek(table_key);
m_skip_scan_it_next_call = true;
}
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 7290be4..25a6281 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -232,12 +232,28 @@ public:
*size = INDEX_NUMBER_SIZE;
}
- /* Get the first key that you need to position at to start iterating.
- Returns a "supremum" or "infimum" for this index based on collation order
+ /*
+ Get the first key that you need to position at to start iterating.
+
+ Stores into *key a "supremum" or "infimum" key value for the index.
+
+ @return Number of bytes in the key that are usable for bloom filter use.
*/
- inline void get_first_key(uchar *const key, uint *const size) const {
- return m_is_reverse_cf ? get_supremum_key(key, size)
- : get_infimum_key(key, size);
+ inline int get_first_key(uchar *const key, uint *const size) const {
+ if (m_is_reverse_cf)
+ get_supremum_key(key, size);
+ else
+ get_infimum_key(key, size);
+
+ /* Find out how many bytes of infimum are the same as m_index_number */
+ uchar unmodified_key[INDEX_NUMBER_SIZE];
+ rdb_netbuf_store_index(unmodified_key, m_index_number);
+ int i;
+ for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
+ if (key[i] != unmodified_key[i])
+ break;
+ }
+ return i;
}
/* Make a key that is right after the given key. */
1
0
revision-id: 2f6a4e12e213e2f7afe8533b05ff879481091085 (mariadb-10.1.32-83-g2f6a4e12e21)
parent(s): 4b89d83ddfa4ee93d5c563a9a5a33b439f1c19ed
author: Oleksandr Byelkin
committer: Oleksandr Byelkin
timestamp: 2018-05-07 18:44:52 +0200
message:
fix
---
mysql-test/r/sp.result | 11 +++++++++++
mysql-test/t/sp.test | 2 +-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/mysql-test/r/sp.result b/mysql-test/r/sp.result
index ad5bddda035..bc33c08d9d8 100644
--- a/mysql-test/r/sp.result
+++ b/mysql-test/r/sp.result
@@ -8242,4 +8242,15 @@ DROP PROCEDURE proc_13;
DROP PROCEDURE proc_select;
DROP TABLE t1, t2;
SET max_sp_recursion_depth=default;
+#
+# MDEV-15347: Valgrind or ASAN errors in mysql_make_view on query
+# from information_schema
+#
+CREATE VIEW v AS SELECT 1;
+CREATE FUNCTION f() RETURNS INT RETURN 1;
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS
+UNION
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS;
+DROP FUNCTION f;
+DROP VIEW v;
#End of 10.1 tests
diff --git a/mysql-test/t/sp.test b/mysql-test/t/sp.test
index 6be6440c551..467d3b5a7d4 100644
--- a/mysql-test/t/sp.test
+++ b/mysql-test/t/sp.test
@@ -9767,6 +9767,6 @@ UNION
SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS;
--enable_result_log
DROP FUNCTION f;
-DROP TABLE v;
+DROP VIEW v;
--echo #End of 10.1 tests
1
0
revision-id: 4b89d83ddfa4ee93d5c563a9a5a33b439f1c19ed (mariadb-10.1.32-82-g4b89d83ddfa)
parent(s): 2e9fdf1d70da2c27d5a78837a306f91712781f03
author: Oleksandr Byelkin
committer: Oleksandr Byelkin
timestamp: 2018-05-07 18:02:43 +0200
message:
fix
---
sql/sp_head.cc | 2 +-
sql/sql_class.cc | 2 +-
sql/sql_lex.cc | 2 +-
sql/sql_trigger.cc | 1 -
4 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index 536f6900866..0d24ed04eae 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -2205,7 +2205,7 @@ sp_head::reset_lex(THD *thd)
if (sublex == 0)
DBUG_RETURN(TRUE);
- thd->lex= thd->stmt_lex= sublex;
+ thd->lex= sublex;
(void)m_lex.push_front(oldlex);
/* Reset most stuff. */
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 2ab1cd3a61a..24140246b96 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -3654,7 +3654,7 @@ void Statement::set_statement(Statement *stmt)
{
id= stmt->id;
mark_used_columns= stmt->mark_used_columns;
- stmt_lex= lex= stmt->lex;
+ lex= stmt->lex;
query_string= stmt->query_string;
}
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 957365df51a..085ad1a4b3b 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -661,8 +661,8 @@ void lex_start(THD *thd)
lex->thd= lex->unit.thd= thd;
+ lex->stmt_lex= lex; // default, should be rewritten for VIEWs And CTEs
DBUG_ASSERT(!lex->explain);
- stmt_lex= this;
lex->context_stack.empty();
lex->unit.init_query();
diff --git a/sql/sql_trigger.cc b/sql/sql_trigger.cc
index 2f291cc3756..293a4c17156 100644
--- a/sql/sql_trigger.cc
+++ b/sql/sql_trigger.cc
@@ -1579,7 +1579,6 @@ bool Table_triggers_list::check_n_load(THD *thd, const char *db,
}
thd->reset_db(save_db.str, save_db.length);
thd->lex= old_lex;
- thd->stmt_lex= old_stmt_lex;
thd->spcont= save_spcont;
thd->variables.sql_mode= save_sql_mode;
1
0
revision-id: 133ce4300ef056e6677edfe2be151dbe9f2c8b4a (mariadb-10.1.32-76-g133ce4300ef)
parent(s): 74abc32d308cd4f9a23c4f897a76ea75c85a18c9
author: Oleksandr Byelkin
committer: Oleksandr Byelkin
timestamp: 2018-05-07 17:42:55 +0200
message:
cc
---
mysql-test/t/sp.test | 15 +++++++++++++++
sql/sp_head.cc | 8 +++-----
sql/sql_class.h | 15 ---------------
sql/sql_lex.cc | 5 +++--
sql/sql_lex.h | 17 ++++++++++++++++-
sql/sql_parse.cc | 7 ++++---
sql/sql_prepare.cc | 2 +-
sql/sql_trigger.cc | 5 ++---
sql/sql_view.cc | 4 +++-
9 files changed, 47 insertions(+), 31 deletions(-)
diff --git a/mysql-test/t/sp.test b/mysql-test/t/sp.test
index eeabb0486ca..6be6440c551 100644
--- a/mysql-test/t/sp.test
+++ b/mysql-test/t/sp.test
@@ -9754,4 +9754,19 @@ DROP TABLE t1, t2;
SET max_sp_recursion_depth=default;
+--echo #
+--echo # MDEV-15347: Valgrind or ASAN errors in mysql_make_view on query
+--echo # from information_schema
+--echo #
+
+CREATE VIEW v AS SELECT 1;
+CREATE FUNCTION f() RETURNS INT RETURN 1;
+--disable_result_log
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS
+UNION
+SELECT * FROM INFORMATION_SCHEMA.TABLES JOIN INFORMATION_SCHEMA.PARAMETERS;
+--enable_result_log
+DROP FUNCTION f;
+DROP TABLE v;
+
--echo #End of 10.1 tests
diff --git a/sql/sp_head.cc b/sql/sp_head.cc
index 8bf78d97670..536f6900866 100644
--- a/sql/sp_head.cc
+++ b/sql/sp_head.cc
@@ -840,7 +840,7 @@ sp_head::~sp_head()
thd->lex->sphead= NULL;
lex_end(thd->lex);
delete thd->lex;
- thd->lex= thd->stmt_lex= lex;
+ thd->lex= lex;
}
my_hash_free(&m_sptabs);
@@ -1121,7 +1121,7 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
backup_arena;
query_id_t old_query_id;
TABLE *old_derived_tables;
- LEX *old_lex, *old_stmt_lex;
+ LEX *old_lex;
Item_change_list old_change_list;
String old_packet;
uint old_server_status;
@@ -1224,7 +1224,6 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
do it in each instruction
*/
old_lex= thd->lex;
- old_stmt_lex= thd->stmt_lex;
/*
We should also save Item tree change list to avoid rollback something
too early in the calling query.
@@ -1372,7 +1371,6 @@ sp_head::execute(THD *thd, bool merge_da_on_success)
DBUG_ASSERT(thd->change_list.is_empty());
old_change_list.move_elements_to(&thd->change_list);
thd->lex= old_lex;
- thd->stmt_lex= old_stmt_lex;
thd->set_query_id(old_query_id);
DBUG_ASSERT(!thd->derived_tables);
thd->derived_tables= old_derived_tables;
@@ -2953,7 +2951,7 @@ sp_lex_keeper::reset_lex_and_exec_core(THD *thd, uint *nextp,
We should not save old value since it is saved/restored in
sp_head::execute() when we are entering/leaving routine.
*/
- thd->lex= thd->stmt_lex= m_lex;
+ thd->lex= m_lex;
thd->set_query_id(next_query_id());
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 341f2e571d8..ca6155ec93f 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1027,21 +1027,6 @@ class Statement: public ilink, public Query_arena
LEX_STRING name; /* name for named prepared statements */
LEX *lex; // parse tree descriptor
- /*
- LEX which represents current statement (conventional, SP or PS)
-
- For example during view parsing THD::lex will point to the views LEX and
- THD::stmt_lex will point to LEX of the statement where the view will be
- included
-
- Currently it is used to have always correct select numbering inside
- statement (LEX::current_select_number) without storing and restoring a
- global counter which was THD::select_number.
-
- TODO: make some unified statement representation (now SP has different)
- to store such data like LEX::current_select_number.
- */
- LEX *stmt_lex;
/*
Points to the query associated with this statement. It's const, but
we need to declare it char * because all table handlers are written
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
index 3fa5ec71aeb..957365df51a 100644
--- a/sql/sql_lex.cc
+++ b/sql/sql_lex.cc
@@ -657,11 +657,12 @@ void lex_start(THD *thd)
{
LEX *lex= thd->lex;
DBUG_ENTER("lex_start");
- DBUG_PRINT("info", ("Lex %p stmt_lex: %p", thd->lex, thd->stmt_lex));
+ DBUG_PRINT("info", ("Lex %p", thd->lex));
lex->thd= lex->unit.thd= thd;
-
+
DBUG_ASSERT(!lex->explain);
+ stmt_lex= this;
lex->context_stack.empty();
lex->unit.init_query();
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
index 4fcd090e1f5..3b47b1d25c9 100644
--- a/sql/sql_lex.h
+++ b/sql/sql_lex.h
@@ -730,7 +730,7 @@ class st_select_lex: public st_select_lex_node
/*
Point to the LEX in which it was created, used in view subquery detection.
- TODO: make also st_select_lex::parent_stmt_lex (see THD::stmt_lex)
+ TODO: make also st_select_lex::parent_stmt_lex (see LEX::stmt_lex)
and use st_select_lex::parent_lex & st_select_lex::parent_stmt_lex
instead of global (from THD) references where it is possible.
*/
@@ -2435,6 +2435,21 @@ struct LEX: public Query_tables_list
// type information
char *length,*dec;
CHARSET_INFO *charset;
+ /*
+ LEX which represents current statement (conventional, SP or PS)
+
+ For example during view parsing THD::lex will point to the view's LEX and
+ LEX::stmt_lex will point to LEX of the statement where the view will be
+ included
+
+ Currently it is used to have always correct select numbering inside
+ statement (LEX::current_select_number) without storing and restoring a
+ global counter which was THD::select_number.
+
+ TODO: make some unified statement representation (now SP has different)
+ to store such data like LEX::current_select_number.
+ */
+ LEX *stmt_lex;
LEX_STRING name;
char *help_arg;
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 5c03999c9ac..416fd1825eb 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -6923,8 +6923,9 @@ void THD::reset_for_next_command(bool do_clear_error)
We also assign thd->stmt_lex in lex_start(), but during bootstrap this
code is executed first.
*/
- thd->stmt_lex= &main_lex; thd->stmt_lex->current_select_number= 1;
- DBUG_PRINT("info", ("Lex %p stmt_lex: %p", thd->lex, thd->stmt_lex));
+ DBUG_ASSERT(lex == &main_lex);
+ main_lex.stmt_lex= &main_lex; main_lex.current_select_number= 1;
+ DBUG_PRINT("info", ("Lex and stmt_lex: %p", &main_lex));
/*
Those two lines below are theoretically unneeded as
THD::cleanup_after_query() should take care of this already.
@@ -7042,7 +7043,7 @@ mysql_new_select(LEX *lex, bool move_down)
if (!(select_lex= new (thd->mem_root) SELECT_LEX()))
DBUG_RETURN(1);
- select_lex->select_number= ++thd->stmt_lex->current_select_number;
+ select_lex->select_number= ++thd->lex->stmt_lex->current_select_number;
select_lex->parent_lex= lex; /* Used in init_query. */
select_lex->init_query();
select_lex->init_select();
diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc
index 6e14ddc2afb..d39ed6aa637 100644
--- a/sql/sql_prepare.cc
+++ b/sql/sql_prepare.cc
@@ -3635,7 +3635,7 @@ bool Prepared_statement::prepare(const char *packet, uint packet_len)
if (! (lex= new (mem_root) st_lex_local))
DBUG_RETURN(TRUE);
- stmt_lex= lex;
+ lex->stmt_lex= lex;
if (set_db(thd->db, thd->db_length))
DBUG_RETURN(TRUE);
diff --git a/sql/sql_trigger.cc b/sql/sql_trigger.cc
index e14d19c7369..2f291cc3756 100644
--- a/sql/sql_trigger.cc
+++ b/sql/sql_trigger.cc
@@ -1374,13 +1374,13 @@ bool Table_triggers_list::check_n_load(THD *thd, const char *db,
List_iterator_fast<LEX_STRING> it_client_cs_name(triggers->client_cs_names);
List_iterator_fast<LEX_STRING> it_connection_cl_name(triggers->connection_cl_names);
List_iterator_fast<LEX_STRING> it_db_cl_name(triggers->db_cl_names);
- LEX *old_lex= thd->lex, *old_stmt_lex= thd->stmt_lex;
+ LEX *old_lex= thd->lex;
LEX lex;
sp_rcontext *save_spcont= thd->spcont;
ulonglong save_sql_mode= thd->variables.sql_mode;
LEX_STRING *on_table_name;
- thd->lex= thd->stmt_lex= &lex;
+ thd->lex= &lex;
save_db.str= thd->db;
save_db.length= thd->db_length;
@@ -1592,7 +1592,6 @@ bool Table_triggers_list::check_n_load(THD *thd, const char *db,
// QQ: anything else ?
lex_end(&lex);
thd->lex= old_lex;
- thd->stmt_lex= old_stmt_lex;
thd->spcont= save_spcont;
thd->variables.sql_mode= save_sql_mode;
thd->reset_db(save_db.str, save_db.length);
diff --git a/sql/sql_view.cc b/sql/sql_view.cc
index 1bdc76a66ea..6d7a8e1cc9d 100644
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@ -1315,6 +1315,7 @@ bool mysql_make_view(THD *thd, TABLE_SHARE *share, TABLE_LIST *table,
now Lex placed in statement memory
*/
+
table->view= lex= thd->lex= (LEX*) new(thd->mem_root) st_lex_local;
if (!table->view)
{
@@ -1340,8 +1341,9 @@ bool mysql_make_view(THD *thd, TABLE_SHARE *share, TABLE_LIST *table,
goto end;
lex_start(thd);
+ lex->stmt_lex= old_lex;
view_select= &lex->select_lex;
- view_select->select_number= ++thd->stmt_lex->current_select_number;
+ view_select->select_number= ++thd->lex->stmt_lex->current_select_number;
ulonglong saved_mode= thd->variables.sql_mode;
/* switch off modes which can prevent normal parsing of VIEW
1
0
[Commits] 847e695: Issue #809: Wrong query result with bloom filters
by psergey@askmonty.org 07 May '18
by psergey@askmonty.org 07 May '18
07 May '18
revision-id: 847e6959f45a1334f3c60452f11dde38bb9cbe3f
parent(s): fc7bcca40f4ce83327745e9edf3b9bf84b1516af
committer: Sergei Petrunia
branch nick: mysql-5.6-rocksdb-look800
timestamp: 2018-05-07 18:13:42 +0300
message:
Issue #809: Wrong query result with bloom filters
In reverse-ordered column families, if one wants to start reading at the
logical end of the index, they should Seek() to a key value that is not
covered by the index. This may (and typically does) prevent use of a bloom
filter.
The calls to setup_scan_iterator() that are made for index and table scan
didn't take this into account and passed eq_cond_len=INDEX_NUMBER_SIZE.
Fixed them to compute and pass correct eq_cond_len.
Also, removed an incorrect assert in ha_rocksdb::setup_iterator_bounds.
---
mysql-test/suite/rocksdb/r/bloomfilter5.result | 62 ++++++++++++++++++++++
mysql-test/suite/rocksdb/t/bloomfilter5-master.opt | 3 ++
mysql-test/suite/rocksdb/t/bloomfilter5.test | 61 +++++++++++++++++++++
storage/rocksdb/ha_rocksdb.cc | 9 ++--
4 files changed, 130 insertions(+), 5 deletions(-)
diff --git a/mysql-test/suite/rocksdb/r/bloomfilter5.result b/mysql-test/suite/rocksdb/r/bloomfilter5.result
new file mode 100644
index 0000000..058d360
--- /dev/null
+++ b/mysql-test/suite/rocksdb/r/bloomfilter5.result
@@ -0,0 +1,62 @@
+#
+# Issue #809: Wrong query result with bloom filters
+#
+create table t1 (
+id1 bigint not null,
+id2 bigint not null,
+id3 varchar(100) not null,
+id4 int not null,
+id5 int not null,
+value bigint,
+value2 varchar(100),
+primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1'
+) engine=ROCKSDB;
+create table t2(a int);
+insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t3(seq int);
+insert into t3
+select
+1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000
+from t2 A, t2 B, t2 C, t2 D;
+insert t1
+select
+(seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc"
+from t3;
+set global rocksdb_force_flush_memtable_now=1;
+# Full table scan
+explain
+select * from t1 limit 10;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 NULL
+select * from t1 limit 10;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+1000 2000 2000 9999 9999 1000 aaabbbccc
+1000 2000 2000 9998 9998 1000 aaabbbccc
+1000 2000 2000 9997 9997 1000 aaabbbccc
+1000 2000 2000 9996 9996 1000 aaabbbccc
+1000 1999 1999 9995 9995 1000 aaabbbccc
+1000 1999 1999 9994 9994 1000 aaabbbccc
+1000 1999 1999 9993 9993 1000 aaabbbccc
+1000 1999 1999 9992 9992 1000 aaabbbccc
+1000 1999 1999 9991 9991 1000 aaabbbccc
+# An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t1 index NULL PRIMARY 122 NULL 1 NULL
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+id1 id2 id3 id4 id5 value value2
+1000 2000 2000 10000 10000 1000 aaabbbccc
+create table t4 (
+pk int unsigned not null primary key,
+kp1 int unsigned not null,
+kp2 int unsigned not null,
+col1 int unsigned,
+key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+# This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+pk kp1 kp2 col1
+drop table t1,t2,t3,t4;
diff --git a/mysql-test/suite/rocksdb/t/bloomfilter5-master.opt b/mysql-test/suite/rocksdb/t/bloomfilter5-master.opt
new file mode 100644
index 0000000..efcd69b
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/bloomfilter5-master.opt
@@ -0,0 +1,3 @@
+--rocksdb_default_cf_options=write_buffer_size=256k;block_based_table_factory={filter_policy=bloomfilter:10:false;whole_key_filtering=0;}
+--rocksdb_override_cf_options=rev:bf5_1={prefix_extractor=capped:4};
+
diff --git a/mysql-test/suite/rocksdb/t/bloomfilter5.test b/mysql-test/suite/rocksdb/t/bloomfilter5.test
new file mode 100644
index 0000000..00968ae
--- /dev/null
+++ b/mysql-test/suite/rocksdb/t/bloomfilter5.test
@@ -0,0 +1,61 @@
+
+--echo #
+--echo # Issue #809: Wrong query result with bloom filters
+--echo #
+
+create table t1 (
+ id1 bigint not null,
+ id2 bigint not null,
+ id3 varchar(100) not null,
+ id4 int not null,
+ id5 int not null,
+ value bigint,
+ value2 varchar(100),
+ primary key (id1, id2, id3, id4) COMMENT 'rev:bf5_1'
+) engine=ROCKSDB;
+
+
+create table t2(a int);
+insert into t2 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table t3(seq int);
+insert into t3
+select
+ 1+ A.a + B.a* 10 + C.a * 100 + D.a * 1000
+from t2 A, t2 B, t2 C, t2 D;
+
+insert t1
+select
+ (seq+9) div 10, (seq+4) div 5, (seq+4) div 5, seq, seq, 1000, "aaabbbccc"
+from t3;
+
+set global rocksdb_force_flush_memtable_now=1;
+
+--echo # Full table scan
+explain
+select * from t1 limit 10;
+select * from t1 limit 10;
+
+--echo # An index scan starting from the end of the table:
+explain
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+select * from t1 order by id1 desc,id2 desc, id3 desc, id4 desc limit 1;
+
+# A testcase for an assertion that the fix is removing
+# The only requirement for the used column family is that it is reverse-ordered
+create table t4 (
+ pk int unsigned not null primary key,
+ kp1 int unsigned not null,
+ kp2 int unsigned not null,
+ col1 int unsigned,
+ key(kp1, kp2) comment 'rev:bf5_2'
+) engine=rocksdb;
+
+insert into t4 values (1, 0xFFFF, 0xFFF, 12345);
+
+--echo # This must not fail an assert:
+select * from t4 force index(kp1) where kp1=0xFFFFFFFF and kp2<=0xFFFFFFFF order by kp2 desc;
+
+drop table t1,t2,t3,t4;
+
+
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 455b55d..3eedf4c 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -8582,7 +8582,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) {
DBUG_ASSERT(key != nullptr);
const Rdb_key_def &kd = *m_key_descr_arr[active_index];
- kd.get_first_key(key, &key_size);
+ int key_start_matching_bytes = kd.get_first_key(key, &key_size);
rocksdb::Slice index_key((const char *)key, key_size);
@@ -8593,7 +8593,7 @@ int ha_rocksdb::index_first_intern(uchar *const buf) {
// Loop as long as we get a deadlock error AND we end up creating the
// snapshot here (i.e. it did not exist prior to this)
for (;;) {
- setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE);
+ setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
m_scan_it->Seek(index_key);
m_skip_scan_it_next_call = true;
@@ -9600,7 +9600,6 @@ int ha_rocksdb::update_write_row(const uchar *const old_data,
void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd,
const rocksdb::Slice &eq_cond) {
uint eq_cond_len = eq_cond.size();
- DBUG_ASSERT(eq_cond_len >= Rdb_key_def::INDEX_NUMBER_SIZE);
memcpy(m_eq_cond_upper_bound, eq_cond.data(), eq_cond_len);
kd.successor(m_eq_cond_upper_bound, eq_cond_len);
memcpy(m_eq_cond_lower_bound, eq_cond.data(), eq_cond_len);
@@ -9696,12 +9695,12 @@ void ha_rocksdb::release_scan_iterator() {
void ha_rocksdb::setup_iterator_for_rnd_scan() {
uint key_size;
- m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
+ int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
setup_scan_iterator(*m_pk_descr, &table_key, false,
- Rdb_key_def::INDEX_NUMBER_SIZE);
+ key_start_matching_bytes);
m_scan_it->Seek(table_key);
m_skip_scan_it_next_call = true;
}
1
0
revision-id: e1ffb66449efa44e99d51c2a01c51d1623342d62 (mariadb-galera-10.0.34-7-ge1ffb66449e)
parent(s): 648cf7176cc95f697abd8b94e860c74768680298 42fac3241368ad72f8cfef2b8521269e6c173558
author: Jan Lindström
committer: Jan Lindström
timestamp: 2018-05-07 17:20:39 +0300
message:
Merge tag 'mariadb-10.0.35' into 10.0-galera
.gitattributes | 1 +
CREDITS | 3 +-
client/mysqltest.cc | 2 +-
debian/control | 4 +-
mysql-test/r/contributors.result | 3 +-
mysql-test/r/dyncol.result | 10 +
mysql-test/r/func_concat.result | 6 +
mysql-test/r/func_misc.result | 14 +-
mysql-test/r/func_str.result | 37 +
mysql-test/r/func_time.result | 24 +
mysql-test/r/having.result | 14 +
mysql-test/r/mysqld--help.result | 2 +
mysql-test/r/partition.result | 11 +
mysql-test/r/perror.result | 2 +-
mysql-test/r/ps.result | 2 +-
mysql-test/r/sp-destruct.result | 6 +
mysql-test/r/sp-innodb.result | 34 +
mysql-test/r/statistics.result | 18 +
mysql-test/r/subselect_sj.result | 26 +
mysql-test/r/subselect_sj_jcl6.result | 26 +
mysql-test/r/type_time_6065.result | 44 ++
mysql-test/r/update_innodb.result | 10 +
mysql-test/r/xml.result | 12 +
mysql-test/suite/galera/disabled.def | 2 +
.../suite/innodb/r/innodb-alter-nullable.result | 4 +
.../suite/innodb/r/innodb_bug11754376.result | 1 -
.../suite/innodb/r/innodb_bug27216817.result | 24 +
mysql-test/suite/innodb/r/innodb_bug56947.result | 1 -
.../suite/innodb/r/innodb_corrupt_bit.result | 45 ++
mysql-test/suite/innodb/r/mvcc.result | 26 +
.../innodb/r/read_only_recover_committed.result | 43 +
mysql-test/suite/innodb/r/recovery_shutdown.result | 2 +-
.../suite/innodb/t/innodb-alter-nullable.test | 5 +
mysql-test/suite/innodb/t/innodb_bug11754376.test | 2 -
mysql-test/suite/innodb/t/innodb_bug14147491.test | 3 -
mysql-test/suite/innodb/t/innodb_bug56947.test | 1 -
mysql-test/suite/innodb/t/innodb_corrupt_bit.test | 8 +-
mysql-test/suite/innodb/t/log_file_size.test | 3 +-
mysql-test/suite/innodb/t/mvcc.test | 52 ++
.../innodb/t/read_only_recover_committed.test | 68 ++
mysql-test/suite/innodb/t/recovery_shutdown.test | 6 +-
.../r/innobase_drop_fts_index_table.result | 1 -
.../innodb_fts/r/innodb_fts_misc_debug.result | 3 +-
.../r/innodb_fts_result_cache_limit.result | 1 -
.../t/innobase_drop_fts_index_table.test | 2 -
.../suite/innodb_fts/t/innodb_fts_misc_debug.test | 3 +-
.../t/innodb_fts_result_cache_limit.test | 2 -
mysql-test/suite/parts/inc/part_alter_values.inc | 10 +
.../suite/parts/r/partition_alter_innodb.result | 6 +
.../suite/parts/r/partition_alter_maria.result | 6 +
.../suite/parts/r/partition_alter_myisam.result | 6 +
.../r/max_prepared_stmt_count_basic.result | 20 +-
.../suite/sys_vars/r/table_open_cache_basic.result | 4 +-
.../sys_vars/t/max_prepared_stmt_count_basic.test | 4 +-
mysql-test/t/dyncol.test | 9 +
mysql-test/t/func_concat.test | 6 +
mysql-test/t/func_misc.test | 19 +-
mysql-test/t/func_str.test | 36 +
mysql-test/t/func_time.test | 16 +
mysql-test/t/having.test | 18 +
mysql-test/t/partition.test | 13 +
mysql-test/t/sp-destruct.test | 10 +
mysql-test/t/sp-innodb.test | 42 +
mysql-test/t/statistics.test | 20 +
mysql-test/t/subselect_sj.test | 30 +
mysql-test/t/type_time_6065.test | 23 +
mysql-test/t/update_innodb.test | 13 +
mysql-test/t/xml.test | 9 +
mysql-test/unstable-tests | 137 ++--
mysys/ma_dyncol.c | 2 +-
mysys/my_default.c | 10 +-
pcre/AUTHORS | 6 +-
pcre/ChangeLog | 53 ++
pcre/INSTALL | 320 ++++----
pcre/LICENCE | 6 +-
pcre/NEWS | 6 +
pcre/NON-AUTOTOOLS-BUILD | 15 +-
pcre/configure.ac | 26 +-
pcre/doc/html/NON-AUTOTOOLS-BUILD.txt | 15 +-
pcre/pcre.h.in | 8 +-
pcre/pcre_compile.c | 2 +-
pcre/pcre_dfa_exec.c | 4 +-
pcre/pcre_exec.c | 8 +-
pcre/pcre_jit_compile.c | 407 +++++++---
pcre/pcregrep.c | 55 +-
pcre/pcreposix.c | 6 +-
pcre/testdata/testinput2 | 8 +
pcre/testdata/testinput5 | 6 +
pcre/testdata/testoutput2 | 16 +
pcre/testdata/testoutput5 | 8 +
sql/contributors.h | 3 +-
sql/field.cc | 7 +
sql/handler.cc | 19 +-
sql/handler.h | 2 +-
sql/item.cc | 7 +
sql/item.h | 1 +
sql/item_cmpfunc.cc | 42 +-
sql/item_func.cc | 1 +
sql/item_func.h | 7 +
sql/item_row.cc | 1 +
sql/item_strfunc.cc | 184 ++---
sql/item_strfunc.h | 20 +-
sql/item_subselect.cc | 5 +-
sql/item_sum.cc | 3 +
sql/item_xmlfunc.cc | 10 +-
sql/item_xmlfunc.h | 1 +
sql/log_event.cc | 34 +
sql/log_event_old.cc | 7 +
sql/mysqld.cc | 4 +-
sql/mysqld.h | 2 +-
sql/net_serv.cc | 4 +-
sql/opt_subselect.cc | 24 +-
sql/share/errmsg-utf8.txt | 4 +-
sql/sql_admin.cc | 2 +-
sql/sql_base.cc | 23 +-
sql/sql_class.h | 2 +-
sql/sql_parse.cc | 4 +
sql/sql_partition.cc | 11 +-
sql/sql_partition_admin.cc | 2 +-
sql/sql_plugin.cc | 2 +-
sql/sql_select.cc | 10 +-
sql/sql_statistics.cc | 41 +-
sql/sql_statistics.h | 1 +
sql/sql_table.cc | 13 +-
sql/sql_time.cc | 2 +
sql/sql_trigger.cc | 2 +-
sql/sql_truncate.cc | 3 +-
sql/sys_vars.cc | 6 +-
storage/connect/CMakeLists.txt | 9 +-
storage/connect/Client.java | 27 +-
storage/connect/JavaWrappers.jar | Bin 44053 -> 19192 bytes
storage/connect/JdbcInterface.java | 54 +-
storage/connect/PostgresqlInterface.java | 5 +-
storage/connect/array.cpp | 22 +-
storage/connect/blkfil.cpp | 8 +-
storage/connect/block.h | 4 +-
storage/connect/checklvl.h | 3 +-
storage/connect/cmgoconn.cpp | 18 +-
storage/connect/colblk.cpp | 9 +-
storage/connect/connect.cc | 43 +-
storage/connect/csort.cpp | 16 +-
storage/connect/domdoc.cpp | 3 +-
storage/connect/filamap.cpp | 32 +-
storage/connect/filamdbf.cpp | 12 +-
storage/connect/filamfix.cpp | 68 +-
storage/connect/filamgz.cpp | 20 +-
storage/connect/filamtxt.cpp | 71 +-
storage/connect/filamvct.cpp | 162 ++--
storage/connect/filamzip.cpp | 8 +-
storage/connect/filter.cpp | 36 +-
storage/connect/fmdlex.c | 24 +-
storage/connect/global.h | 11 +-
storage/connect/ha_connect.cc | 332 ++++----
storage/connect/inihandl.cpp | 54 +-
storage/connect/javaconn.cpp | 6 +-
storage/connect/jdbconn.cpp | 365 ++++++---
storage/connect/jdbconn.h | 11 +-
storage/connect/jmgfam.cpp | 2 +-
storage/connect/jmgoconn.cpp | 14 +-
storage/connect/json.cpp | 41 +-
storage/connect/json.h | 8 +-
storage/connect/jsonudf.cpp | 862 ++++++++++++++-------
storage/connect/jsonudf.h | 4 +
storage/connect/libdoc.cpp | 98 +--
storage/connect/macutil.cpp | 4 +-
storage/connect/mongo.cpp | 4 +-
storage/connect/mycat.cc | 34 +-
storage/connect/myconn.cpp | 10 +-
.../mysql-test/connect/r/jdbc_postgresql.result | 35 +-
.../mysql-test/connect/r/json_java_2.result | 7 +-
.../mysql-test/connect/r/json_java_3.result | 4 +-
.../mysql-test/connect/r/json_mongo_c.result | 2 +
.../connect/mysql-test/connect/r/json_udf.result | 10 +-
.../mysql-test/connect/r/json_udf_bin.result | 3 +-
.../connect/mysql-test/connect/r/mongo_c.result | 2 +
.../mysql-test/connect/r/mongo_java_2.result | 4 +-
.../mysql-test/connect/r/mongo_java_3.result | 4 +-
.../connect/mysql-test/connect/r/tbl_thread.result | 6 +-
storage/connect/mysql-test/connect/r/vcol.result | 29 +
.../mysql-test/connect/std_data/JavaWrappers.jar | Bin 0 -> 19192 bytes
.../connect/mysql-test/connect/std_data/Mongo2.jar | Bin 3461358 -> 623907 bytes
.../connect/mysql-test/connect/std_data/Mongo3.jar | Bin 1705776 -> 1705776 bytes
.../mysql-test/connect/t/jdbc_postgresql.test | 33 +-
storage/connect/mysql-test/connect/t/jdbconn.inc | 7 +-
storage/connect/mysql-test/connect/t/json_udf.test | 6 +-
storage/connect/mysql-test/connect/t/mongo.inc | 4 +-
.../connect/mysql-test/connect/t/mongo_test.inc | 8 +-
.../connect/mysql-test/connect/t/tbl_thread.test | 6 +-
storage/connect/mysql-test/connect/t/vcol.test | 31 +
storage/connect/odbconn.cpp | 103 +--
storage/connect/plgdbsem.h | 3 +-
storage/connect/plgdbutl.cpp | 143 ++--
storage/connect/plugutil.cpp | 30 +-
storage/connect/preparse.h | 2 +-
storage/connect/rcmsg.c | 7 +-
storage/connect/rcmsg.h | 2 +-
storage/connect/reldef.cpp | 16 +-
storage/connect/tabcol.cpp | 8 +-
storage/connect/tabdos.cpp | 59 +-
storage/connect/tabdos.h | 1 +
storage/connect/tabext.cpp | 6 +-
storage/connect/tabfix.cpp | 8 +-
storage/connect/tabfmt.cpp | 36 +-
storage/connect/tabjdbc.cpp | 35 +-
storage/connect/tabjdbc.h | 2 +
storage/connect/tabjson.cpp | 597 +++++++-------
storage/connect/tabjson.h | 44 +-
storage/connect/table.cpp | 16 +-
storage/connect/tabmac.cpp | 4 +-
storage/connect/tabmul.cpp | 26 +-
storage/connect/tabmysql.cpp | 28 +-
storage/connect/tabodbc.cpp | 20 +-
storage/connect/tabpivot.cpp | 2 +-
storage/connect/tabsys.cpp | 12 +-
storage/connect/tabtbl.cpp | 41 +-
storage/connect/tabutil.cpp | 8 +-
storage/connect/tabvct.cpp | 8 +-
storage/connect/tabwmi.cpp | 4 +-
storage/connect/tabxml.cpp | 26 +-
storage/connect/user_connect.cc | 2 +-
storage/connect/valblk.cpp | 2 +-
storage/connect/value.cpp | 52 +-
storage/connect/xindex.cpp | 88 +--
storage/connect/xobject.cpp | 5 +-
storage/innobase/CMakeLists.txt | 4 +-
storage/innobase/api/api0api.cc | 2 +-
storage/innobase/btr/btr0cur.cc | 34 +-
storage/innobase/btr/btr0pcur.cc | 2 +-
storage/innobase/dict/dict0load.cc | 5 +
storage/innobase/fts/fts0fts.cc | 29 +-
storage/innobase/fts/fts0que.cc | 3 +-
storage/innobase/handler/ha_innodb.cc | 2 +-
storage/innobase/handler/handler0alter.cc | 63 +-
storage/innobase/include/btr0cur.h | 12 +-
storage/innobase/include/data0type.ic | 2 +
storage/innobase/include/mtr0mtr.h | 16 +-
storage/innobase/include/mtr0mtr.ic | 6 +-
storage/innobase/include/row0upd.h | 4 +-
storage/innobase/include/row0upd.ic | 4 +-
storage/innobase/include/trx0undo.h | 13 +-
storage/innobase/include/univ.i | 2 +-
storage/innobase/lock/lock0lock.cc | 5 +-
storage/innobase/mem/mem0mem.cc | 5 +
storage/innobase/page/page0page.cc | 17 +-
storage/innobase/row/row0ext.cc | 3 +-
storage/innobase/row/row0ftsort.cc | 3 +-
storage/innobase/row/row0import.cc | 9 +-
storage/innobase/row/row0ins.cc | 24 +-
storage/innobase/row/row0log.cc | 16 +-
storage/innobase/row/row0merge.cc | 4 +-
storage/innobase/row/row0mysql.cc | 16 +-
storage/innobase/row/row0sel.cc | 19 +-
storage/innobase/row/row0umod.cc | 14 +-
storage/innobase/row/row0upd.cc | 12 +-
storage/innobase/trx/trx0rec.cc | 8 +-
storage/innobase/trx/trx0trx.cc | 4 +-
storage/innobase/trx/trx0undo.cc | 16 +-
storage/oqgraph/graphcore-config.h | 2 +
storage/oqgraph/graphcore-graph.cc | 1 -
storage/oqgraph/graphcore.cc | 1 -
storage/oqgraph/oqgraph_shim.h | 5 +-
storage/perfschema/ha_perfschema.cc | 2 +-
storage/tokudb/CMakeLists.txt | 2 +-
storage/tokudb/PerconaFT/.clang-format | 36 +
storage/tokudb/PerconaFT/CMakeLists.txt | 6 +
storage/tokudb/PerconaFT/README.md | 29 +-
storage/tokudb/PerconaFT/ft/ft-ops.cc | 91 ++-
storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc | 4 +-
storage/tokudb/PerconaFT/ft/tests/log-test4.cc | 2 +-
.../PerconaFT/portability/tests/test-max-data.cc | 2 +-
.../PerconaFT/portability/toku_instrumentation.h | 2 +
.../PerconaFT/portability/toku_portability.h | 2 +-
.../tokudb/PerconaFT/portability/toku_pthread.h | 6 +-
.../tokudb/PerconaFT/portability/toku_race_tools.h | 2 +-
storage/tokudb/PerconaFT/portability/toku_time.h | 5 +
.../PerconaFT/src/tests/checkpoint_stress.cc | 2 +-
.../tokudb/PerconaFT/src/tests/directory_lock.cc | 2 +-
.../PerconaFT/src/tests/loader-cleanup-test.cc | 18 +-
.../src/tests/recover-del-multiple-abort.cc | 6 +-
.../recover-del-multiple-srcdb-fdelete-all.cc | 6 +-
.../PerconaFT/src/tests/recover-del-multiple.cc | 6 +-
.../src/tests/recover-put-multiple-abort.cc | 6 +-
.../PerconaFT/src/tests/recovery_fileops_unit.cc | 4 +-
.../tokudb/PerconaFT/src/tests/test-prepare3.cc | 1 +
storage/tokudb/hatoku_hton.cc | 20 +-
storage/xtradb/CMakeLists.txt | 10 +-
storage/xtradb/api/api0api.cc | 2 +-
storage/xtradb/btr/btr0cur.cc | 41 +-
storage/xtradb/btr/btr0pcur.cc | 2 +-
storage/xtradb/dict/dict0dict.cc | 47 +-
storage/xtradb/dict/dict0load.cc | 6 +-
storage/xtradb/dict/dict0mem.cc | 4 +-
storage/xtradb/fts/fts0fts.cc | 43 +-
storage/xtradb/fts/fts0que.cc | 20 +-
storage/xtradb/handler/ha_innodb.cc | 10 +-
storage/xtradb/handler/handler0alter.cc | 12 +-
storage/xtradb/include/btr0cur.h | 12 +-
storage/xtradb/include/data0type.ic | 2 +
storage/xtradb/include/mtr0mtr.h | 16 +-
storage/xtradb/include/mtr0mtr.ic | 6 +-
storage/xtradb/include/row0upd.h | 4 +-
storage/xtradb/include/row0upd.ic | 4 +-
storage/xtradb/include/trx0undo.h | 13 +-
storage/xtradb/include/univ.i | 4 +-
storage/xtradb/include/ut0ut.h | 11 +-
storage/xtradb/lock/lock0lock.cc | 5 +-
storage/xtradb/mem/mem0mem.cc | 5 +
storage/xtradb/page/page0page.cc | 17 +-
storage/xtradb/row/row0ext.cc | 3 +-
storage/xtradb/row/row0ftsort.cc | 3 +-
storage/xtradb/row/row0import.cc | 2 +-
storage/xtradb/row/row0ins.cc | 24 +-
storage/xtradb/row/row0log.cc | 4 +-
storage/xtradb/row/row0merge.cc | 4 +-
storage/xtradb/row/row0mysql.cc | 8 +-
storage/xtradb/row/row0sel.cc | 17 +-
storage/xtradb/row/row0umod.cc | 14 +-
storage/xtradb/row/row0upd.cc | 12 +-
storage/xtradb/trx/trx0purge.cc | 2 +-
storage/xtradb/trx/trx0rec.cc | 8 +-
storage/xtradb/trx/trx0trx.cc | 4 +-
storage/xtradb/trx/trx0undo.cc | 16 +-
storage/xtradb/ut/ut0ut.cc | 30 +-
323 files changed, 5001 insertions(+), 2835 deletions(-)
diff --cc debian/control
index f24aabaaf94,862cd4f3cea..bb52fe0ec4d
--- a/debian/control
+++ b/debian/control
@@@ -24,31 -24,257 +24,31 @@@ Build-Depends: bison
${LIBREADLINE_DEV}
Standards-Version: 3.8.3
Homepage: http://mariadb.org/
- Vcs-Browser: http://bazaar.launchpad.net/~maria-captains/maria/10.0/files
- Vcs-Bzr: bzr://lp:maria
+ Vcs-Browser: https://github.com/MariaDB/server/
+ Vcs-Git: https://github.com/MariaDB/server.git
-Package: libmariadbclient18
-Section: libs
-Architecture: any
-Depends: libmysqlclient18 (= ${source:Version}),
- mariadb-common (>= ${source:Version}),
- ${misc:Depends},
- ${shlibs:Depends}
-Conflicts: mariadb-galera-server-10.0 (<< 10.0.5),
- mariadb-galera-server-5.5 (<< 5.5.33),
- mariadb-server-10.0 (<< 10.0.5),
- mariadb-server-5.1,
- mariadb-server-5.2,
- mariadb-server-5.3,
- mariadb-server-5.5 (<< 5.5.33)
-Description: MariaDB database client library
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes the client library.
-
-Package: libmysqlclient18
-Section: libs
-Architecture: any
-Depends: libmariadbclient18 (= ${source:Version})
-Replaces: libmysqlclient18 (<< ${source:Version})
-Description: Virtual package to satisfy external depends
- This is an empty package that provides an updated "best" version of
- libmysqlclient18 that does not conflict with the libmariadbclient18
- package.
- .
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
-
-Package: libmariadbd-dev
-Architecture: any
-Section: libdevel
-Depends: libmariadbclient-dev (>= ${source:Version}), ${misc:Depends}
-Provides: libmysqld-dev
-Conflicts: libmysqld-dev
-Replaces: libmysqld-dev
-Description: MariaDB embedded database development files
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes the embedded server library and header files.
-
-Package: libmariadbclient-dev
-Architecture: any
-Section: libdevel
-Depends: libmariadbclient18 (>= ${source:Version}),
- zlib1g-dev,
- ${misc:Depends},
- ${shlibs:Depends}
-Replaces: libmariadbclient16-dev, libmysqlclient16-dev
-Conflicts: libmariadbclient16-dev,
- libmysqlclient-dev,
- libmysqlclient10-dev,
- libmysqlclient12-dev,
- libmysqlclient14-dev,
- libmysqlclient15-dev,
- libmysqlclient16-dev
-Provides: libmysqlclient-dev
-Description: MariaDB database development files
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes development libraries and header files.
-
-Package: mysql-common
-Section: database
-Architecture: all
-Depends: ${misc:Depends}, ${shlibs:Depends}
-Description: MariaDB database common files (e.g. /etc/mysql/my.cnf)
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes files needed by all versions of the client library
- (e.g. /etc/mysql/my.cnf).
-
-Package: mariadb-common
-Section: database
-Architecture: all
-Depends: mysql-common (>= ${source:Version}), ${misc:Depends}, ${shlibs:Depends}
-Description: MariaDB database common files (e.g. /etc/mysql/conf.d/mariadb.cnf)
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes files needed by all versions of the client library
- (e.g. /etc/mysql/conf.d/mariadb.cnf).
-
-Package: mariadb-client-core-10.0
-Architecture: any
-Depends: libmariadbclient18 (>= ${source:Version}),
- mariadb-common (>= ${source:Version}),
- ${misc:Depends},
- ${shlibs:Depends}
-Provides: mysql-client-core,
- mysql-client-core-5.1,
- mysql-client-core-5.5,
- mysql-client-core-5.6
-Conflicts: mariadb-client-5.1,
- mariadb-client-5.2,
- mariadb-client-5.3,
- mariadb-client-5.5,
- mariadb-client-core-5.1,
- mariadb-client-core-5.2,
- mariadb-client-core-5.3,
- mariadb-client-core-5.5,
- mysql-client (<< 5.0.51),
- mysql-client-5.0,
- mysql-client-5.1 (<< ${source:Version}),
- mysql-client-5.5 (<< ${source:Version}),
- mysql-client-core-5.1,
- mysql-client-core-5.5,
- mysql-client-core-5.6
-Replaces: mariadb-client-5.1,
- mariadb-client-5.2,
- mariadb-client-5.3,
- mariadb-client-5.5,
- mariadb-client-core-5.1,
- mariadb-client-core-5.2,
- mariadb-client-core-5.3,
- mariadb-client-core-5.5,
- mysql-client (<< 5.0.51),
- mysql-client-5.0,
- mysql-client-5.1,
- mysql-client-5.5,
- mysql-client-core-5.1,
- mysql-client-core-5.5,
- mysql-client-core-5.6
-Description: MariaDB database core client binaries
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes the core client files, as used by Akonadi.
-
-Package: mariadb-client-10.0
-Architecture: any
-Depends: debianutils (>=1.6),
- libdbd-mysql-perl (>= 1.2202),
- libdbi-perl,
- libmariadbclient18 (>= ${source:Version}),
- mariadb-client-core-10.0 (>= ${source:Version}),
- mariadb-common (>= ${source:Version}),
- ${misc:Depends},
- ${perl:Depends},
- ${shlibs:Depends}
-Suggests: libterm-readkey-perl
-Provides: mysql-client,
- mysql-client-4.1,
- mysql-client-5.1,
- mysql-client-5.5,
- mysql-client-5.6,
- virtual-mysql-client
-Conflicts: mariadb-client (<< ${source:Version}),
- mariadb-client-5.1,
- mariadb-client-5.2,
- mariadb-client-5.3,
- mariadb-client-5.5,
- mysql-client (<< 5.0.51),
- mysql-client-5.0,
- mysql-client-5.1,
- mysql-client-5.5,
- mysql-client-5.6
-Replaces: mariadb-client (<< ${source:Version}),
- mariadb-client-5.1,
- mariadb-client-5.2,
- mariadb-client-5.3,
- mariadb-client-5.5,
- mysql-client (<< 5.0.51),
- mysql-client-5.0,
- mysql-client-5.1,
- mysql-client-5.5,
- mysql-client-5.6
-Description: MariaDB database client binaries
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes the client binaries and the additional tools
- innotop and mysqlreport.
-
-Package: mariadb-server-core-10.0
-Architecture: any
-Depends: libmariadbclient18 (>= ${binary:Version}),
- ${misc:Depends},
- ${shlibs:Depends}
-Provides: mysql-server-core,
- mysql-server-core-5.1,
- mysql-server-core-5.5,
- mysql-server-core-5.6
-Conflicts: mariadb-server-core-5.1,
- mariadb-server-core-5.2,
- mariadb-server-core-5.3,
- mariadb-server-core-5.5,
- mysql-server-5.0,
- mysql-server-core-5.0,
- mysql-server-core-5.1,
- mysql-server-core-5.5,
- mysql-server-core-5.6
-Replaces: mariadb-server-core-5.1,
- mariadb-server-core-5.2,
- mariadb-server-core-5.3,
- mariadb-server-core-5.5,
- mysql-server-5.0,
- mysql-server-core-5.0,
- mysql-server-core-5.1,
- mysql-server-core-5.5,
- mysql-server-core-5.6
-Description: MariaDB database core server files
- MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
- server. SQL (Structured Query Language) is the most popular database query
- language in the world. The main goals of MariaDB are speed, robustness and
- ease of use.
- .
- This package includes the core server files, as used by Akonadi.
-
-Package: mariadb-test-10.0
+Package: mariadb-galera-test-10.0
Section: database
Architecture: any
-Depends: mariadb-client-10.0 (= ${source:Version}),
- mariadb-server-10.0 (= ${source:Version})
+Depends: mariadb-galera-server-10.0 (= ${source:Version}),
+ mariadb-client-10.0 (>= ${source:Version})
Suggests: patch
-Conflicts: mariadb-galera-server-5.5 (<< 5.5.33),
- mariadb-server-5.5 (<< 5.5.33),
- mariadb-test (<< ${source:Version}),
+Conflicts: mariadb-test,
+ mariadb-galera-test (<< ${source:Version}),
mariadb-test-5.1,
mariadb-test-5.2,
- mariadb-test-5.3
+ mariadb-test-5.3,
+ mariadb-test-5.5,
+ mariadb-test-10.0,
+ mariadb-server-5.5,
+ mariadb-galera-server-5.5,
+ mariadb-server-10.0
Replaces: mariadb-test (<< ${source:Version}),
+ mariadb-galera-test (<< ${source:Version}),
mariadb-test-5.1,
mariadb-test-5.2,
- mariadb-test-5.3
+ mariadb-test-5.3,
+ mariadb-test-5.5
Description: MariaDB database regression test suite
MariaDB is a fast, stable and true multi-user, multi-threaded SQL database
server. SQL (Structured Query Language) is the most popular database query
diff --cc mysql-test/suite/galera/disabled.def
index 78c3565b99d,00000000000..fad1e69d4d7
mode 100644,000000..100644
--- a/mysql-test/suite/galera/disabled.def
+++ b/mysql-test/suite/galera/disabled.def
@@@ -1,51 -1,0 +1,53 @@@
+##############################################################################
+#
+# List the test cases that are to be disabled temporarily.
+#
+# Separate the test case name and the comment with ':'.
+#
+# <testcasename> : MDEV-<xxxx> <comment>
+#
+# Do not use any TAB characters for whitespace.
+#
+##############################################################################
+
+MW-336 : MDEV-13549 Galera test failures
+galera_gra_log : MDEV-13549 Galera test failures
+galera_flush_local : MDEV-13549 Galera test failures
+galera_flush : MDEV-13549 Galera test failures
+MW-329 : MDEV-13549 Galera test failures
+galera_account_management : MariaDB 10.0 does not support ALTER USER
+galera_binlog_row_image : MariaDB 10.0 does not support binlog_row_image
+galera_binlog_rows_query_log_events: MariaDB does not support binlog_rows_query_log_events
+GAL-419 : MDEV-13549 Galera test failures
+galera_toi_ddl_fk_insert : MDEV-13549 Galera test failures
+galera_var_notify_cmd : MDEV-13549 Galera test failures
+galera_var_slave_threads : MDEV-13549 Galera test failures
+mysql-wsrep#90 : MDEV-13549 Galera test failures
+galera_as_master_gtid : Requires MySQL GTID
+galera_as_master_gtid_change_master : Requires MySQL GTID
+galera_as_slave_replication_bundle : MDEV-13549 Galera test failures
+galera_as_slave_preordered : wsrep-preordered feature not merged to MariaDB
+galera_gcs_fragment : MDEV-13549 Galera test failures
+galera_gcache_recover : MDEV-13549 Galera test failures
+galera_gcache_recover_full_gcache : MDEV-13549 Galera test failures
+galera_gcache_recover_manytrx : MDEV-13549 Galera test failures
+galera_ist_mysqldump : MDEV-13549 Galera test failures
+mysql-wsrep#31 : MDEV-13549 Galera test failures
+galera_migrate : MariaDB 10.0 does not support START SLAVE USER
+galera_concurrent_ctas : MDEV-13549 Galera test failures
+galera_bf_abort_for_update : MDEV-13549 Galera test failures
+galera_wsrep_desync_wsrep_on : MDEV-13549 Galera test failures
+galera_ssl_upgrade : MDEV-13549 Galera test failures
+mysql-wsrep#33 : MDEV-13549 Galera test failures
+galera_var_auto_inc_control_on : MDEV-13549 Galera test failures
+MW-44 : MDEV-13549 Galera test failures
+galera_var_retry_autocommit : MDEV-13549 Galera test failures
+pxc-421 : MDEV-13549 Galera test failures
+lp1376747-2 : MDEV-13549 Galera test failures
+lp1376747 : MDEV-13549 Galera test failures
+galera_toi_ddl_nonconflicting : MDEV-13549 Galera test failures
+galera_parallel_simple : MDEV-13549 Galera test failures
+galera_admin : MDEV-13549 Galera test failures
+MW-416 : MDEV-13549 Galera test failures
++galera_wan : MDEV-13549 Galera test failures
++MW-388 : MDEV-13549 Galera test failures
diff --cc sql/handler.cc
index fc8bb53f35d,f38569a1458..128043a8d80
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@@ -4464,8 -4379,6 +4454,7 @@@ handler::ha_create_partitioning_metadat
DBUG_ASSERT(m_lock_type == F_UNLCK ||
(!old_name && strcmp(name, table_share->path.str)));
- mark_trx_read_write();
+
return create_partitioning_metadata(name, old_name, action_flag);
}
diff --cc sql/sql_truncate.cc
index 259f2ad78e3,75a0678928b..7fc5192e42d
--- a/sql/sql_truncate.cc
+++ b/sql/sql_truncate.cc
@@@ -574,6 -565,6 +574,5 @@@ bool Sql_cmd_truncate_table::execute(TH
if (! (res= truncate_table(thd, first_table)))
my_ok(thd);
-
DBUG_RETURN(res);
}
-
1
0
[Commits] 648cf7176cc: Merge remote-tracking branch 'origin/5.5-galera' into 10.0-galera
by jan 07 May '18
by jan 07 May '18
07 May '18
revision-id: 648cf7176cc95f697abd8b94e860c74768680298 (mariadb-galera-10.0.34-6-g648cf7176cc)
parent(s): 7b115181987fb88b97ef6d3d88bb16bdbc281e40 1ecd68d867ced1d00ebffdcedbf6bc97493f5067
author: Jan Lindström
committer: Jan Lindström
timestamp: 2018-05-07 13:49:14 +0300
message:
Merge remote-tracking branch 'origin/5.5-galera' into 10.0-galera
.gitignore | 1 +
extra/yassl/src/handshake.cpp | 10 +
include/heap.h | 1 +
include/my_valgrind.h | 2 +
include/mysql_com.h | 2 +-
include/sql_common.h | 2 +-
mysql-test/mysql-test-run.pl | 2 +-
mysql-test/r/connect_debug.result | 5 +
mysql-test/r/ctype_ucs.result | 31 +++
mysql-test/r/ctype_utf8mb4.result | 23 ++
mysql-test/r/func_misc.result | 11 +
mysql-test/r/join_outer.result | 18 +-
mysql-test/r/join_outer_jcl6.result | 18 +-
mysql-test/r/mysqld--help.result | 4 +-
mysql-test/r/parser.result | 7 +
mysql-test/r/ps_qc_innodb.result | 23 ++
mysql-test/r/subselect4.result | 35 ++-
mysql-test/r/subselect_mat.result | 15 +
mysql-test/r/view.result | 305 ++++++++++++--------
mysql-test/suite/galera/disabled.def | 2 +
mysql-test/suite/galera/r/MW-416.result | 114 ++++++++
mysql-test/suite/galera/r/galera_defaults.result | 2 +-
.../suite/galera/r/galera_var_dirty_reads.result | 2 -
mysql-test/suite/galera/t/MW-416.test | 134 +++++++++
.../suite/galera/t/galera_concurrent_ctas.test | 8 +-
.../suite/galera/t/galera_var_dirty_reads.test | 9 +-
.../suite/innodb/r/innodb-replace-debug.result | 5 +-
.../suite/innodb/t/innodb-replace-debug.test | 5 +-
mysql-test/suite/maria/dynamic.result | 4 +
mysql-test/suite/maria/dynamic.test | 7 +
.../suite/parts/r/partition_alter_maria.result | 9 +
.../suite/parts/t/partition_alter_maria.test | 7 +
mysql-test/suite/plugins/t/server_audit.test | 2 +
mysql-test/suite/wsrep/r/variables.result | 7 +
mysql-test/suite/wsrep/t/variables.test | 14 +
mysql-test/t/connect_debug.test | 12 +
mysql-test/t/ctype_ucs.test | 22 ++
mysql-test/t/ctype_utf8mb4.test | 19 ++
mysql-test/t/func_misc.test | 12 +
mysql-test/t/join_outer.test | 18 +-
mysql-test/t/parser.test | 9 +
mysql-test/t/ps_qc_innodb.test | 35 +++
mysql-test/t/subselect4.test | 31 +++
mysql-test/t/subselect_mat.test | 13 +
mysql-test/t/view.test | 308 +++++++++++++--------
mysys/lf_hash.c | 9 +-
mysys/mf_iocache.c | 2 +-
mysys/my_addr_resolve.c | 2 +-
mysys/my_symlink.c | 2 +-
policy/selinux/mariadb-server.fc | 2 +-
policy/selinux/mariadb-server.te | 2 +-
scripts/CMakeLists.txt | 16 ++
scripts/wsrep_sst_xtrabackup-v2.sh | 2 +-
sql-common/client.c | 12 +-
sql/event_data_objects.cc | 21 +-
sql/event_db_repository.cc | 5 +-
sql/events.cc | 15 +
sql/handler.cc | 6 +
sql/item_cmpfunc.h | 5 +
sql/item_func.h | 4 +-
sql/item_strfunc.h | 2 +
sql/item_subselect.cc | 2 +-
sql/log.cc | 8 +-
sql/log_event.cc | 69 ++++-
sql/log_event_old.cc | 3 +-
sql/mysqld.cc | 5 +-
sql/mysqld.h | 3 +-
sql/opt_subselect.cc | 9 +-
sql/slave.cc | 2 +-
sql/sp.cc | 6 +-
sql/sql_acl.cc | 3 +-
sql/sql_admin.cc | 2 +-
sql/sql_base.h | 2 +
sql/sql_cache.cc | 1 +
sql/sql_class.cc | 7 +-
sql/sql_class.h | 1 +
sql/sql_insert.cc | 29 ++
sql/sql_parse.cc | 14 +-
sql/sql_partition.cc | 2 +-
sql/sql_plugin.cc | 33 ++-
sql/sql_prepare.cc | 6 +-
sql/sql_priv.h | 4 +-
sql/sql_table.cc | 2 +-
sql/sql_trigger.cc | 6 +
sql/sql_truncate.cc | 2 +-
sql/sql_update.cc | 2 +
sql/sql_view.cc | 5 +
sql/sql_yacc.yy | 5 +
sql/sys_vars.cc | 6 +-
sql/table.cc | 16 +-
sql/table.h | 8 +-
sql/wsrep_hton.cc | 38 ++-
sql/wsrep_mysqld.cc | 65 ++---
sql/wsrep_mysqld.h | 13 +
sql/wsrep_priv.h | 2 +-
sql/wsrep_sst.cc | 1 -
sql/wsrep_thd.cc | 2 +-
sql/wsrep_utils.cc | 1 -
storage/heap/_check.c | 2 +-
storage/heap/ha_heap.cc | 11 +-
storage/heap/hp_create.c | 8 +-
storage/heap/hp_delete.c | 2 +-
storage/heap/hp_rrnd.c | 4 +-
storage/heap/hp_rsame.c | 2 +-
storage/heap/hp_scan.c | 2 +-
storage/heap/hp_write.c | 4 +-
storage/innobase/handler/ha_innodb.cc | 73 ++---
storage/innobase/os/os0file.cc | 10 +-
storage/maria/ma_control_file.c | 2 +-
storage/maria/ma_dynrec.c | 10 +-
storage/maria/ma_loghandler.c | 6 +-
storage/maria/ma_open.c | 8 +-
storage/myisam/mi_open.c | 8 +-
storage/xtradb/handler/ha_innodb.cc | 20 +-
storage/xtradb/log/log0online.cc | 3 -
storage/xtradb/os/os0file.cc | 12 +-
support-files/mysql.server.sh | 2 -
support-files/wsrep.cnf.sh | 3 +
118 files changed, 1536 insertions(+), 473 deletions(-)
diff --cc mysql-test/r/ctype_ucs.result
index 6520694a804,1c9e31d3a06..59d88414cab
--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@@ -4590,1014 -4397,36 +4590,1045 @@@ Field Type Null Key Default Extr
c1 mediumtext YES NULL
DROP TABLE t1;
#
+ # MDEV-15624 Changing the default character set to utf8mb4 changes query evaluation in a very surprising way
+ #
+ SET NAMES utf8;
+ CREATE TABLE t1 (id INT);
+ INSERT INTO t1 VALUES (1),(2),(3);
+ SELECT COUNT(DISTINCT c) FROM (SELECT id, REPLACE(uuid_short(), '0', CAST('o' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1;
+ COUNT(DISTINCT c)
+ 3
+ SELECT DISTINCT REPLACE(uuid_short(), '0', CAST('o' AS CHAR CHARACTER SET ucs2)) AS c FROM t1;
+ c
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ SELECT COUNT(DISTINCT c) FROM (SELECT id, INSERT(uuid_short(), 1, 1, CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1;
+ COUNT(DISTINCT c)
+ 3
+ SELECT DISTINCT INSERT(uuid_short(), 1, 1, CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1;
+ c
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ SELECT COUNT(DISTINCT c) FROM (SELECT id, CONCAT(uuid_short(), CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1;
+ COUNT(DISTINCT c)
+ 3
+ SELECT DISTINCT CONCAT(uuid_short(), CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1;
+ c
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ DROP TABLE t1;
+ #
# End of 5.5 tests
#
+#
+# Start of 5.6 tests
+#
+#
+# WL#3664 WEIGHT_STRING
+#
+set collation_connection=ucs2_general_ci;
+select @@collation_connection;
+@@collation_connection
+ucs2_general_ci
+CREATE TABLE t1 AS SELECT 'a' AS a;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(1) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(2) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a)) FROM t1;
+HEX(WEIGHT_STRING(a))
+0041
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+0041
+DROP TABLE t2;
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a',5) AS a;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(5) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(10) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a)) FROM t1;
+HEX(WEIGHT_STRING(a))
+00410041004100410041
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+00410041004100410041
+DROP TABLE t2;
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(3)) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(6) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a AS CHAR(3))) FROM t1;
+HEX(WEIGHT_STRING(a AS CHAR(3)))
+004100410041
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+004100410041
+DROP TABLE t2;
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(10)) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(20) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a AS CHAR(10))) FROM t1;
+HEX(WEIGHT_STRING(a AS CHAR(10)))
+0041004100410041004100200020002000200020
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+0041004100410041004100200020002000200020
+DROP TABLE t2;
+DROP TABLE t1;
+select hex(weight_string('a'));
+hex(weight_string('a'))
+0041
+select hex(weight_string('A'));
+hex(weight_string('A'))
+0041
+select hex(weight_string('abc'));
+hex(weight_string('abc'))
+004100420043
+select hex(weight_string('abc' as char(2)));
+hex(weight_string('abc' as char(2)))
+00410042
+select hex(weight_string('abc' as char(3)));
+hex(weight_string('abc' as char(3)))
+004100420043
+select hex(weight_string('abc' as char(5)));
+hex(weight_string('abc' as char(5)))
+00410042004300200020
+select hex(weight_string('abc', 1, 2, 0xC0));
+hex(weight_string('abc', 1, 2, 0xC0))
+00
+select hex(weight_string('abc', 2, 2, 0xC0));
+hex(weight_string('abc', 2, 2, 0xC0))
+0041
+select hex(weight_string('abc', 3, 2, 0xC0));
+hex(weight_string('abc', 3, 2, 0xC0))
+004100
+select hex(weight_string('abc', 4, 2, 0xC0));
+hex(weight_string('abc', 4, 2, 0xC0))
+00410042
+select hex(weight_string('abc', 5, 2, 0xC0));
+hex(weight_string('abc', 5, 2, 0xC0))
+0041004200
+select hex(weight_string('abc',25, 2, 0xC0));
+hex(weight_string('abc',25, 2, 0xC0))
+00410042002000200020002000200020002000200020002000
+select hex(weight_string('abc', 1, 3, 0xC0));
+hex(weight_string('abc', 1, 3, 0xC0))
+00
+select hex(weight_string('abc', 2, 3, 0xC0));
+hex(weight_string('abc', 2, 3, 0xC0))
+0041
+select hex(weight_string('abc', 3, 3, 0xC0));
+hex(weight_string('abc', 3, 3, 0xC0))
+004100
+select hex(weight_string('abc', 4, 3, 0xC0));
+hex(weight_string('abc', 4, 3, 0xC0))
+00410042
+select hex(weight_string('abc', 5, 3, 0xC0));
+hex(weight_string('abc', 5, 3, 0xC0))
+0041004200
+select hex(weight_string('abc',25, 3, 0xC0));
+hex(weight_string('abc',25, 3, 0xC0))
+00410042004300200020002000200020002000200020002000
+select hex(weight_string('abc', 1, 4, 0xC0));
+hex(weight_string('abc', 1, 4, 0xC0))
+00
+select hex(weight_string('abc', 2, 4, 0xC0));
+hex(weight_string('abc', 2, 4, 0xC0))
+0041
+select hex(weight_string('abc', 3, 4, 0xC0));
+hex(weight_string('abc', 3, 4, 0xC0))
+004100
+select hex(weight_string('abc', 4, 4, 0xC0));
+hex(weight_string('abc', 4, 4, 0xC0))
+00410042
+select hex(weight_string('abc', 5, 4, 0xC0));
+hex(weight_string('abc', 5, 4, 0xC0))
+0041004200
+select hex(weight_string('abc',25, 4, 0xC0));
+hex(weight_string('abc',25, 4, 0xC0))
+00410042004300200020002000200020002000200020002000
+select @@collation_connection;
+@@collation_connection
+ucs2_general_ci
+select hex(weight_string(cast(_latin1 0x80 as char)));
+hex(weight_string(cast(_latin1 0x80 as char)))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char)));
+hex(weight_string(cast(_latin1 0x808080 as char)))
+20AC20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char) as char(2)));
+hex(weight_string(cast(_latin1 0x808080 as char) as char(2)))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char) as char(3)));
+hex(weight_string(cast(_latin1 0x808080 as char) as char(3)))
+20AC20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char) as char(5)));
+hex(weight_string(cast(_latin1 0x808080 as char) as char(5)))
+20AC20AC20AC00200020
+select hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0))
+20
+select hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0))
+20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0))
+20AC20AC00
+select hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0))
+20AC20AC002000200020002000200020002000200020002000
+select hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0))
+20
+select hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0))
+20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0))
+20AC20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0))
+20AC20AC20AC00200020002000200020002000200020002000
+select hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0))
+20
+select hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0))
+20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0))
+20AC20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0))
+20AC20AC20AC00200020002000200020002000200020002000
+select @@collation_connection;
+@@collation_connection
+ucs2_general_ci
+select hex(weight_string('a' LEVEL 1));
+hex(weight_string('a' LEVEL 1))
+0041
+select hex(weight_string('A' LEVEL 1));
+hex(weight_string('A' LEVEL 1))
+0041
+select hex(weight_string('abc' LEVEL 1));
+hex(weight_string('abc' LEVEL 1))
+004100420043
+select hex(weight_string('abc' as char(2) LEVEL 1));
+hex(weight_string('abc' as char(2) LEVEL 1))
+00410042
+select hex(weight_string('abc' as char(3) LEVEL 1));
+hex(weight_string('abc' as char(3) LEVEL 1))
+004100420043
+select hex(weight_string('abc' as char(5) LEVEL 1));
+hex(weight_string('abc' as char(5) LEVEL 1))
+00410042004300200020
+select hex(weight_string('abc' as char(5) LEVEL 1 REVERSE));
+hex(weight_string('abc' as char(5) LEVEL 1 REVERSE))
+20002000430042004100
+select hex(weight_string('abc' as char(5) LEVEL 1 DESC));
+hex(weight_string('abc' as char(5) LEVEL 1 DESC))
+FFBEFFBDFFBCFFDFFFDF
+select hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE));
+hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE))
+DFFFDFFFBCFFBDFFBEFF
+set collation_connection=ucs2_bin;
+select @@collation_connection;
+@@collation_connection
+ucs2_bin
+CREATE TABLE t1 AS SELECT 'a' AS a;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(1) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(2) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a)) FROM t1;
+HEX(WEIGHT_STRING(a))
+0061
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+0061
+DROP TABLE t2;
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a',5) AS a;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(5) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(10) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a)) FROM t1;
+HEX(WEIGHT_STRING(a))
+00610061006100610061
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+00610061006100610061
+DROP TABLE t2;
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(3)) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(6) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a AS CHAR(3))) FROM t1;
+HEX(WEIGHT_STRING(a AS CHAR(3)))
+006100610061
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+006100610061
+DROP TABLE t2;
+CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(10)) AS ws FROM t1;
+SHOW CREATE TABLE t2;
+Table Create Table
+t2 CREATE TABLE `t2` (
+ `ws` varbinary(20) DEFAULT NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+SELECT HEX(WEIGHT_STRING(a AS CHAR(10))) FROM t1;
+HEX(WEIGHT_STRING(a AS CHAR(10)))
+0061006100610061006100200020002000200020
+SELECT HEX(ws) FROM t2;
+HEX(ws)
+0061006100610061006100200020002000200020
+DROP TABLE t2;
+DROP TABLE t1;
+select hex(weight_string('a'));
+hex(weight_string('a'))
+0061
+select hex(weight_string('A'));
+hex(weight_string('A'))
+0041
+select hex(weight_string('abc'));
+hex(weight_string('abc'))
+006100620063
+select hex(weight_string('abc' as char(2)));
+hex(weight_string('abc' as char(2)))
+00610062
+select hex(weight_string('abc' as char(3)));
+hex(weight_string('abc' as char(3)))
+006100620063
+select hex(weight_string('abc' as char(5)));
+hex(weight_string('abc' as char(5)))
+00610062006300200020
+select hex(weight_string('abc', 1, 2, 0xC0));
+hex(weight_string('abc', 1, 2, 0xC0))
+00
+select hex(weight_string('abc', 2, 2, 0xC0));
+hex(weight_string('abc', 2, 2, 0xC0))
+0061
+select hex(weight_string('abc', 3, 2, 0xC0));
+hex(weight_string('abc', 3, 2, 0xC0))
+006100
+select hex(weight_string('abc', 4, 2, 0xC0));
+hex(weight_string('abc', 4, 2, 0xC0))
+00610062
+select hex(weight_string('abc', 5, 2, 0xC0));
+hex(weight_string('abc', 5, 2, 0xC0))
+0061006200
+select hex(weight_string('abc',25, 2, 0xC0));
+hex(weight_string('abc',25, 2, 0xC0))
+00610062002000200020002000200020002000200020002000
+select hex(weight_string('abc', 1, 3, 0xC0));
+hex(weight_string('abc', 1, 3, 0xC0))
+00
+select hex(weight_string('abc', 2, 3, 0xC0));
+hex(weight_string('abc', 2, 3, 0xC0))
+0061
+select hex(weight_string('abc', 3, 3, 0xC0));
+hex(weight_string('abc', 3, 3, 0xC0))
+006100
+select hex(weight_string('abc', 4, 3, 0xC0));
+hex(weight_string('abc', 4, 3, 0xC0))
+00610062
+select hex(weight_string('abc', 5, 3, 0xC0));
+hex(weight_string('abc', 5, 3, 0xC0))
+0061006200
+select hex(weight_string('abc',25, 3, 0xC0));
+hex(weight_string('abc',25, 3, 0xC0))
+00610062006300200020002000200020002000200020002000
+select hex(weight_string('abc', 1, 4, 0xC0));
+hex(weight_string('abc', 1, 4, 0xC0))
+00
+select hex(weight_string('abc', 2, 4, 0xC0));
+hex(weight_string('abc', 2, 4, 0xC0))
+0061
+select hex(weight_string('abc', 3, 4, 0xC0));
+hex(weight_string('abc', 3, 4, 0xC0))
+006100
+select hex(weight_string('abc', 4, 4, 0xC0));
+hex(weight_string('abc', 4, 4, 0xC0))
+00610062
+select hex(weight_string('abc', 5, 4, 0xC0));
+hex(weight_string('abc', 5, 4, 0xC0))
+0061006200
+select hex(weight_string('abc',25, 4, 0xC0));
+hex(weight_string('abc',25, 4, 0xC0))
+00610062006300200020002000200020002000200020002000
+select @@collation_connection;
+@@collation_connection
+ucs2_bin
+select hex(weight_string(cast(_latin1 0x80 as char)));
+hex(weight_string(cast(_latin1 0x80 as char)))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char)));
+hex(weight_string(cast(_latin1 0x808080 as char)))
+20AC20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char) as char(2)));
+hex(weight_string(cast(_latin1 0x808080 as char) as char(2)))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char) as char(3)));
+hex(weight_string(cast(_latin1 0x808080 as char) as char(3)))
+20AC20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char) as char(5)));
+hex(weight_string(cast(_latin1 0x808080 as char) as char(5)))
+20AC20AC20AC00200020
+select hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0))
+20
+select hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0))
+20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0))
+20AC20AC00
+select hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0))
+20AC20AC002000200020002000200020002000200020002000
+select hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0))
+20
+select hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0))
+20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0))
+20AC20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0))
+20AC20AC20AC00200020002000200020002000200020002000
+select hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0))
+20
+select hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0))
+20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0))
+20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0))
+20AC20AC
+select hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0))
+20AC20AC20
+select hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0));
+hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0))
+20AC20AC20AC00200020002000200020002000200020002000
+select @@collation_connection;
+@@collation_connection
+ucs2_bin
+select hex(weight_string('a' LEVEL 1));
+hex(weight_string('a' LEVEL 1))
+0061
+select hex(weight_string('A' LEVEL 1));
+hex(weight_string('A' LEVEL 1))
+0041
+select hex(weight_string('abc' LEVEL 1));
+hex(weight_string('abc' LEVEL 1))
+006100620063
+select hex(weight_string('abc' as char(2) LEVEL 1));
+hex(weight_string('abc' as char(2) LEVEL 1))
+00610062
+select hex(weight_string('abc' as char(3) LEVEL 1));
+hex(weight_string('abc' as char(3) LEVEL 1))
+006100620063
+select hex(weight_string('abc' as char(5) LEVEL 1));
+hex(weight_string('abc' as char(5) LEVEL 1))
+00610062006300200020
+select hex(weight_string('abc' as char(5) LEVEL 1 REVERSE));
+hex(weight_string('abc' as char(5) LEVEL 1 REVERSE))
+20002000630062006100
+select hex(weight_string('abc' as char(5) LEVEL 1 DESC));
+hex(weight_string('abc' as char(5) LEVEL 1 DESC))
+FF9EFF9DFF9CFFDFFFDF
+select hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE));
+hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE))
+DFFFDFFF9CFF9DFF9EFF
+#
+# Bug #36418 Character sets: crash if char(256 using utf32)
+#
+select hex(char(0x01 using ucs2));
+hex(char(0x01 using ucs2))
+0001
+select hex(char(0x0102 using ucs2));
+hex(char(0x0102 using ucs2))
+0102
+select hex(char(0x010203 using ucs2));
+hex(char(0x010203 using ucs2))
+00010203
+select hex(char(0x01020304 using ucs2));
+hex(char(0x01020304 using ucs2))
+01020304
+#
+# Bug#10094 Displays wrong error message for UNIQUE key index on CHAR(255) Unicode datatype
+#
+CREATE TABLE t1 (f1 CHAR(255) unicode);
+INSERT INTO t1 values ('abc'),('bcd'),('abc');
+ALTER TABLE t1 ADD UNIQUE Index_1 (f1);
+ERROR 23000: Duplicate entry 'abc' for key 'Index_1'
+DROP TABLE t1;
+#
+# Test how character set works with date/time
+#
+SET collation_connection=ucs2_general_ci;
+#
+# Bug#32390 Character sets: casting utf32 to/from date doesn't work
+#
+CREATE TABLE t1 AS SELECT repeat('a',20) AS s1 LIMIT 0;
+SET time_zone=_latin1'+03:00';
+SET timestamp=1216359724;
+INSERT INTO t1 VALUES (current_date);
+INSERT INTO t1 VALUES (current_time);
+INSERT INTO t1 VALUES (current_timestamp);
+SELECT s1, hex(s1) FROM t1;
+s1 hex(s1)
+2008-07-18 0032003000300038002D00300037002D00310038
+08:42:04 00300038003A00340032003A00300034
+2008-07-18 08:42:04 0032003000300038002D00300037002D00310038002000300038003A00340032003A00300034
+DROP TABLE t1;
+SET timestamp=0;
+SET time_zone=default;
+#
+# MDEV-5298 Illegal mix of collations on timestamp
+#
+SELECT CHARSET('2013-11-15 00:41:28' - INTERVAL 7 DAY);
+CHARSET('2013-11-15 00:41:28' - INTERVAL 7 DAY)
+ucs2
+SELECT COERCIBILITY('2013-11-15 00:41:28' - INTERVAL 7 DAY);
+COERCIBILITY('2013-11-15 00:41:28' - INTERVAL 7 DAY)
+4
+SELECT CHARSET(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY);
+CHARSET(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)
+binary
+SELECT COERCIBILITY(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY);
+COERCIBILITY(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)
+5
+SELECT CHARSET(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY));
+CHARSET(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY))
+ucs2
+SELECT COERCIBILITY(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY));
+COERCIBILITY(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY))
+4
+SELECT CHARSET(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY));
+CHARSET(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY))
+ucs2
+SELECT COERCIBILITY(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY));
+COERCIBILITY(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY))
+4
+SELECT CHARSET(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY));
+CHARSET(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY))
+ucs2
+SELECT COERCIBILITY(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY));
+COERCIBILITY(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY))
+4
+SELECT HEX(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY));
+HEX(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY))
+0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
+SELECT CHARSET(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY));
+CHARSET(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+ucs2
+SELECT COERCIBILITY(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY));
+COERCIBILITY(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+4
+SELECT HEX(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY));
+HEX(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
+CREATE TABLE t1 AS SELECT REPEAT('a', 64) AS a LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(64) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('');
+SELECT CHARSET(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
+CHARSET(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+ucs2
+SELECT COERCIBILITY(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
+COERCIBILITY(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+2
+SELECT HEX(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
+HEX(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
+SELECT CHARSET(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
+CHARSET(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+ucs2
+SELECT COERCIBILITY(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
+COERCIBILITY(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+2
+SELECT HEX(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
+HEX(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
+0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
+DROP TABLE t1;
+CREATE TABLE t1 (t TIMESTAMP NOT NULL);
+INSERT INTO t1 VALUES ('2001-01-01 00:00:00');
+SELECT * FROM t1 WHERE t < '2013-11-15 00:41:28' - INTERVAL 7 DAY;
+t
+2001-01-01 00:00:00
+SELECT * FROM t1 WHERE t = '2001-01-08 00:00:00' - INTERVAL 7 DAY;
+t
+2001-01-01 00:00:00
+SELECT * FROM t1 WHERE t < CONCAT('2013-11-15 00:41:28',LEFT(RAND(),0)) - INTERVAL 7 DAY;
+t
+2001-01-01 00:00:00
+SELECT * FROM t1 WHERE t = CONCAT('2001-01-08 00:00:00',LEFT(RAND(),0)) - INTERVAL 7 DAY;
+t
+2001-01-01 00:00:00
+SELECT * FROM t1 WHERE t < TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY;
+t
+2001-01-01 00:00:00
+SELECT * FROM t1 WHERE t = TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY;
+t
+2001-01-01 00:00:00
+DROP TABLE t1;
+SET NAMES latin1;
+#
+# WL#4013 Unicode german2 collation
+#
+SET collation_connection=ucs2_german2_ci;
+"BEGIN ctype_german.inc"
+drop table if exists t1;
+create table t1 as select repeat(' ', 64) as s1;
+select collation(s1) from t1;
+collation(s1)
+ucs2_german2_ci
+delete from t1;
+INSERT INTO t1 VALUES ('ud'),('uf');
+INSERT INTO t1 VALUES ('od'),('of');
+INSERT INTO t1 VALUES ('e');
+INSERT INTO t1 VALUES ('ad'),('af');
+insert into t1 values ('a'),('ae'),(_latin1 0xE4);
+insert into t1 values ('o'),('oe'),(_latin1 0xF6);
+insert into t1 values ('s'),('ss'),(_latin1 0xDF);
+insert into t1 values ('u'),('ue'),(_latin1 0xFC);
+INSERT INTO t1 VALUES (_latin1 0xE6), (_latin1 0xC6);
+INSERT INTO t1 VALUES (_latin1 0x9C), (_latin1 0x8C);
+select s1, hex(s1) from t1 order by s1, binary s1;
+s1 hex(s1)
+a 0061
+ad 00610064
+ae 00610065
+� 00C6
+� 00E4
+� 00E6
+af 00610066
+e 0065
+o 006F
+od 006F0064
+oe 006F0065
+� 00F6
+� 0152
+� 0153
+of 006F0066
+s 0073
+ss 00730073
+� 00DF
+u 0075
+ud 00750064
+ue 00750065
+� 00FC
+uf 00750066
+select group_concat(s1 order by binary s1) from t1 group by s1;
+group_concat(s1 order by binary s1)
+a
+ad
+ae,�,�,�
+af
+e
+o
+od
+oe,�,�,�
+of
+s
+ss,�
+u
+ud
+ue,�
+uf
+SELECT s1, hex(s1), hex(weight_string(s1)) FROM t1 ORDER BY s1, BINARY(s1);
+s1 hex(s1) hex(weight_string(s1))
+a 0061 0E33
+ad 00610064 0E330E6D
+ae 00610065 0E330E8B
+� 00C6 0E330E8B
+� 00E4 0E330E8B
+� 00E6 0E330E8B
+af 00610066 0E330EB9
+e 0065 0E8B
+o 006F 0F82
+od 006F0064 0F820E6D
+oe 006F0065 0F820E8B
+� 00F6 0F820E8B
+� 0152 0F820E8B
+� 0153 0F820E8B
+of 006F0066 0F820EB9
+s 0073 0FEA
+ss 00730073 0FEA0FEA
+� 00DF 0FEA0FEA
+u 0075 101F
+ud 00750064 101F0E6D
+ue 00750065 101F0E8B
+� 00FC 101F0E8B
+uf 00750066 101F0EB9
+SELECT s1, hex(s1) FROM t1 WHERE s1='ae' ORDER BY s1, BINARY(s1);
+s1 hex(s1)
+ae 00610065
+� 00C6
+� 00E4
+� 00E6
+drop table t1;
+CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a, 1 AS b LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `a` varchar(1) CHARACTER SET ucs2 COLLATE ucs2_german2_ci NOT NULL DEFAULT '',
+ `b` int(1) NOT NULL DEFAULT '0'
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('s',0),(_latin1 0xDF,1);
+SELECT * FROM t1 ORDER BY a, b;
+a b
+s 0
+� 1
+SELECT * FROM t1 ORDER BY a DESC, b;
+a b
+� 1
+s 0
+SELECT * FROM t1 ORDER BY CONCAT(a), b;
+a b
+s 0
+� 1
+SELECT * FROM t1 ORDER BY CONCAT(a) DESC, b;
+a b
+� 1
+s 0
+DROP TABLE t1;
+"END ctype_german.inc"
+#
+# Bug#59145 valgrind warnings for uninitialized values in my_strtoll10_mb2
+#
+SET NAMES latin1;
+SELECT CONVERT(CHAR(NULL USING ucs2), UNSIGNED);
+CONVERT(CHAR(NULL USING ucs2), UNSIGNED)
+0
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: ''
+DO IFNULL(CHAR(NULL USING ucs2), '');
+DO CAST(CONVERT('' USING ucs2) AS UNSIGNED);
+Warnings:
+Warning 1292 Truncated incorrect INTEGER value: ''
+#
+# Test error message for conversion using different charset
+#
+CREATE TABLE t1 (a DECIMAL(2,0));
+SET sql_mode='strict_all_tables';
+INSERT INTO t1 VALUES (CONVERT('9e99999999' USING ucs2));
+ERROR 22007: Incorrect decimal value: '9e99999999' for column 'a' at row 1
+SET sql_mode=DEFAULT;
+INSERT INTO t1 VALUES (CONVERT('aaa' USING ucs2));
+Warnings:
+Warning 1366 Incorrect decimal value: 'aaa' for column 'a' at row 1
+DROP TABLE t1;
+#
+# End of 5.6 tests
+#
+#
+# Start of 10.0 tests
+#
+SET NAMES latin1, collation_connection=ucs2_bin;
+#
+# MDEV-7149 Constant condition propagation erroneously applied for LIKE
+#
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('a'),('a ');
+SELECT * FROM t1 WHERE CONCAT(c1)='a';
+c1
+a
+a
+SELECT * FROM t1 WHERE CONCAT(c1) LIKE 'a ';
+c1
+a
+SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
+c1
+a
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where ((concat(`test`.`t1`.`c1`) = 'a') and (concat(`test`.`t1`.`c1`) like 'a '))
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('a'),('a ');
+SELECT * FROM t1 WHERE 'a'=CONCAT(c1);
+c1
+a
+a
+SELECT * FROM t1 WHERE 'a ' LIKE CONCAT(c1);
+c1
+a
+SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
+c1
+a
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('a' = concat(`test`.`t1`.`c1`)) and ('a ' like concat(`test`.`t1`.`c1`)))
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('%'),('% ');
+SELECT * FROM t1 WHERE '% '=CONCAT(c1);
+c1
+%
+%
+SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
+c1
+%
+SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+c1
+%
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('% ' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('%'),('% ');
+SELECT * FROM t1 WHERE '%'=CONCAT(c1);
+c1
+%
+%
+SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
+c1
+%
+SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+c1
+%
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('%' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
+DROP TABLE t1;
+SET NAMES latin1, collation_connection=ucs2_general_ci;
+#
+# MDEV-7149 Constant condition propagation erroneously applied for LIKE
+#
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('a'),('a ');
+SELECT * FROM t1 WHERE CONCAT(c1)='a';
+c1
+a
+a
+SELECT * FROM t1 WHERE CONCAT(c1) LIKE 'a ';
+c1
+a
+SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
+c1
+a
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where ((concat(`test`.`t1`.`c1`) = 'a') and (concat(`test`.`t1`.`c1`) like 'a '))
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('a'),('a ');
+SELECT * FROM t1 WHERE 'a'=CONCAT(c1);
+c1
+a
+a
+SELECT * FROM t1 WHERE 'a ' LIKE CONCAT(c1);
+c1
+a
+SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
+c1
+a
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('a' = concat(`test`.`t1`.`c1`)) and ('a ' like concat(`test`.`t1`.`c1`)))
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('%'),('% ');
+SELECT * FROM t1 WHERE '% '=CONCAT(c1);
+c1
+%
+%
+SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
+c1
+%
+SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+c1
+%
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('% ' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
+DROP TABLE t1;
+CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
+SHOW CREATE TABLE t1;
+Table Create Table
+t1 CREATE TABLE `t1` (
+ `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+INSERT INTO t1 VALUES ('%'),('% ');
+SELECT * FROM t1 WHERE '%'=CONCAT(c1);
+c1
+%
+%
+SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
+c1
+%
+SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+c1
+%
+EXPLAIN EXTENDED SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('%' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
+DROP TABLE t1;
+SET NAMES latin1;
+#
+# MDEV-6661 PI() does not work well in UCS2/UTF16/UTF32 context
+#
+SELECT CONCAT(CONVERT('pi=' USING ucs2),PI()) AS PI;
+PI
+pi=3.141593
+#
+# MDEV-6695 Bad column name for UCS2 string literals
+#
+SET NAMES utf8, character_set_connection=ucs2;
+SELECT 'a','aa';
+a aa
+a aa
+#
+# MDEV-10306 Wrong results with combination of CONCAT, SUBSTR and CONVERT in subquery
+#
+SET NAMES utf8, character_set_connection=ucs2;
+SET @save_optimizer_switch=@@optimizer_switch;
+SET optimizer_switch=_utf8'derived_merge=on';
+CREATE TABLE t1 (t VARCHAR(10) CHARSET latin1);
+INSERT INTO t1 VALUES('abcdefghi');
+SET NAMES utf8, character_set_connection=ucs2;
+SELECT CONCAT(t2,'-',t2) c2 FROM (SELECT HEX(t) t2 FROM t1) sub;
+c2
+616263646566676869-616263646566676869
+SELECT CONCAT(t2,'-',t2) c2 FROM (SELECT TO_BASE64(t) t2 FROM t1) sub;
+c2
+YWJjZGVmZ2hp-YWJjZGVmZ2hp
+DROP TABLE t1;
+SET optimizer_switch=@save_optimizer_switch;
+#
+# End of 10.0 tests
+#
diff --cc mysql-test/r/func_misc.result
index d54a70cab45,66e3cfd4ff4..ea3f57c6204
--- a/mysql-test/r/func_misc.result
+++ b/mysql-test/r/func_misc.result
@@@ -571,6 -571,20 +571,17 @@@ AND 57813X540X1723 = 'Test'
N AVG
0 NULL
drop table t1;
+ #
+ # MDEV-15630 uuid() function evaluates at wrong time in query
+ #
+ CREATE TABLE t1 (id INT);
+ INSERT INTO t1 VALUES (1),(2),(3);
+ SELECT COUNT(1), UUID() as uid FROM t1 GROUP BY uid;
+ COUNT(1) uid
+ 1 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ 1 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ 1 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ DROP TABLE t1;
-#
-# End of 5.5 tests
-#
SELECT NAME_CONST('a', -(1 OR 2)) OR 1;
ERROR HY000: Incorrect arguments to NAME_CONST
SELECT NAME_CONST('a', -(1 AND 2)) OR 1;
diff --cc mysql-test/r/view.result
index 5a51ea85f55,7fc3c48c3a0..4e3146052e9
--- a/mysql-test/r/view.result
+++ b/mysql-test/r/view.result
@@@ -5644,6 -5535,203 +5536,203 @@@ View Create View character_set_client c
v1 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select group_concat(`t1`.`str` separator '\\') AS `GROUP_CONCAT(str SEPARATOR '\\')` from `t1` latin1 latin1_swedish_ci
drop view v1;
drop table t1;
+ CREATE TABLE IF NOT EXISTS t0 (f0 INT);
+ CREATE TABLE IF NOT EXISTS t1 (f1 INT);
+ CREATE TABLE IF NOT EXISTS t2 (f2 INT);
+ CREATE TABLE IF NOT EXISTS t3 (f3 INT);
+ CREATE TABLE IF NOT EXISTS t4 (f4 INT);
+ CREATE TABLE IF NOT EXISTS t5 (f5 INT);
+ CREATE TABLE IF NOT EXISTS t6 (f6 INT);
+ CREATE TABLE IF NOT EXISTS t7 (f7 INT);
+ CREATE TABLE IF NOT EXISTS t8 (f8 INT);
+ CREATE TABLE IF NOT EXISTS t9 (f9 INT);
+ CREATE TABLE IF NOT EXISTS t10 (f10 INT);
+ CREATE TABLE IF NOT EXISTS t11 (f11 INT);
+ CREATE TABLE IF NOT EXISTS t12 (f12 INT);
+ CREATE TABLE IF NOT EXISTS t13 (f13 INT);
+ CREATE TABLE IF NOT EXISTS t14 (f14 INT);
+ CREATE TABLE IF NOT EXISTS t15 (f15 INT);
+ CREATE TABLE IF NOT EXISTS t16 (f16 INT);
+ CREATE TABLE IF NOT EXISTS t17 (f17 INT);
+ CREATE TABLE IF NOT EXISTS t18 (f18 INT);
+ CREATE TABLE IF NOT EXISTS t19 (f19 INT);
+ CREATE TABLE IF NOT EXISTS t20 (f20 INT);
+ CREATE TABLE IF NOT EXISTS t21 (f21 INT);
+ CREATE TABLE IF NOT EXISTS t22 (f22 INT);
+ CREATE TABLE IF NOT EXISTS t23 (f23 INT);
+ CREATE TABLE IF NOT EXISTS t24 (f24 INT);
+ CREATE TABLE IF NOT EXISTS t25 (f25 INT);
+ CREATE TABLE IF NOT EXISTS t26 (f26 INT);
+ CREATE TABLE IF NOT EXISTS t27 (f27 INT);
+ CREATE TABLE IF NOT EXISTS t28 (f28 INT);
+ CREATE TABLE IF NOT EXISTS t29 (f29 INT);
+ CREATE TABLE IF NOT EXISTS t30 (f30 INT);
+ CREATE TABLE IF NOT EXISTS t31 (f31 INT);
+ CREATE TABLE IF NOT EXISTS t32 (f32 INT);
+ CREATE TABLE IF NOT EXISTS t33 (f33 INT);
+ CREATE TABLE IF NOT EXISTS t34 (f34 INT);
+ CREATE TABLE IF NOT EXISTS t35 (f35 INT);
+ CREATE TABLE IF NOT EXISTS t36 (f36 INT);
+ CREATE TABLE IF NOT EXISTS t37 (f37 INT);
+ CREATE TABLE IF NOT EXISTS t38 (f38 INT);
+ CREATE TABLE IF NOT EXISTS t39 (f39 INT);
+ CREATE TABLE IF NOT EXISTS t40 (f40 INT);
+ CREATE TABLE IF NOT EXISTS t41 (f41 INT);
+ CREATE TABLE IF NOT EXISTS t42 (f42 INT);
+ CREATE TABLE IF NOT EXISTS t43 (f43 INT);
+ CREATE TABLE IF NOT EXISTS t44 (f44 INT);
+ CREATE TABLE IF NOT EXISTS t45 (f45 INT);
+ CREATE TABLE IF NOT EXISTS t46 (f46 INT);
+ CREATE TABLE IF NOT EXISTS t47 (f47 INT);
+ CREATE TABLE IF NOT EXISTS t48 (f48 INT);
+ CREATE TABLE IF NOT EXISTS t49 (f49 INT);
+ CREATE TABLE IF NOT EXISTS t50 (f50 INT);
+ CREATE TABLE IF NOT EXISTS t51 (f51 INT);
+ CREATE TABLE IF NOT EXISTS t52 (f52 INT);
+ CREATE TABLE IF NOT EXISTS t53 (f53 INT);
+ CREATE TABLE IF NOT EXISTS t54 (f54 INT);
+ CREATE TABLE IF NOT EXISTS t55 (f55 INT);
+ CREATE TABLE IF NOT EXISTS t56 (f56 INT);
+ CREATE TABLE IF NOT EXISTS t57 (f57 INT);
+ CREATE TABLE IF NOT EXISTS t58 (f58 INT);
+ CREATE TABLE IF NOT EXISTS t59 (f59 INT);
+ CREATE TABLE IF NOT EXISTS t60 (f60 INT);
+ CREATE OR REPLACE VIEW v60 AS SELECT * FROM t60;
+ EXPLAIN
+ SELECT t0.*
+ FROM t0
+ JOIN t1
+ ON t1.f1 = t0.f0
+ LEFT JOIN t2
+ ON t0.f0 = t2.f2
+ LEFT JOIN t3
+ ON t0.f0 = t3.f3
+ LEFT JOIN t4
+ ON t0.f0 = t4.f4
+ LEFT JOIN t5
+ ON t4.f4 = t5.f5
+ LEFT JOIN t6
+ ON t0.f0 = t6.f6
+ LEFT JOIN t7
+ ON t0.f0 = t7.f7
+ LEFT JOIN t8
+ ON t0.f0 = t8.f8
+ LEFT JOIN t9
+ ON t0.f0 = t9.f9
+ LEFT JOIN t10
+ ON t0.f0 = t10.f10
+ LEFT JOIN t11
+ ON t0.f0 = t11.f11
+ LEFT JOIN t12
+ ON t0.f0 = t12.f12
+ LEFT JOIN t13
+ ON t0.f0 = t13.f13
+ LEFT JOIN t14
+ ON t0.f0 = t14.f14
+ LEFT JOIN t15
+ ON t0.f0 = t15.f15
+ LEFT JOIN t16
+ ON t0.f0 = t16.f16
+ LEFT JOIN t17
+ ON t0.f0 = t17.f17
+ LEFT JOIN t18
+ ON t0.f0 = t18.f18
+ LEFT JOIN t19
+ ON t18.f18 = t19.f19
+ LEFT JOIN t20
+ ON t20.f20 = t19.f19
+ LEFT JOIN t21
+ ON t20.f20 = t21.f21
+ LEFT JOIN t22
+ ON t19.f19 = t22.f22
+ LEFT JOIN t23
+ ON t23.f23 = t0.f0
+ LEFT JOIN t24
+ ON t24.f24 = t23.f23
+ LEFT JOIN t25
+ ON t0.f0 = t25.f25
+ LEFT JOIN t26
+ ON t26.f26 = t0.f0
+ LEFT JOIN t27
+ ON t27.f27 = t0.f0
+ LEFT JOIN t28
+ ON t0.f0 = t28.f28
+ LEFT JOIN t29
+ ON t0.f0 = t29.f29
+ LEFT JOIN t30
+ ON t30.f30 = t0.f0
+ LEFT JOIN t31
+ ON t0.f0 = t31.f31
+ LEFT JOIN t32
+ ON t32.f32 = t31.f31
+ LEFT JOIN t33
+ ON t33.f33 = t0.f0
+ LEFT JOIN t34
+ ON t33.f33 = t34.f34
+ LEFT JOIN t35
+ ON t33.f33 = t35.f35
+ LEFT JOIN t36
+ ON t36.f36 = t0.f0
+ LEFT JOIN t37
+ ON t32.f32 = t37.f37
+ LEFT JOIN t38
+ ON t31.f31 = t38.f38
+ LEFT JOIN t39
+ ON t39.f39 = t0.f0
+ LEFT JOIN t40
+ ON t40.f40 = t39.f39
+ LEFT JOIN t41
+ ON t41.f41 = t0.f0
+ LEFT JOIN t42
+ ON t42.f42 = t41.f41
+ LEFT JOIN t43
+ ON t43.f43 = t41.f41
+ LEFT JOIN t44
+ ON t44.f44 = t0.f0
+ LEFT JOIN t45
+ ON t45.f45 = t0.f0
+ LEFT JOIN t46
+ ON t46.f46 = t0.f0
+ LEFT JOIN t47
+ ON t47.f47 = t0.f0
+ LEFT JOIN t48
+ ON t48.f48 = t0.f0
+ LEFT JOIN t49
+ ON t0.f0 = t49.f49
+ LEFT JOIN t50
+ ON t0.f0 = t50.f50
+ LEFT JOIN t51
+ ON t0.f0 = t51.f51
+ LEFT JOIN t52
+ ON t52.f52 = t0.f0
+ LEFT JOIN t53
+ ON t53.f53 = t0.f0
+ LEFT JOIN t54
+ ON t54.f54 = t0.f0
+ LEFT JOIN t55
+ ON t55.f55 = t0.f0
+ LEFT JOIN t56
+ ON t56.f56 = t0.f0
+ LEFT JOIN t57
+ ON t57.f57 = t0.f0
+ LEFT JOIN t58
+ ON t58.f58 = t57.f57
+ LEFT JOIN t59
+ ON t36.f36 = t59.f59
+ LEFT JOIN v60
+ ON t36.f36 = v60.f60
+ ;
+ id select_type table type possible_keys key key_len ref rows Extra
-1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
++1 PRIMARY NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables
+ 2 SUBQUERY NULL NULL NULL NULL NULL NULL NULL no matching row in const table
+ drop table t0, t1, t2, t3, t4, t5, t6, t7, t8, t9,
+ t10, t11, t12, t13, t14, t15, t16, t17, t18,
+ t19, t20, t21, t22, t23, t24, t25, t26, t27,
+ t28, t29, t30, t31, t32, t33, t34, t35, t36,
+ t37, t38, t39, t40, t41, t42, t43, t44, t45,
+ t46, t47, t48, t49, t50, t51, t52, t53, t54,
+ t55, t56, t57, t58, t59,t60;
+ drop view v60;
# -----------------------------------------------------------------
# -- End of 5.5 tests.
# -----------------------------------------------------------------
diff --cc mysql-test/suite/galera/disabled.def
index f9909914089,ad966ebab0d..78c3565b99d
--- a/mysql-test/suite/galera/disabled.def
+++ b/mysql-test/suite/galera/disabled.def
@@@ -1,49 -1,4 +1,51 @@@
-galera_var_dirty_reads : MDEV-12539
-query_cache : MDEV-12539
-MW-421 : MDEV-12539
-galera_concurrent_ctas : MDEV-12539
+##############################################################################
+#
+# List the test cases that are to be disabled temporarily.
+#
+# Separate the test case name and the comment with ':'.
+#
+# <testcasename> : MDEV-<xxxx> <comment>
+#
+# Do not use any TAB characters for whitespace.
+#
+##############################################################################
++
+MW-336 : MDEV-13549 Galera test failures
+galera_gra_log : MDEV-13549 Galera test failures
+galera_flush_local : MDEV-13549 Galera test failures
+galera_flush : MDEV-13549 Galera test failures
+MW-329 : MDEV-13549 Galera test failures
+galera_account_management : MariaDB 10.0 does not support ALTER USER
+galera_binlog_row_image : MariaDB 10.0 does not support binlog_row_image
+galera_binlog_rows_query_log_events: MariaDB does not support binlog_rows_query_log_events
+GAL-419 : MDEV-13549 Galera test failures
+galera_toi_ddl_fk_insert : MDEV-13549 Galera test failures
+galera_var_notify_cmd : MDEV-13549 Galera test failures
+galera_var_slave_threads : MDEV-13549 Galera test failures
+mysql-wsrep#90 : MDEV-13549 Galera test failures
+galera_as_master_gtid : Requires MySQL GTID
+galera_as_master_gtid_change_master : Requires MySQL GTID
+galera_as_slave_replication_bundle : MDEV-13549 Galera test failures
+galera_as_slave_preordered : wsrep-preordered feature not merged to MariaDB
+galera_gcs_fragment : MDEV-13549 Galera test failures
+galera_gcache_recover : MDEV-13549 Galera test failures
+galera_gcache_recover_full_gcache : MDEV-13549 Galera test failures
+galera_gcache_recover_manytrx : MDEV-13549 Galera test failures
+galera_ist_mysqldump : MDEV-13549 Galera test failures
+mysql-wsrep#31 : MDEV-13549 Galera test failures
+galera_migrate : MariaDB 10.0 does not support START SLAVE USER
+galera_concurrent_ctas : MDEV-13549 Galera test failures
+galera_bf_abort_for_update : MDEV-13549 Galera test failures
+galera_wsrep_desync_wsrep_on : MDEV-13549 Galera test failures
+galera_ssl_upgrade : MDEV-13549 Galera test failures
+mysql-wsrep#33 : MDEV-13549 Galera test failures
+galera_var_auto_inc_control_on : MDEV-13549 Galera test failures
+MW-44 : MDEV-13549 Galera test failures
+galera_var_retry_autocommit : MDEV-13549 Galera test failures
+pxc-421 : MDEV-13549 Galera test failures
+lp1376747-2 : MDEV-13549 Galera test failures
+lp1376747 : MDEV-13549 Galera test failures
+galera_toi_ddl_nonconflicting : MDEV-13549 Galera test failures
+galera_parallel_simple : MDEV-13549 Galera test failures
+galera_admin : MDEV-13549 Galera test failures
++MW-416 : MDEV-13549 Galera test failures
diff --cc mysql-test/suite/galera/r/galera_defaults.result
index b242a468f72,00000000000..e7a776e9047
mode 100644,000000..100644
--- a/mysql-test/suite/galera/r/galera_defaults.result
+++ b/mysql-test/suite/galera/r/galera_defaults.result
@@@ -1,119 -1,0 +1,119 @@@
+SELECT COUNT(*) = 40 FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%';
+COUNT(*) = 40
+1
+SELECT VARIABLE_NAME, VARIABLE_VALUE
+FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
+WHERE VARIABLE_NAME LIKE 'wsrep_%'
+AND VARIABLE_NAME NOT IN (
+'WSREP_PROVIDER_OPTIONS',
+'WSREP_SST_RECEIVE_ADDRESS',
+'WSREP_NODE_ADDRESS',
+'WSREP_NODE_NAME',
+'WSREP_PROVIDER',
+'WSREP_DATA_HOME_DIR',
+'WSREP_NODE_INCOMING_ADDRESS',
+'WSREP_START_POSITION'
+)
+ORDER BY VARIABLE_NAME;
+VARIABLE_NAME VARIABLE_VALUE
+WSREP_AUTO_INCREMENT_CONTROL ON
+WSREP_CAUSAL_READS ON
+WSREP_CERTIFY_NONPK ON
+WSREP_CLUSTER_ADDRESS gcomm://
+WSREP_CLUSTER_NAME my_wsrep_cluster
+WSREP_CONVERT_LOCK_TO_TRX OFF
+WSREP_DBUG_OPTION
+WSREP_DEBUG OFF
+WSREP_DESYNC OFF
+WSREP_DIRTY_READS OFF
+WSREP_DRUPAL_282555_WORKAROUND OFF
+WSREP_FORCED_BINLOG_FORMAT NONE
+WSREP_LOAD_DATA_SPLITTING ON
+WSREP_LOG_CONFLICTS OFF
+WSREP_MAX_WS_ROWS 0
+WSREP_MAX_WS_SIZE 2147483647
+WSREP_MYSQL_REPLICATION_BUNDLE 0
+WSREP_NOTIFY_CMD
+WSREP_ON ON
+WSREP_OSU_METHOD TOI
+WSREP_RECOVER OFF
+WSREP_REPLICATE_MYISAM OFF
+WSREP_RESTART_SLAVE OFF
+WSREP_RETRY_AUTOCOMMIT 1
+WSREP_SLAVE_FK_CHECKS ON
+WSREP_SLAVE_THREADS 1
+WSREP_SLAVE_UK_CHECKS OFF
+WSREP_SST_AUTH
+WSREP_SST_DONOR
+WSREP_SST_DONOR_REJECTS_QUERIES OFF
+WSREP_SST_METHOD rsync
+WSREP_SYNC_WAIT 15
- <BASE_DIR>; <BASE_HOST>; <BASE_PORT>; cert.log_conflicts = no; debug = no; evs.auto_evict = 0; evs.causal_keepalive_period = PT1S; evs.debug_log_mask = 0x1; evs.delay_margin = PT1S; evs.delayed_keep_period = PT30S; evs.inactive_check_period = PT0.5S; evs.inactive_timeout = PT15S; evs.info_log_mask = 0; evs.install_timeout = PT7.5S; evs.join_retrans_period = PT1S; evs.keepalive_period = PT1S; evs.max_install_timeouts = 3; evs.send_window = 4; evs.stats_report_period = PT1M; evs.suspect_timeout = PT10S; evs.use_aggregate = true; evs.user_send_window = 2; evs.version = 0; evs.view_forget_timeout = P1D; <GCACHE_DIR>; gcache.keep_pages_size = 0; gcache.mem_size = 0; <GCACHE_NAME>; gcache.page_size = 128M; gcache.recover = no; gcache.size = 10M; gcomm.thread_prio = ; gcs.fc_debug = 0; gcs.fc_factor = 1.0; gcs.fc_limit = 16; gcs.fc_master_slave = no; gcs.max_packet_size = 64500; gcs.max_throttle = 0.25; <GCS_RECV_Q_HARD_LIMIT>; gcs.recv_q_soft_limit = 0.25; gcs.sync_donor = no; <G
MCAST_LISTEN_ADDR>; gmcast.mcast_addr = ; gmcast.mcast_ttl = 1; gmcast.peer_timeout = PT3S; gmcast.segment = 0; gmcast.time_wait = PT5S; gmcast.version = 0; <IST_RECV_ADDR>; pc.announce_timeout = PT3S; pc.checksum = false; pc.ignore_quorum = false; pc.ignore_sb = false; pc.linger = PT20S; pc.npvo = false; pc.recovery = true; pc.version = 0; pc.wait_prim = true; pc.wait_prim_timeout = PT30S; pc.weight = 1; protonet.backend = asio; protonet.version = 0; repl.causal_read_timeout = PT90S; repl.commit_order = 3; repl.key_format = FLAT8; repl.max_ws_size = 2147483647; repl.proto_max = 7; socket.checksum = 2; socket.recv_buf_size = 212992;
++<BASE_DIR>; <BASE_HOST>; <BASE_PORT>; cert.log_conflicts = no; debug = no; evs.auto_evict = 0; evs.causal_keepalive_period = PT1S; evs.debug_log_mask = 0x1; evs.delay_margin = PT1S; evs.delayed_keep_period = PT30S; evs.inactive_check_period = PT0.5S; evs.inactive_timeout = PT15S; evs.info_log_mask = 0; evs.install_timeout = PT7.5S; evs.join_retrans_period = PT1S; evs.keepalive_period = PT1S; evs.max_install_timeouts = 3; evs.send_window = 4; evs.stats_report_period = PT1M; evs.suspect_timeout = PT10S; evs.use_aggregate = true; evs.user_send_window = 2; evs.version = 0; evs.view_forget_timeout = P1D; <GCACHE_DIR>; gcache.keep_pages_size = 0; gcache.mem_size = 0; <GCACHE_NAME>; gcache.page_size = 128M; gcache.recover = no; gcache.size = 10M; gcomm.thread_prio = ; gcs.fc_debug = 0; gcs.fc_factor = 1.0; gcs.fc_limit = 16; gcs.fc_master_slave = no; gcs.max_packet_size = 64500; gcs.max_throttle = 0.25; <GCS_RECV_Q_HARD_LIMIT>; gcs.recv_q_soft_limit = 0.25; gcs.sync_donor = no; <G
MCAST_LISTEN_ADDR>; gmcast.mcast_addr = ; gmcast.mcast_ttl = 1; gmcast.peer_timeout = PT3S; gmcast.segment = 0; gmcast.time_wait = PT5S; gmcast.version = 0; <IST_RECV_ADDR>; pc.announce_timeout = PT3S; pc.checksum = false; pc.ignore_quorum = false; pc.ignore_sb = false; pc.linger = PT20S; pc.npvo = false; pc.recovery = true; pc.version = 0; pc.wait_prim = true; pc.wait_prim_timeout = PT30S; pc.weight = 1; protonet.backend = asio; protonet.version = 0; repl.causal_read_timeout = PT90S; repl.commit_order = 3; repl.key_format = FLAT8; repl.max_ws_size = 2147483647; repl.proto_max = 8; socket.checksum = 2; socket.recv_buf_size = 212992;
+SELECT COUNT(*) FROM INFORMATION_SCHEMA.GLOBAL_STATUS
+WHERE VARIABLE_NAME LIKE 'wsrep_%'
+AND VARIABLE_NAME != 'wsrep_debug_sync_waiters';
+COUNT(*)
+58
+SELECT VARIABLE_NAME FROM INFORMATION_SCHEMA.GLOBAL_STATUS
+WHERE VARIABLE_NAME LIKE 'wsrep_%'
+AND VARIABLE_NAME != 'wsrep_debug_sync_waiters'
+ORDER BY VARIABLE_NAME;
+VARIABLE_NAME
+WSREP_APPLY_OOOE
+WSREP_APPLY_OOOL
+WSREP_APPLY_WINDOW
+WSREP_CAUSAL_READS
+WSREP_CERT_DEPS_DISTANCE
+WSREP_CERT_INDEX_SIZE
+WSREP_CERT_INTERVAL
+WSREP_CLUSTER_CONF_ID
+WSREP_CLUSTER_SIZE
+WSREP_CLUSTER_STATE_UUID
+WSREP_CLUSTER_STATUS
+WSREP_COMMIT_OOOE
+WSREP_COMMIT_OOOL
+WSREP_COMMIT_WINDOW
+WSREP_CONNECTED
+WSREP_DESYNC_COUNT
+WSREP_EVS_DELAYED
+WSREP_EVS_EVICT_LIST
+WSREP_EVS_REPL_LATENCY
+WSREP_EVS_STATE
+WSREP_FLOW_CONTROL_PAUSED
+WSREP_FLOW_CONTROL_PAUSED_NS
+WSREP_FLOW_CONTROL_RECV
+WSREP_FLOW_CONTROL_SENT
+WSREP_GCOMM_UUID
+WSREP_INCOMING_ADDRESSES
+WSREP_LAST_COMMITTED
+WSREP_LOCAL_BF_ABORTS
+WSREP_LOCAL_CACHED_DOWNTO
+WSREP_LOCAL_CERT_FAILURES
+WSREP_LOCAL_COMMITS
+WSREP_LOCAL_INDEX
+WSREP_LOCAL_RECV_QUEUE
+WSREP_LOCAL_RECV_QUEUE_AVG
+WSREP_LOCAL_RECV_QUEUE_MAX
+WSREP_LOCAL_RECV_QUEUE_MIN
+WSREP_LOCAL_REPLAYS
+WSREP_LOCAL_SEND_QUEUE
+WSREP_LOCAL_SEND_QUEUE_AVG
+WSREP_LOCAL_SEND_QUEUE_MAX
+WSREP_LOCAL_SEND_QUEUE_MIN
+WSREP_LOCAL_STATE
+WSREP_LOCAL_STATE_COMMENT
+WSREP_LOCAL_STATE_UUID
+WSREP_PROTOCOL_VERSION
+WSREP_PROVIDER_NAME
+WSREP_PROVIDER_VENDOR
+WSREP_PROVIDER_VERSION
+WSREP_READY
+WSREP_RECEIVED
+WSREP_RECEIVED_BYTES
+WSREP_REPLICATED
+WSREP_REPLICATED_BYTES
+WSREP_REPL_DATA_BYTES
+WSREP_REPL_KEYS
+WSREP_REPL_KEYS_BYTES
+WSREP_REPL_OTHER_BYTES
+WSREP_THREAD_COUNT
diff --cc mysql-test/suite/galera/r/galera_var_dirty_reads.result
index c469e49731d,8a3175912c7..405d86b3027
--- a/mysql-test/suite/galera/r/galera_var_dirty_reads.result
+++ b/mysql-test/suite/galera/r/galera_var_dirty_reads.result
@@@ -42,6 -88,8 +42,4 @@@ SELECT * FROM t1
i
1
DROP TABLE t1;
- set GLOBAL auto_increment_offset = 1;
- set GLOBAL auto_increment_offset = 2;
-drop user user1;
-drop user user2;
-disconnect node_2;
-disconnect node_1;
# End of test
diff --cc mysql-test/suite/galera/t/galera_var_dirty_reads.test
index 152c875a946,8fd3b1d22f2..df4c033ab3d
--- a/mysql-test/suite/galera/t/galera_var_dirty_reads.test
+++ b/mysql-test/suite/galera/t/galera_var_dirty_reads.test
@@@ -5,6 -5,14 +5,11 @@@
--source include/galera_cluster.inc
--source include/have_innodb.inc
---disable_query_log
+ # Save original auto_increment_offset values.
---connection node_1
-let $auto_increment_offset_node_1 = `SELECT @@global.auto_increment_offset`;
---connection node_2
-let $auto_increment_offset_node_2 = `SELECT @@global.auto_increment_offset`;
---enable_query_log
++--let $node_1=node_1
++--let $node_2=node_2
++--source include/auto_increment_offset_save.inc
+
--connection node_2
--let $wsrep_cluster_address_saved = `SELECT @@global.wsrep_cluster_address`
@@@ -64,10 -114,16 +69,8 @@@ USE test
SELECT * FROM t1;
# Cleanup
DROP TABLE t1;
- set GLOBAL auto_increment_offset = 1;
-drop user user1;
-drop user user2;
---disable_query_log
-# Restore original auto_increment_offset values.
---connection node_1
---eval SET @@global.auto_increment_offset = $auto_increment_offset_node_1;
----connection node_2
- set GLOBAL auto_increment_offset = 2;
---eval SET @@global.auto_increment_offset = $auto_increment_offset_node_2;
---enable_query_log
++--source include/auto_increment_offset_restore.inc
--source include/galera_end.inc
--echo # End of test
diff --cc mysql-test/suite/parts/r/partition_alter_maria.result
index fd09c0bd4bb,d79bc0a41fe..7d923570cfe
--- a/mysql-test/suite/parts/r/partition_alter_maria.result
+++ b/mysql-test/suite/parts/r/partition_alter_maria.result
@@@ -16,16 -16,16 +16,25 @@@ select * from t1
pk dt
1 2017-09-28 15:12:00
drop table t1;
+ create table t1 (a int) engine=Aria transactional=1 partition by hash(a) partitions 2;
+ show create table t1;
+ Table Create Table
+ t1 CREATE TABLE `t1` (
+ `a` int(11) DEFAULT NULL
+ ) ENGINE=Aria DEFAULT CHARSET=latin1 TRANSACTIONAL=1
+ /*!50100 PARTITION BY HASH (a)
+ PARTITIONS 2 */
+ drop table t1;
#
+# MDEV-14641 Incompatible key or row definition between the MariaDB .frm file and the information in the storage engine
+#
+CREATE TABLE t1 (i INT) ENGINE=Aria PARTITION BY LIST(i) (PARTITION p0 VALUES IN (1), PARTITION p1 VALUES IN (2));;
+ALTER TABLE t1 ROW_FORMAT=COMPRESSED;
+ALTER TABLE t1 DROP PARTITION p1;
+SELECT * FROM t1;
+i
+DROP TABLE t1;
+#
# MDEV-13788 Server crash when issuing bad SQL partition syntax
#
CREATE TABLE t1 (id int, d date) ENGINE=Aria PARTITION BY RANGE COLUMNS(d) (PARTITION p1 VALUES LESS THAN (MAXVALUE));
diff --cc mysql-test/t/func_misc.test
index dc7202268d6,c21630c0c7b..4afed7d6f6e
--- a/mysql-test/t/func_misc.test
+++ b/mysql-test/t/func_misc.test
@@@ -596,6 -596,22 +596,18 @@@ AND 57813X540X1723 = 'Test'
drop table t1;
+
+ --echo #
+ --echo # MDEV-15630 uuid() function evaluates at wrong time in query
+ --echo #
+
+ CREATE TABLE t1 (id INT);
+ INSERT INTO t1 VALUES (1),(2),(3);
+ --replace_column 2 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ SELECT COUNT(1), UUID() as uid FROM t1 GROUP BY uid;
+ DROP TABLE t1;
+
+
---echo #
---echo # End of 5.5 tests
---echo #
-
#
# Bug#12735545 - PARSER STACK OVERFLOW WITH NAME_CONST
# CONTAINING OR EXPRESSION
diff --cc scripts/wsrep_sst_xtrabackup-v2.sh
index 26119af2c61,00d8fe21113..f107cea6c74
--- a/scripts/wsrep_sst_xtrabackup-v2.sh
+++ b/scripts/wsrep_sst_xtrabackup-v2.sh
@@@ -1045,9 -863,9 +1045,9 @@@ the
wsrep_log_info "Cleaning the existing datadir and innodb-data/log directories"
- find $ib_home_dir $ib_log_dir $ib_undo_dir $DATA -mindepth 1 -regex $cpat -prune -o -exec rm -rfv {} 1>&2 \+
+ find $ib_home_dir $ib_log_dir $ib_undo_dir $DATA -mindepth 1 -prune -regex $cpat -o -exec rm -rfv {} 1>&2 \+
- tempdir=$(parse_cnf mysqld log-bin "")
+ tempdir=$(parse_cnf --mysqld log-bin "")
if [[ -n ${tempdir:-} ]];then
binlog_dir=$(dirname $tempdir)
binlog_file=$(basename $tempdir)
diff --cc sql/event_data_objects.cc
index e7bdc42b2e6,0cb123451df..aa85b570a84
--- a/sql/event_data_objects.cc
+++ b/sql/event_data_objects.cc
@@@ -1469,29 -1466,38 +1469,28 @@@ end
saved_master_access= thd->security_ctx->master_access;
thd->security_ctx->master_access |= SUPER_ACL;
+ bool save_tx_read_only= thd->tx_read_only;
+ thd->tx_read_only= false;
#ifdef WITH_WSREP
- if (WSREP(thd)) {
- // sql_print_information("sizeof(LEX) = %d", sizeof(struct LEX));
- // sizeof(LEX) = 4512, so it's relatively safe to allocate it on stack.
- LEX lex;
- LEX* saved = thd->lex;
- lex.sql_command = SQLCOM_DROP_EVENT;
- thd->lex = &lex;
- WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
- thd->lex = saved;
- /*
- This code is processing event execution and does not have client
- connection. Here, event execution will now execute a prepared
- DROP EVENT statement, but thd->lex->sql_command is set to
- SQLCOM_CREATE_PROCEDURE
- DROP EVENT will be logged in binlog, and we have to
- replicate it to make all nodes have consistent event definitions
- Wsrep DDL replication is triggered inside Events::drop_event(),
- and here we need to prepare the THD so that DDL replication is
- possible, essentially it requires setting sql_command to
- SQLCOMM_DROP_EVENT, we will switch sql_command for the duration
- of DDL replication only.
- */
- const enum_sql_command sql_command_save= thd->lex->sql_command;
+ const bool sql_command_set= WSREP(thd);
- if (WSREP(thd))
- {
++ const enum_sql_command sql_command_save= thd->lex->sql_command;
++
++ if (sql_command_set) {
+ thd->lex->sql_command = SQLCOM_DROP_EVENT;
}
#endif
-
+
ret= Events::drop_event(thd, dbname, name, FALSE);
#ifdef WITH_WSREP
- WSREP_TO_ISOLATION_END;
- error:
+ if (sql_command_set)
+ {
+ WSREP_TO_ISOLATION_END;
+ thd->lex->sql_command = sql_command_save;
+ }
#endif
+ thd->tx_read_only= save_tx_read_only;
thd->security_ctx->master_access= saved_master_access;
}
}
diff --cc sql/events.cc
index dd4e4887d50,a6379ec5a46..661d9e19001
--- a/sql/events.cc
+++ b/sql/events.cc
@@@ -405,10 -401,16 +406,14 @@@ Events::create_event(THD *thd, Event_pa
}
}
}
- /* Restore the state of binlog format */
- DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row());
- if (save_binlog_row_based)
- thd->set_current_stmt_binlog_format_row();
+
+ thd->restore_stmt_binlog_format(save_binlog_format);
DBUG_RETURN(ret);
+ #ifdef WITH_WSREP
+ error:
+ DBUG_RETURN(TRUE);
+ #endif /* WITH_WSREP */
}
@@@ -517,9 -521,16 +523,13 @@@ Events::update_event(THD *thd, Event_pa
ret= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
}
}
- /* Restore the state of binlog format */
- DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row());
- if (save_binlog_row_based)
- thd->set_current_stmt_binlog_format_row();
+ thd->restore_stmt_binlog_format(save_binlog_format);
DBUG_RETURN(ret);
+ #ifdef WITH_WSREP
+ error:
+ DBUG_RETURN(TRUE);
+ #endif /* WITH_WSREP */
}
@@@ -578,9 -591,15 +589,13 @@@ Events::drop_event(THD *thd, LEX_STRIN
DBUG_ASSERT(thd->query() && thd->query_length());
ret= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
}
- /* Restore the state of binlog format */
- DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row());
- if (save_binlog_row_based)
- thd->set_current_stmt_binlog_format_row();
+
+ thd->restore_stmt_binlog_format(save_binlog_format);
DBUG_RETURN(ret);
+ #ifdef WITH_WSREP
+ error:
+ DBUG_RETURN(TRUE);
+ #endif /* WITH_WSREP */
}
diff --cc sql/handler.cc
index 657cb01cbc8,7da373e6802..fc8bb53f35d
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@@ -4453,19 -3838,13 +4453,20 @@@ handler::ha_create(const char *name, TA
*/
int
-handler::ha_create_handler_files(const char *name, const char *old_name,
- int action_flag, HA_CREATE_INFO *info)
+handler::ha_create_partitioning_metadata(const char *name, const char *old_name,
+ int action_flag)
{
- if (!opt_readonly || !info || !(info->options & HA_LEX_CREATE_TMP_TABLE))
- mark_trx_read_write();
+ /*
+ Normally this is done when unlocked, but in fast_alter_partition_table,
+ it is done on an already locked handler when preparing to alter/rename
+ partitions.
+ */
+ DBUG_ASSERT(m_lock_type == F_UNLCK ||
+ (!old_name && strcmp(name, table_share->path.str)));
+
- return create_handler_files(name, old_name, action_flag, info);
+ mark_trx_read_write();
+
+ return create_partitioning_metadata(name, old_name, action_flag);
}
diff --cc sql/item_cmpfunc.h
index 6d81c7acc40,3c8cc71370d..6cd7e0e3e78
--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@@ -272,11 -268,13 +273,15 @@@ public
virtual void get_cache_parameters(List<Item> &parameters);
bool is_top_level_item();
bool eval_not_null_tables(uchar *opt_arg);
- void fix_after_pullout(st_select_lex *new_parent, Item **ref);
+ void fix_after_pullout(st_select_lex *new_parent, Item **ref, bool merge);
+ bool invisible_mode();
+ void reset_cache() { cache= NULL; }
virtual void print(String *str, enum_query_type query_type);
void restore_first_argument();
+ Item* get_wrapped_in_subselect_item()
+ {
+ return args[1];
+ }
};
class Comp_creator
diff --cc sql/item_func.h
index 7dea193c99b,57818228b98..b0ba87b4bd0
--- a/sql/item_func.h
+++ b/sql/item_func.h
@@@ -73,7 -66,7 +73,7 @@@ public
NOW_FUNC, TRIG_COND_FUNC,
SUSERVAR_FUNC, GUSERVAR_FUNC, COLLATE_FUNC,
EXTRACT_FUNC, CHAR_TYPECAST_FUNC, FUNC_SP, UDF_FUNC,
- NEG_FUNC, GSYSVAR_FUNC, DYNCOL_FUNC };
- NEG_FUNC, GSYSVAR_FUNC, IN_OPTIMIZER_FUNC };
++ NEG_FUNC, GSYSVAR_FUNC, IN_OPTIMIZER_FUNC, DYNCOL_FUNC };
enum optimize_type { OPTIMIZE_NONE,OPTIMIZE_KEY,OPTIMIZE_OP, OPTIMIZE_NULL,
OPTIMIZE_EQUAL };
enum Type type() const { return FUNC_ITEM; }
diff --cc sql/log.cc
index b63d72f0d4a,ca7833a0460..0098dd2ba3d
--- a/sql/log.cc
+++ b/sql/log.cc
@@@ -8589,9 -7042,10 +8589,9 @@@ int TC_LOG_MMAP::open(const char *opt_n
DBUG_ASSERT(opt_name && opt_name[0]);
tc_log_page_size= my_getpagesize();
- DBUG_ASSERT(TC_LOG_PAGE_SIZE % tc_log_page_size == 0);
fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
- if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR, MYF(0))) < 0)
+ if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR | O_CLOEXEC, MYF(0))) < 0)
{
if (my_errno != ENOENT)
goto err;
diff --cc sql/log_event.cc
index c57331df807,12489d6d7eb..e799f37ddae
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@@ -4314,38 -3832,22 +4347,38 @@@ int Query_log_event::do_apply_event(rpl
}
else
thd->variables.collation_database= thd->db_charset;
-
+
+ /*
+ Record any GTID in the same transaction, so slave state is
+ transactionally consistent.
+ */
+ if (current_stmt_is_commit)
{
- const CHARSET_INFO *cs= thd->charset();
- /*
- We cannot ask for parsing a statement using a character set
- without state_maps (parser internal data).
- */
- if (!cs->state_map)
+ thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
+ if (rgi->gtid_pending)
{
- rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR,
- ER_THD(thd, ER_SLAVE_FATAL_ERROR),
- "character_set cannot be parsed");
- thd->is_slave_error= true;
- goto end;
- }
- }
+ sub_id= rgi->gtid_sub_id;
+ rgi->gtid_pending= false;
+
+ gtid= rgi->current_gtid;
+ if (rpl_global_gtid_slave_state->record_gtid(thd, &gtid, sub_id,
+ true, false))
+ {
+ int errcode= thd->get_stmt_da()->sql_errno();
+ if (!is_parallel_retry_error(rgi, errcode))
+ rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
+ rgi->gtid_info(),
+ "Error during COMMIT: failed to update GTID state in "
+ "%s.%s: %d: %s",
+ "mysql", rpl_gtid_slave_state_table_name.str,
+ errcode,
+ thd->get_stmt_da()->message());
+ sub_id= 0;
+ thd->is_slave_error= 1;
+ goto end;
+ }
+ }
+ }
thd->table_map_for_update= (table_map)table_map_for_update;
thd->set_invoker(&user, &host);
@@@ -7703,7 -6540,12 +7752,12 @@@ User_var_log_event(const char* buf, uin
Old events will not have this extra byte, thence,
we keep the flags set to UNDEF_F.
*/
- uint bytes_read= ((val + val_len) - start);
+ uint bytes_read= ((val + val_len) - buf_start);
+ if (bytes_read > event_len)
+ {
+ error= true;
+ goto err;
+ }
if ((data_written - bytes_read) > 0)
{
flags= (uint) *(buf + UV_VAL_IS_NULL + UV_VAL_TYPE_SIZE +
diff --cc sql/mysqld.cc
index 8575709203c,4acfe57c684..f558b78104f
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@@ -1110,19 -962,10 +1110,20 @@@ static PSI_cond_info all_server_conds[]
{ &key_COND_wsrep_sst_init, "COND_wsrep_sst_init", PSI_FLAG_GLOBAL},
{ &key_COND_wsrep_sst_thread, "wsrep_sst_thread", 0},
{ &key_COND_wsrep_rollback, "COND_wsrep_rollback", PSI_FLAG_GLOBAL},
+ { &key_COND_wsrep_thd, "THD::COND_wsrep_thd", 0},
{ &key_COND_wsrep_replaying, "COND_wsrep_replaying", PSI_FLAG_GLOBAL},
#endif
- { &key_COND_flush_thread_cache, "COND_flush_thread_cache", PSI_FLAG_GLOBAL}
+ { &key_COND_flush_thread_cache, "COND_flush_thread_cache", PSI_FLAG_GLOBAL},
+ { &key_COND_rpl_thread, "COND_rpl_thread", 0},
+ { &key_COND_rpl_thread_queue, "COND_rpl_thread_queue", 0},
+ { &key_COND_rpl_thread_stop, "COND_rpl_thread_stop", 0},
+ { &key_COND_rpl_thread_pool, "COND_rpl_thread_pool", 0},
+ { &key_COND_parallel_entry, "COND_parallel_entry", 0},
+ { &key_COND_group_commit_orderer, "COND_group_commit_orderer", 0},
+ { &key_COND_prepare_ordered, "COND_prepare_ordered", 0},
+ { &key_COND_slave_init, "COND_slave_init", 0},
+ { &key_COND_wait_gtid, "COND_wait_gtid", 0},
+ { &key_COND_gtid_ignore_duplicates, "COND_gtid_ignore_duplicates", 0}
};
PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
diff --cc sql/mysqld.h
index 4af04a3df75,91fa2eda7fd..3bb9f35077e
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@@ -245,12 -219,12 +245,13 @@@ extern pthread_key(MEM_ROOT**,THR_MALLO
#ifdef HAVE_PSI_INTERFACE
#ifdef HAVE_MMAP
extern PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active,
- key_LOCK_pool;
+ key_LOCK_pool, key_LOCK_pending_checkpoint;
#endif /* HAVE_MMAP */
+
#ifdef WITH_WSREP
extern PSI_mutex_key key_LOCK_wsrep_thd;
- #endif /* WITH_WSREP */
+ extern PSI_cond_key key_COND_wsrep_thd;
+ #endif /* HAVE_WSREP */
#ifdef HAVE_OPENSSL
extern PSI_mutex_key key_LOCK_des_key_file;
diff --cc sql/slave.cc
index a633722db16,f370e3dd27f..3dee39ad65f
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@@ -4936,39 -3778,37 +4936,39 @@@ err_during_init
to avoid unneeded position re-init
*/
thd->temporary_tables = 0; // remove tempation from destructor to close them
- DBUG_ASSERT(thd->net.buff != 0);
- net_end(&thd->net); // destructor will not free it, because we are weird
- DBUG_ASSERT(rli->sql_thd == thd);
THD_CHECK_SENTRY(thd);
- rli->sql_thd= 0;
- set_thd_in_use_temporary_tables(rli); // (re)set sql_thd in use for saved temp tables
+ rli->sql_driver_thd= 0;
mysql_mutex_lock(&LOCK_thread_count);
- THD_CHECK_SENTRY(thd);
- delete thd;
+ thd->rgi_fake= thd->rgi_slave= NULL;
+ delete serial_rgi;
mysql_mutex_unlock(&LOCK_thread_count);
+
#ifdef WITH_WSREP
- /* if slave stopped due to node going non primary, we set global flag to
- trigger automatic restart of slave when node joins back to cluster
+ /*
+ If slave stopped due to node going non primary, we set global flag to
+ trigger automatic restart of slave when node joins back to cluster.
*/
- if (wsrep_node_dropped && wsrep_restart_slave)
- {
- if (wsrep_ready_get())
- {
- WSREP_INFO("Slave error due to node temporarily non-primary"
- "SQL slave will continue");
- wsrep_node_dropped= FALSE;
- mysql_mutex_unlock(&rli->run_lock);
- goto wsrep_restart_point;
- } else {
- WSREP_INFO("Slave error due to node going non-primary");
- WSREP_INFO("wsrep_restart_slave was set and therefore slave will be "
- "automatically restarted when node joins back to cluster");
- wsrep_restart_slave_activated= TRUE;
- }
- }
+ if (wsrep_node_dropped && wsrep_restart_slave)
+ {
- if (wsrep_ready)
++ if (wsrep_ready_get())
+ {
+ WSREP_INFO("Slave error due to node temporarily non-primary"
+ "SQL slave will continue");
+ wsrep_node_dropped= FALSE;
+ mysql_mutex_unlock(&rli->run_lock);
+ WSREP_DEBUG("wsrep_conflict_state now: %d", thd->wsrep_conflict_state);
+ WSREP_INFO("slave restart: %d", thd->wsrep_conflict_state);
+ thd->wsrep_conflict_state= NO_CONFLICT;
+ goto wsrep_restart_point;
+ } else {
+ WSREP_INFO("Slave error due to node going non-primary");
+ WSREP_INFO("wsrep_restart_slave was set and therefore slave will be "
+ "automatically restarted when node joins back to cluster.");
+ wsrep_restart_slave_activated= TRUE;
+ }
+ }
#endif /* WITH_WSREP */
+
/*
Note: the order of the broadcast and unlock calls below (first broadcast, then unlock)
is important. Otherwise a killer_thread can execute between the calls and
diff --cc sql/sql_class.cc
index 2502962cef0,ce875ba87ef..b3d964d4006
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@@ -4544,251 -4305,27 +4549,251 @@@ extern "C" int thd_slave_thread(const M
return(thd->slave_thread);
}
-extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
+/* Returns true for a worker thread in parallel replication. */
+extern "C" int thd_rpl_is_parallel(const MYSQL_THD thd)
{
- return(thd->transaction.all.modified_non_trans_table);
+ return thd->rgi_slave && thd->rgi_slave->is_parallel_exec;
}
-extern "C" int thd_binlog_format(const MYSQL_THD thd)
-{
-#ifdef WITH_WSREP
- if (WSREP(thd))
- {
- /* for wsrep binlog format is meaningful also when binlogging is off */
- return (int) WSREP_BINLOG_FORMAT(thd->variables.binlog_format);
- }
-#endif /* WITH_WSREP */
- if (mysql_bin_log.is_open() && (thd->variables.option_bits & OPTION_BIN_LOG))
- return (int) thd->variables.binlog_format;
- else
- return BINLOG_FORMAT_UNSPEC;
-}
+/*
+ This function can optionally be called to check if thd_report_wait_for()
+ needs to be called for waits done by a given transaction.
-extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all)
+ If this function returns false for a given thd, there is no need to do any
+ calls to thd_report_wait_for() on that thd.
+
+ This call is optional; it is safe to call thd_report_wait_for() in any case.
+ This call can be used to save some redundant calls to thd_report_wait_for()
+ if desired. (This is unlikely to matter much unless there are _lots_ of
+ waits to report, as the overhead of thd_report_wait_for() is small).
+*/
+extern "C" int
+thd_need_wait_for(const MYSQL_THD thd)
+{
+ rpl_group_info *rgi;
+
+ if (mysql_bin_log.is_open() && opt_binlog_commit_wait_count > 0)
+ return true;
+ if (!thd)
+ return false;
+ rgi= thd->rgi_slave;
+ if (!rgi)
+ return false;
+ return rgi->is_parallel_exec;
+}
+
+/*
+ Used by InnoDB/XtraDB to report that one transaction THD is about to go to
+ wait for a transactional lock held by another transactions OTHER_THD.
+
+ This is used for parallel replication, where transactions are required to
+ commit in the same order on the slave as they did on the master. If the
+ transactions on the slave encounters lock conflicts on the slave that did
+ not exist on the master, this can cause deadlocks.
+
+ Normally, such conflicts will not occur, because the same conflict would
+ have prevented the two transactions from committing in parallel on the
+ master, thus preventing them from running in parallel on the slave in the
+ first place. However, it is possible in case when the optimizer chooses a
+ different plan on the slave than on the master (eg. table scan instead of
+ index scan).
+
+ InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a
+ deadlock with the pre-determined commit order, we kill the later transaction,
+ and later re-try it, to resolve the deadlock.
+
+ This call need only receive reports about waits for locks that will remain
+ until the holding transaction commits. InnoDB/XtraDB auto-increment locks
+ are released earlier, and so need not be reported. (Such false positives are
+ not harmful, but could lead to unnecessary kill and retry, so best avoided).
+*/
+extern "C" void
+thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
+{
+ rpl_group_info *rgi;
+ rpl_group_info *other_rgi;
+
+ if (!thd || !other_thd)
+ return;
+ binlog_report_wait_for(thd, other_thd);
+ rgi= thd->rgi_slave;
+ other_rgi= other_thd->rgi_slave;
+ if (!rgi || !other_rgi)
+ return;
+ if (!rgi->is_parallel_exec)
+ return;
+ if (rgi->rli != other_rgi->rli)
+ return;
+ if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
+ return;
+ if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
+ return;
+ if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
+ return;
+ /*
+ This transaction is about to wait for another transaction that is required
+ by replication binlog order to commit after. This would cause a deadlock.
+
+ So send a kill to the other transaction, with a temporary error; this will
+ cause replication to rollback (and later re-try) the other transaction,
+ releasing the lock for this transaction so replication can proceed.
+ */
+ other_rgi->killed_for_retry= true;
+ mysql_mutex_lock(&other_thd->LOCK_thd_data);
+ other_thd->awake(KILL_CONNECTION);
+ mysql_mutex_unlock(&other_thd->LOCK_thd_data);
+}
+
+/*
+ This function is called from InnoDB/XtraDB to check if the commit order of
+ two transactions has already been decided by the upper layer. This happens
+ in parallel replication, where the commit order is forced to be the same on
+ the slave as it was originally on the master.
+
+ If this function returns false, it means that such commit order will be
+ enforced. This allows the storage engine to optionally omit gap lock waits
+ or similar measures that would otherwise be needed to ensure that
+ transactions would be serialised in a way that would cause a commit order
+ that is correct for binlogging for statement-based replication.
+
+ Since transactions are only run in parallel on the slave if they ran without
+ lock conflicts on the master, normally no lock conflicts on the slave happen
+ during parallel replication. However, there are a couple of corner cases
+ where it can happen, like these secondary-index operations:
+
+ T1: INSERT INTO t1 VALUES (7, NULL);
+ T2: DELETE FROM t1 WHERE b <= 3;
+
+ T1: UPDATE t1 SET secondary=NULL WHERE primary=1
+ T2: DELETE t1 WHERE secondary <= 3
+
+ The DELETE takes a gap lock that can block the INSERT/UPDATE, but the row
+ locks set by INSERT/UPDATE do not block the DELETE. Thus, the execution
+ order of the transactions determine whether a lock conflict occurs or
+ not. Thus a lock conflict can occur on the slave where it did not on the
+ master.
+
+ If this function returns true, normal locking should be done as required by
+ the binlogging and transaction isolation level in effect. But if it returns
+ false, the correct order will be enforced anyway, and InnoDB/XtraDB can
+ avoid taking the gap lock, preventing the lock conflict.
+
+ Calling this function is just an optimisation to avoid unnecessary
+ deadlocks. If it was not used, a gap lock would be set that could eventually
+ cause a deadlock; the deadlock would be caught by thd_report_wait_for() and
+ the transaction T2 killed and rolled back (and later re-tried).
+*/
+extern "C" int
+thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
+{
+ rpl_group_info *rgi, *other_rgi;
+
+ DBUG_EXECUTE_IF("disable_thd_need_ordering_with", return 1;);
+ if (!thd || !other_thd)
+ return 1;
+ rgi= thd->rgi_slave;
+ other_rgi= other_thd->rgi_slave;
+ if (!rgi || !other_rgi)
+ return 1;
+ if (!rgi->is_parallel_exec)
+ return 1;
+ if (rgi->rli != other_rgi->rli)
+ return 1;
+ if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
+ return 1;
+ if (!rgi->commit_id || rgi->commit_id != other_rgi->commit_id)
+ return 1;
+ DBUG_EXECUTE_IF("thd_need_ordering_with_force", return 1;);
+ /*
+ Otherwise, these two threads are doing parallel replication within the same
+ replication domain. Their commit order is already fixed, so we do not need
+ gap locks or similar to otherwise enforce ordering (and in fact such locks
+ could lead to unnecessary deadlocks and transaction retry).
+ */
+ return 0;
+}
+
+
+/*
+ If the storage engine detects a deadlock, and needs to choose a victim
+ transaction to roll back, it can call this function to ask the upper
+ server layer for which of two possible transactions is prefered to be
+ aborted and rolled back.
+
+ In parallel replication, if two transactions are running in parallel and
+ one is fixed to commit before the other, then the one that commits later
+ will be prefered as the victim - chosing the early transaction as a victim
+ will not resolve the deadlock anyway, as the later transaction still needs
+ to wait for the earlier to commit.
+
+ Otherwise, a transaction that uses only transactional tables, and can thus
+ be safely rolled back, will be prefered as a deadlock victim over a
+ transaction that also modified non-transactional (eg. MyISAM) tables.
+
+ The return value is -1 if the first transaction is prefered as a deadlock
+ victim, 1 if the second transaction is prefered, or 0 for no preference (in
+ which case the storage engine can make the choice as it prefers).
+*/
+extern "C" int
+thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2)
+{
+ rpl_group_info *rgi1, *rgi2;
+ bool nontrans1, nontrans2;
+
+ if (!thd1 || !thd2)
+ return 0;
+
+ /*
+ If the transactions are participating in the same replication domain in
+ parallel replication, then request to select the one that will commit
+ later (in the fixed commit order from the master) as the deadlock victim.
+ */
+ rgi1= thd1->rgi_slave;
+ rgi2= thd2->rgi_slave;
+ if (rgi1 && rgi2 &&
+ rgi1->is_parallel_exec &&
+ rgi1->rli == rgi2->rli &&
+ rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id)
+ return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1;
+
+ /*
+ If one transaction has modified non-transactional tables (so that it
+ cannot be safely rolled back), and the other has not, then prefer to
+ select the purely transactional one as the victim.
+ */
+ nontrans1= thd1->transaction.all.modified_non_trans_table;
+ nontrans2= thd2->transaction.all.modified_non_trans_table;
+ if (nontrans1 && !nontrans2)
+ return 1;
+ else if (!nontrans1 && nontrans2)
+ return -1;
+
+ /* No preferences, let the storage engine decide. */
+ return 0;
+}
+
+
+extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
+{
+ return(thd->transaction.all.modified_non_trans_table);
+}
+
+extern "C" int thd_binlog_format(const MYSQL_THD thd)
+{
+#ifdef WITH_WSREP
+ if (WSREP(thd))
+ {
+ /* for wsrep binlog format is meaningful also when binlogging is off */
+ return (int) WSREP_FORMAT(thd->variables.binlog_format);
+ }
+#endif /* WITH_WSREP */
+ if (mysql_bin_log.is_open() && (thd->variables.option_bits & OPTION_BIN_LOG))
+ return (int) thd->variables.binlog_format;
+ else
- return BINLOG_FORMAT_UNSPEC;
++ return BINLOG_FORMAT_UNSPEC;
+}
+
+extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all)
{
DBUG_ASSERT(thd);
thd->mark_transaction_to_rollback(all);
diff --cc sql/sql_class.h
index 0721252193d,cd1ac4fefd7..394575191e4
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@@ -2854,6 -2398,43 +2854,7 @@@ public
query_id_t first_query_id;
} binlog_evt_union;
-#ifdef WITH_WSREP
- const bool wsrep_applier; /* dedicated slave applier thread */
- bool wsrep_applier_closing; /* applier marked to close */
- bool wsrep_client_thread; /* to identify client threads*/
- enum wsrep_exec_mode wsrep_exec_mode;
- query_id_t wsrep_last_query_id;
- enum wsrep_query_state wsrep_query_state;
- enum wsrep_conflict_state wsrep_conflict_state;
- mysql_mutex_t LOCK_wsrep_thd;
+ mysql_cond_t COND_wsrep_thd;
- // changed from wsrep_seqno_t to wsrep_trx_meta_t in wsrep API rev 75
- // wsrep_seqno_t wsrep_trx_seqno;
- wsrep_trx_meta_t wsrep_trx_meta;
- uint32 wsrep_rand;
- Relay_log_info* wsrep_rli;
- bool wsrep_converted_lock_session;
- wsrep_ws_handle_t wsrep_ws_handle;
-#ifdef WSREP_PROC_INFO
- char wsrep_info[128]; /* string for dynamic proc info */
-#endif /* WSREP_PROC_INFO */
- ulong wsrep_retry_counter; // of autocommit
- bool wsrep_PA_safe;
- char* wsrep_retry_query;
- size_t wsrep_retry_query_len;
- enum enum_server_command wsrep_retry_command;
- enum wsrep_consistency_check_mode
- wsrep_consistency_check;
- wsrep_stats_var* wsrep_status_vars;
- int wsrep_mysql_replicated;
- THD* wsrep_bf_thd;
- const char* wsrep_TOI_pre_query; /* a query to apply before
- the actual TOI query */
- size_t wsrep_TOI_pre_query_len;
- void* wsrep_apply_format;
- bool wsrep_apply_toi; /* applier processing in TOI */
- wsrep_gtid_t wsrep_sync_wait_gtid;
-#endif /* WITH_WSREP */
/**
Internal parser state.
Note that since the parser is not re-entrant, we keep only one parser
diff --cc sql/sql_insert.cc
index af0321ce68f,64c9497fb7d..fa754d2da38
--- a/sql/sql_insert.cc
+++ b/sql/sql_insert.cc
@@@ -4324,18 -4304,47 +4324,47 @@@ bool select_create::send_eof(
abort_result_set();
DBUG_RETURN(true);
}
- else
+
+ /*
+ Do an implicit commit at end of statement for non-temporary
+ tables. This can fail, but we should unlock the table
+ nevertheless.
+ */
+ if (!table->s->tmp_table)
{
- /*
- Do an implicit commit at end of statement for non-temporary
- tables. This can fail, but we should unlock the table
- nevertheless.
- */
- if (!table->s->tmp_table)
- {
+ #ifdef WITH_WSREP
+ /*
+ append table level exclusive key for CTAS
+ */
+ wsrep_key_arr_t key_arr= {0, 0};
+ wsrep_prepare_keys_for_isolation(thd,
+ create_table->db,
+ create_table->table_name,
+ table_list,
+ &key_arr);
+ int rcode = wsrep->append_key(
+ wsrep,
+ &thd->wsrep_ws_handle,
+ key_arr.keys, //&wkey,
+ key_arr.keys_len,
+ WSREP_KEY_EXCLUSIVE,
+ false);
+ wsrep_keys_free(&key_arr);
+ if (rcode) {
+ DBUG_PRINT("wsrep", ("row key failed: %d", rcode));
+ WSREP_ERROR("Appending table key for CTAS failed: %s, %d",
+ (wsrep_thd_query(thd)) ?
+ wsrep_thd_query(thd) : "void", rcode);
+ return true;
+ }
+ /* If commit fails, we should be able to reset the OK status. */
- thd->stmt_da->can_overwrite_status= TRUE;
++ thd->get_stmt_da()->set_overwrite_status(TRUE);
+ #endif /* WITH_WSREP */
- trans_commit_stmt(thd);
+ trans_commit_stmt(thd);
+ if (!(thd->variables.option_bits & OPTION_GTID_BEGIN))
trans_commit_implicit(thd);
#ifdef WITH_WSREP
- thd->stmt_da->can_overwrite_status= FALSE;
++ thd->get_stmt_da()->set_overwrite_status(FALSE);
mysql_mutex_lock(&thd->LOCK_wsrep_thd);
if (thd->wsrep_conflict_state != NO_CONFLICT)
{
diff --cc sql/sql_parse.cc
index 6fe25961e65,553a6e7539d..f60134b6162
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@@ -1053,7 -851,7 +1053,7 @@@ bool do_command(THD *thd
* bail out if DB snapshot has not been installed. We however,
* allow queries "SET" and "SHOW", they are trapped later in execute_command
*/
- if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready &&
- if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready_get() &&
++ if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready_get() &&
command != COM_QUERY &&
command != COM_PING &&
command != COM_QUIT &&
@@@ -2750,37 -2474,12 +2750,38 @@@ mysql_execute_command(THD *thd
{
WSREP_SYNC_WAIT(thd, WSREP_SYNC_WAIT_BEFORE_SHOW);
execute_show_status(thd, all_tables);
+
-#ifdef WITH_WSREP
- if (lex->sql_command == SQLCOM_SHOW_STATUS) wsrep_free_status(thd);
-#endif /* WITH_WSREP */
break;
}
+ case SQLCOM_SHOW_EXPLAIN:
+ {
+ if (!thd->security_ctx->priv_user[0] &&
+ check_global_access(thd,PROCESS_ACL))
+ break;
+
+ /*
+ The select should use only one table, it's the SHOW EXPLAIN pseudo-table
+ */
+ if (lex->sroutines.records || lex->query_tables->next_global)
+ {
+ my_message(ER_SET_CONSTANTS_ONLY, ER(ER_SET_CONSTANTS_ONLY),
+ MYF(0));
+ goto error;
+ }
+
+ Item **it= lex->value_list.head_ref();
+ if (!(*it)->basic_const_item() ||
+ (!(*it)->fixed && (*it)->fix_fields(lex->thd, it)) ||
+ (*it)->check_cols(1))
+ {
+ my_message(ER_SET_CONSTANTS_ONLY, ER(ER_SET_CONSTANTS_ONLY),
+ MYF(0));
+ goto error;
+ }
+ }
+ /* fall through */
+ case SQLCOM_SHOW_STATUS_PROC:
+ case SQLCOM_SHOW_STATUS_FUNC:
case SQLCOM_SHOW_DATABASES:
case SQLCOM_SHOW_TABLES:
case SQLCOM_SHOW_TRIGGERS:
@@@ -3782,8 -3325,7 +3783,8 @@@ end_with_restore_list
case SQLCOM_INSERT_SELECT:
{
WSREP_SYNC_WAIT(thd, WSREP_SYNC_WAIT_BEFORE_INSERT_REPLACE);
- select_result *sel_result;
+ select_insert *sel_result;
+ bool explain= MY_TEST(lex->describe);
DBUG_ASSERT(first_table == all_tables && first_table != 0);
if ((res= insert_precheck(thd, all_tables)))
break;
@@@ -4386,10 -3860,9 +4386,9 @@@
lex->spname->m_name);
break;
case SQLCOM_DROP_EVENT:
- WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL)
if (!(res= Events::drop_event(thd,
lex->spname->m_db, lex->spname->m_name,
- lex->drop_if_exists)))
+ lex->check_exists)))
my_ok(thd);
break;
#else
diff --cc sql/sql_plugin.cc
index 013e00faeb9,81b59a5be90..b1ffa90dd2f
--- a/sql/sql_plugin.cc
+++ b/sql/sql_plugin.cc
@@@ -2082,11 -2084,20 +2082,14 @@@ bool mysql_install_plugin(THD *thd, con
bool error;
int argc=orig_argc;
char **argv=orig_argv;
+ unsigned long event_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE] =
+ { MYSQL_AUDIT_GENERAL_CLASSMASK };
DBUG_ENTER("mysql_install_plugin");
- if (opt_noacl)
- {
- my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--skip-grant-tables");
- DBUG_RETURN(TRUE);
- }
-
tables.init_one_table("mysql", 5, "plugin", 6, "plugin", TL_WRITE);
- if (check_table_access(thd, INSERT_ACL, &tables, FALSE, 1, FALSE))
+ if (!opt_noacl && check_table_access(thd, INSERT_ACL, &tables, FALSE, 1, FALSE))
DBUG_RETURN(TRUE);
+ WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
/* need to open before acquiring LOCK_plugin or it will deadlock */
if (! (table = open_ltable(thd, &tables, TL_WRITE,
@@@ -2218,12 -2232,21 +2224,15 @@@ bool mysql_uninstall_plugin(THD *thd, c
TABLE_LIST tables;
LEX_STRING dl= *dl_arg;
bool error= false;
+ unsigned long event_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE] =
+ { MYSQL_AUDIT_GENERAL_CLASSMASK };
DBUG_ENTER("mysql_uninstall_plugin");
- if (opt_noacl)
- {
- my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--skip-grant-tables");
- DBUG_RETURN(TRUE);
- }
-
tables.init_one_table("mysql", 5, "plugin", 6, "plugin", TL_WRITE);
- if (check_table_access(thd, DELETE_ACL, &tables, FALSE, 1, FALSE))
+ if (!opt_noacl && check_table_access(thd, DELETE_ACL, &tables, FALSE, 1, FALSE))
DBUG_RETURN(TRUE);
+ WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
/* need to open before acquiring LOCK_plugin or it will deadlock */
if (! (table= open_ltable(thd, &tables, TL_WRITE, MYSQL_LOCK_IGNORE_TIMEOUT)))
diff --cc sql/sql_view.cc
index 9fe4dd4849d,bbc5f002461..8fdd86535d1
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@@ -429,18 -432,9 +429,19 @@@ bool mysql_create_view(THD *thd, TABLE_
lex->link_first_table_back(view, link_to_local);
view->open_type= OT_BASE_ONLY;
+ WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
- if (open_and_lock_tables(thd, lex->query_tables, TRUE, 0))
+ /*
+ ignore lock specs for CREATE statement
+ */
+ if (lex->current_select->lock_type != TL_READ_DEFAULT)
+ {
+ lex->current_select->set_lock_for_tables(TL_READ_DEFAULT);
+ view->mdl_request.set_type(MDL_EXCLUSIVE);
+ }
+
+ if (open_temporary_tables(thd, lex->query_tables) ||
+ open_and_lock_tables(thd, lex->query_tables, TRUE, 0))
{
view= lex->unlink_first_table(&link_to_local);
res= TRUE;
@@@ -686,8 -722,12 +687,12 @@@
lex->link_first_table_back(view, link_to_local);
DBUG_RETURN(0);
+ #ifdef WITH_WSREP
+ error:
+ res= TRUE;
+ #endif /* WITH_WSREP */
err:
- thd_proc_info(thd, "end");
+ THD_STAGE_INFO(thd, stage_end);
lex->link_first_table_back(view, link_to_local);
unit->cleanup();
DBUG_RETURN(res || thd->is_error());
diff --cc sql/wsrep_hton.cc
index a9dbc1a17c2,1676daab5fe..0a2264ac03c
--- a/sql/wsrep_hton.cc
+++ b/sql/wsrep_hton.cc
@@@ -131,18 -115,30 +131,30 @@@ void wsrep_post_commit(THD* thd, bool a
wsrep_cleanup_transaction(thd);
break;
}
- case LOCAL_STATE:
- {
- /*
- Non-InnoDB statements may have populated events in stmt cache => cleanup
- */
- WSREP_DEBUG("cleanup transaction for LOCAL_STATE: %s", thd->query());
- wsrep_cleanup_transaction(thd);
- break;
- }
- default: break;
+ case LOCAL_STATE:
+ {
+ /* non-InnoDB statements may have populated events in stmt cache
+ => cleanup
+ */
+ WSREP_DEBUG("cleanup transaction for LOCAL_STATE");
+ /*
+ Run post-rollback hook to clean up in the case if
+ some keys were populated for the transaction in provider
+ but during commit time there was no write set to replicate.
+ This may happen when client sets the SAVEPOINT and immediately
+ rolls back to savepoint after first operation.
+ */
+ if (all && thd->wsrep_conflict_state != MUST_REPLAY &&
+ wsrep && wsrep->post_rollback(wsrep, &thd->wsrep_ws_handle))
+ {
+ WSREP_WARN("post_rollback fail: %llu %d",
- (long long)thd->thread_id, thd->stmt_da->status());
++ (long long)thd->thread_id, thd->get_stmt_da()->status());
+ }
+ wsrep_cleanup_transaction(thd);
+ break;
+ }
+ default: break;
}
-
}
/*
diff --cc sql/wsrep_mysqld.cc
index 49988287933,54fdf430f86..bd397a9a012
--- a/sql/wsrep_mysqld.cc
+++ b/sql/wsrep_mysqld.cc
@@@ -931,76 -1019,84 +932,76 @@@ static bool wsrep_prepare_key_for_isola
}
/* Prepare key list from db/table and table_list */
- static bool wsrep_prepare_keys_for_isolation(THD* thd,
- const char* db,
- const char* table,
- const TABLE_LIST* table_list,
- wsrep_key_arr_t* ka)
+ bool wsrep_prepare_keys_for_isolation(THD* thd,
+ const char* db,
+ const char* table,
+ const TABLE_LIST* table_list,
+ wsrep_key_arr_t* ka)
{
- ka->keys= 0;
- ka->keys_len= 0;
+ ka->keys= 0;
+ ka->keys_len= 0;
- extern TABLE* find_temporary_table(THD*, const TABLE_LIST*);
-
- if (db || table)
+ if (db || table)
+ {
+ if (!(ka->keys= (wsrep_key_t*)my_malloc(sizeof(wsrep_key_t), MYF(0))))
{
- TABLE_LIST tmp_table;
- bzero((char*) &tmp_table,sizeof(tmp_table));
- tmp_table.table_name= (char*)db;
- tmp_table.db= (char*)table;
- if (!table || !find_temporary_table(thd, &tmp_table))
- {
- if (!(ka->keys= (wsrep_key_t*)my_malloc(sizeof(wsrep_key_t), MYF(0))))
- {
- WSREP_ERROR("Can't allocate memory for key_array");
- goto err;
- }
- ka->keys_len= 1;
- if (!(ka->keys[0].key_parts= (wsrep_buf_t*)
- my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
- {
- WSREP_ERROR("Can't allocate memory for key_parts");
- goto err;
- }
- ka->keys[0].key_parts_num= 2;
- if (!wsrep_prepare_key_for_isolation(
- db, table,
- (wsrep_buf_t*)ka->keys[0].key_parts,
- &ka->keys[0].key_parts_num))
- {
- WSREP_ERROR("Preparing keys for isolation failed");
- goto err;
- }
- }
+ WSREP_ERROR("Can't allocate memory for key_array");
+ goto err;
+ }
+ ka->keys_len= 1;
+ if (!(ka->keys[0].key_parts= (wsrep_buf_t*)
+ my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
+ {
+ WSREP_ERROR("Can't allocate memory for key_parts");
+ goto err;
+ }
+ ka->keys[0].key_parts_num= 2;
+ if (!wsrep_prepare_key_for_isolation(
+ db, table,
+ (wsrep_buf_t*)ka->keys[0].key_parts,
+ &ka->keys[0].key_parts_num))
+ {
+ WSREP_ERROR("Preparing keys for isolation failed (1)");
+ goto err;
}
+ }
+
+ for (const TABLE_LIST* table= table_list; table; table= table->next_global)
+ {
+ wsrep_key_t* tmp;
+ if (ka->keys)
+ tmp= (wsrep_key_t*)my_realloc(ka->keys,
+ (ka->keys_len + 1) * sizeof(wsrep_key_t),
+ MYF(0));
+ else
+ tmp= (wsrep_key_t*)my_malloc((ka->keys_len + 1) * sizeof(wsrep_key_t), MYF(0));
- for (const TABLE_LIST* table= table_list; table; table= table->next_global)
+ if (!tmp)
{
- if (!find_temporary_table(thd, table))
- {
- wsrep_key_t* tmp;
- tmp= (wsrep_key_t*)my_realloc(
- ka->keys, (ka->keys_len + 1) * sizeof(wsrep_key_t), MYF(0));
- if (!tmp)
- {
- WSREP_ERROR("Can't allocate memory for key_array");
- goto err;
- }
- ka->keys= tmp;
- if (!(ka->keys[ka->keys_len].key_parts= (wsrep_buf_t*)
- my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
- {
- WSREP_ERROR("Can't allocate memory for key_parts");
- goto err;
- }
- ka->keys[ka->keys_len].key_parts_num= 2;
- ++ka->keys_len;
- if (!wsrep_prepare_key_for_isolation(
- table->db, table->table_name,
- (wsrep_buf_t*)ka->keys[ka->keys_len - 1].key_parts,
- &ka->keys[ka->keys_len - 1].key_parts_num))
- {
- WSREP_ERROR("Preparing keys for isolation failed");
- goto err;
- }
- }
+ WSREP_ERROR("Can't allocate memory for key_array");
+ goto err;
}
- return true;
+ ka->keys= tmp;
+ if (!(ka->keys[ka->keys_len].key_parts= (wsrep_buf_t*)
+ my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
+ {
+ WSREP_ERROR("Can't allocate memory for key_parts");
+ goto err;
+ }
+ ka->keys[ka->keys_len].key_parts_num= 2;
+ ++ka->keys_len;
+ if (!wsrep_prepare_key_for_isolation(table->db, table->table_name,
+ (wsrep_buf_t*)ka->keys[ka->keys_len - 1].key_parts,
+ &ka->keys[ka->keys_len - 1].key_parts_num))
+ {
+ WSREP_ERROR("Preparing keys for isolation failed (2)");
+ goto err;
+ }
+ }
+ return 0;
err:
wsrep_keys_free(ka);
- return false;
+ return 1;
}
diff --cc sql/wsrep_mysqld.h
index 6dabdb66022,56e3baae7cc..94c97f04aab
--- a/sql/wsrep_mysqld.h
+++ b/sql/wsrep_mysqld.h
@@@ -139,9 -125,17 +139,10 @@@ extern const char* wsrep_provider_name
extern const char* wsrep_provider_version;
extern const char* wsrep_provider_vendor;
-// Other wsrep global variables
-extern my_bool wsrep_inited; // whether wsrep is initialized ?
-
int wsrep_show_status(THD *thd, SHOW_VAR *var, char *buff);
+ int wsrep_show_ready(THD *thd, SHOW_VAR *var, char *buff);
void wsrep_free_status(THD *thd);
-/* Filters out --wsrep-new-cluster oprtion from argv[]
- * should be called in the very beginning of main() */
-void wsrep_filter_new_cluster (int* argc, char* argv[]);
-
int wsrep_init();
void wsrep_deinit(bool free_options);
void wsrep_recover();
@@@ -255,8 -245,7 +256,9 @@@ extern wsrep_seqno_t wsrep_locked_seqno
#define WSREP_PROVIDER_EXISTS \
(wsrep_provider && strncasecmp(wsrep_provider, WSREP_NONE, FN_REFLEN))
+#define WSREP_QUERY(thd) (thd->query())
+
+ extern my_bool wsrep_ready_get();
extern void wsrep_ready_wait();
enum wsrep_trx_status {
@@@ -332,11 -316,23 +334,22 @@@ int wsrep_create_trigger_query(THD *thd
int wsrep_create_event_query(THD *thd, uchar** buf, size_t* buf_len);
int wsrep_alter_event_query(THD *thd, uchar** buf, size_t* buf_len);
-struct xid_t;
-void wsrep_set_SE_checkpoint(xid_t*);
-void wsrep_get_SE_checkpoint(wsrep_uuid_t&, wsrep_seqno_t&);
-void wsrep_xid_init(xid_t*, const wsrep_uuid_t*, wsrep_seqno_t);
-const wsrep_uuid_t* wsrep_xid_uuid(const xid_t*);
-wsrep_seqno_t wsrep_xid_seqno(const xid_t*);
-extern "C" int wsrep_is_wsrep_xid(const void* xid);
+#ifdef GTID_SUPPORT
+void wsrep_init_sidno(const wsrep_uuid_t&);
+#endif /* GTID_SUPPORT */
+
+bool wsrep_node_is_donor();
+bool wsrep_node_is_synced();
+ typedef struct wsrep_key_arr
+ {
+ wsrep_key_t* keys;
+ size_t keys_len;
+ } wsrep_key_arr_t;
+ bool wsrep_prepare_keys_for_isolation(THD* thd,
+ const char* db,
+ const char* table,
+ const TABLE_LIST* table_list,
+ wsrep_key_arr_t* ka);
+ void wsrep_keys_free(wsrep_key_arr_t* key_arr);
#endif /* WSREP_MYSQLD_H */
diff --cc sql/wsrep_thd.cc
index 307745ff1b0,4d665775f2d..328bcbd6be6
--- a/sql/wsrep_thd.cc
+++ b/sql/wsrep_thd.cc
@@@ -381,7 -287,7 +381,7 @@@ static void wsrep_replication_process(T
case WSREP_TRX_MISSING:
/* these suggests a bug in provider code */
WSREP_WARN("bad return from recv() call: %d", rcode);
-- /* fall through to node shutdown */
++ /* fall through */
case WSREP_FATAL:
/* Cluster connectivity is lost.
*
diff --cc storage/heap/ha_heap.cc
index c1dad6a9943,ec76d08bf97..29bf924dc26
--- a/storage/heap/ha_heap.cc
+++ b/storage/heap/ha_heap.cc
@@@ -91,16 -100,7 +91,7 @@@ ha_heap::ha_heap(handlerton *hton, TABL
int ha_heap::open(const char *name, int mode, uint test_if_locked)
{
- if (table->s->reclength < sizeof (char*))
- {
- MEM_UNDEFINED(table->s->default_values + table->s->reclength,
- sizeof(char*) - table->s->reclength);
- table->s->reclength= sizeof(char*);
- MEM_UNDEFINED(table->record[0], table->s->reclength);
- MEM_UNDEFINED(table->record[1], table->s->reclength);
- }
-
- internal_table= test(test_if_locked & HA_OPEN_INTERNAL_TABLE);
+ internal_table= MY_TEST(test_if_locked & HA_OPEN_INTERNAL_TABLE);
if (internal_table || (!(file= heap_open(name, mode)) && my_errno == ENOENT))
{
HP_CREATE_INFO create_info;
@@@ -723,7 -727,7 +714,7 @@@ heap_prepare_hp_create_info(TABLE *tabl
}
}
}
- mem_per_row+= MY_ALIGN(share->reclength + 1, sizeof(char*));
- mem_per_row+= MY_ALIGN(max(share->reclength, sizeof(char*)) + 1, sizeof(char*));
++ mem_per_row+= MY_ALIGN(MY_MAX(share->reclength, sizeof(char*)) + 1, sizeof(char*));
if (table_arg->found_next_number_field)
{
keydef[share->next_number_index].flag|= HA_AUTO_KEY;
diff --cc storage/heap/hp_create.c
index d03c7c46f15,1daca0beeb5..29c031c466c
--- a/storage/heap/hp_create.c
+++ b/storage/heap/hp_create.c
@@@ -58,9 -59,9 +59,9 @@@ int heap_create(const char *name, HP_CR
/*
We have to store sometimes uchar* del_link in records,
- so the record length should be at least sizeof(uchar*)
+ so the visible_offset must be least at sizeof(uchar*)
*/
- set_if_bigger(reclength, sizeof (uchar*));
- visible_offset= max(reclength, sizeof (char*));
++ visible_offset= MY_MAX(reclength, sizeof (char*));
for (i= key_segs= max_length= 0, keyinfo= keydef; i < keys; i++, keyinfo++)
{
diff --cc storage/innobase/handler/ha_innodb.cc
index 5dbd7a1ca91,7aab200fed1..7e943782165
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@@ -1,10 -1,8 +1,10 @@@
/*****************************************************************************
- Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+ Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2012, Facebook Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2018, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@@ -1603,2924 -934,700 +1603,2928 @@@ innobase_release_temporary_latches
if (!innodb_inited) {
- return(0);
+ return(0);
+ }
+
+ trx_t* trx = thd_to_trx(thd);
+
+ if (trx != NULL) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ return(0);
+}
+
+/********************************************************************//**
+Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
+time calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+static inline
+void
+innobase_active_small(void)
+/*=======================*/
+{
+ innobase_active_counter++;
+
+ if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+ srv_active_wake_master_thread();
+ }
+}
+
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells to MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return MySQL error code */
+static
+int
+convert_error_code_to_mysql(
+/*========================*/
+ dberr_t error, /*!< in: InnoDB error code */
+ ulint flags, /*!< in: InnoDB table flags, or 0 */
+ THD* thd) /*!< in: user thread handle or NULL */
+{
+ switch (error) {
+ case DB_SUCCESS:
+ return(0);
+
+ case DB_INTERRUPTED:
+ return(HA_ERR_ABORTED_BY_USER);
+
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ ut_ad(thd);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_ROW_IS_REFERENCED,
+ "InnoDB: Cannot delete/update "
+ "rows with cascading foreign key "
+ "constraints that exceed max "
+ "depth of %d. Please "
+ "drop extra constraints and try "
+ "again", DICT_FK_MAX_RECURSIVE_LOAD);
+
+ /* fall through */
+
+ case DB_ERROR:
+ default:
+ return(-1); /* unspecified error */
+
+ case DB_DUPLICATE_KEY:
+ /* Be cautious with returning this error, since
+ mysql could re-enter the storage layer to get
+ duplicated key info, the operation requires a
+ valid table handle and/or transaction information,
+ which might not always be available in the error
+ handling stage. */
+ return(HA_ERR_FOUND_DUPP_KEY);
+
+ case DB_READ_ONLY:
+ return(HA_ERR_TABLE_READONLY);
+
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+
+ case DB_MISSING_HISTORY:
+ return(HA_ERR_TABLE_DEF_CHANGED);
+
+ case DB_RECORD_NOT_FOUND:
+ return(HA_ERR_NO_ACTIVE_RECORD);
+
+ case DB_DEADLOCK:
+ /* Since we rolled back the whole transaction, we must
+ tell it also to MySQL so that MySQL knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_DEADLOCK);
+
+ case DB_LOCK_WAIT_TIMEOUT:
+ /* Starting from 5.0.13, we let MySQL just roll back the
+ latest SQL statement in a lock wait timeout. Previously, we
+ rolled back the whole transaction. */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(
+ thd, (bool) row_rollback_on_timeout);
+ }
+
+ return(HA_ERR_LOCK_WAIT_TIMEOUT);
+
+ case DB_NO_REFERENCED_ROW:
+ return(HA_ERR_NO_REFERENCED_ROW);
+
+ case DB_ROW_IS_REFERENCED:
+ return(HA_ERR_ROW_IS_REFERENCED);
+
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_CHILD_NO_INDEX:
+ case DB_PARENT_NO_INDEX:
+ return(HA_ERR_CANNOT_ADD_FOREIGN);
+
+ case DB_CANNOT_DROP_CONSTRAINT:
+
+ return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
+ misleading, a new MySQL error
+ code should be introduced */
+
+ case DB_CORRUPTION:
+ return(HA_ERR_CRASHED);
+
+ case DB_OUT_OF_FILE_SPACE:
+ return(HA_ERR_RECORD_FILE_FULL);
+
+ case DB_TEMP_FILE_WRITE_FAILURE:
+ my_error(ER_GET_ERRMSG, MYF(0),
+ DB_TEMP_FILE_WRITE_FAILURE,
+ ut_strerr(DB_TEMP_FILE_WRITE_FAILURE),
+ "InnoDB");
+ return(HA_ERR_INTERNAL_ERROR);
+
+ case DB_TABLE_IN_FK_CHECK:
+ return(HA_ERR_TABLE_IN_FK_CHECK);
+
+ case DB_TABLE_IS_BEING_USED:
+ return(HA_ERR_WRONG_COMMAND);
+
+ case DB_TABLESPACE_DELETED:
+ case DB_TABLE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TABLESPACE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TOO_BIG_RECORD: {
+ /* If prefix is true then a 768-byte prefix is stored
+ locally for BLOB fields. Refer to dict_table_get_format() */
+ bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A);
+ my_printf_error(ER_TOO_BIG_ROWSIZE,
+ "Row size too large (> %lu). Changing some columns "
+ "to TEXT or BLOB %smay help. In current row "
+ "format, BLOB prefix of %d bytes is stored inline.",
+ MYF(0),
+ page_get_free_space_of_empty(flags &
+ DICT_TF_COMPACT) / 2,
+ prefix ? "or using ROW_FORMAT=DYNAMIC "
+ "or ROW_FORMAT=COMPRESSED ": "",
+ prefix ? DICT_MAX_FIXED_COL_LEN : 0);
+ return(HA_ERR_TO_BIG_ROW);
+ }
+
+
+ case DB_TOO_BIG_FOR_REDO:
+ my_printf_error(ER_TOO_BIG_ROWSIZE, "%s" , MYF(0),
+ "The size of BLOB/TEXT data inserted"
+ " in one transaction is greater than"
+ " 10% of redo log size. Increase the"
+ " redo log size using innodb_log_file_size.");
+ return(HA_ERR_TO_BIG_ROW);
+
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ return(HA_ERR_INDEX_COL_TOO_LONG);
+
+ case DB_NO_SAVEPOINT:
+ return(HA_ERR_NO_SAVEPOINT);
+
+ case DB_LOCK_TABLE_FULL:
+ /* Since we rolled back the whole transaction, we must
+ tell it also to MySQL so that MySQL knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_TABLE_FULL);
+
+ case DB_FTS_INVALID_DOCID:
+ return(HA_FTS_INVALID_DOCID);
+ case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
+ case DB_UNSUPPORTED:
+ return(HA_ERR_UNSUPPORTED);
+ case DB_INDEX_CORRUPT:
+ return(HA_ERR_INDEX_CORRUPT);
+ case DB_UNDO_RECORD_TOO_BIG:
+ return(HA_ERR_UNDO_REC_TOO_BIG);
+ case DB_OUT_OF_MEMORY:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TABLESPACE_EXISTS:
+ return(HA_ERR_TABLESPACE_EXISTS);
+ case DB_IDENTIFIER_TOO_LONG:
+ return(HA_ERR_INTERNAL_ERROR);
+ case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
+ return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE);
+ }
+}
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: MySQL THD object */
+ uint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ char buffer[1024];
+
+ fputs(thd_get_error_context_description(thd, buffer, sizeof buffer,
+ max_query_len), f);
+ putc('\n', f);
+}
+
+/******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+ int error_code) /*!< in: MySQL error code */
+{
+ return(my_get_err_msg(error_code));
+}
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */
+{
+ CHARSET_INFO* cs;
+ ut_ad(cset <= MAX_CHAR_COLL_NUM);
+ ut_ad(mbminlen);
+ ut_ad(mbmaxlen);
+
+ cs = all_charsets[cset];
+ if (cs) {
+ *mbminlen = cs->mbminlen;
+ *mbmaxlen = cs->mbmaxlen;
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ THD* thd = current_thd;
+
+ if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+
+ /* Fix bug#46256: allow tables to be dropped if the
+ collation is not found, but issue a warning. */
+ if ((global_system_variables.log_warnings)
+ && (cset != 0)){
+
+ sql_print_warning(
+ "Unknown collation #%lu.", cset);
+ }
+ } else {
+
+ ut_a(cset == 0);
+ }
+
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, FN_REFLEN, &my_charset_filename, to, (uint) len, &errors);
+}
+
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+return true when length of identifier is too long. */
+UNIV_INTERN
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id) /* in: FK identifier to check excluding the
+ database portion. */
+{
+ int well_formed_error = 0;
+ CHARSET_INFO *cs = system_charset_info;
+ DBUG_ENTER("innobase_check_identifier_length");
+
+ size_t len = cs->cset->well_formed_len(
+ cs, id, id + strlen(id),
+ NAME_CHAR_LEN, &well_formed_error);
+
+ if (well_formed_error || len == NAME_CHAR_LEN) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0), id);
+ DBUG_RETURN(true);
+ }
+ DBUG_RETURN(false);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, FN_REFLEN, system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >1 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b) /*!< in: second string to compare */
+{
+ if (!a) {
+ if (!b) {
+ return(0);
+ } else {
+ return(-1);
+ }
+ } else if (!b) {
+ return(1);
+ }
+
+ return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+ const char* a, /*!< in: string to compare */
+ const char* b) /*!< in: wildcard string to compare */
+{
+ return(wild_case_compare(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Strip dir name from a full path name and return only the file name
+@return file name or "null" if no file name */
+UNIV_INTERN
+const char*
+innobase_basename(
+/*==============*/
+ const char* path_name) /*!< in: full path name */
+{
+ const char* name = base_name(path_name);
+
+ return((name) ? name : "null");
+}
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ my_casedn_str(system_charset_info, a);
+}
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+ THD* mysql_thd) /*!< in: MySQL thread handle */
+{
+ return(thd_charset(mysql_thd));
+}
+
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+ THD* thd, /*!< in: MySQL thread handle */
+ size_t* length) /*!< out: length of the SQL statement */
+{
+ if (const LEX_STRING *stmt = thd_query_string(thd)) {
+ *length = stmt->length;
+ return stmt->str;
+ }
+ return NULL;
+}
+
+/**********************************************************************//**
+Get the current setting of the tdc_size global parameter. We do
+a dirty read because for one there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return value of tdc_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void)
+/*===============================*/
+{
+ return(tdc_size);
+}
+
+/**********************************************************************//**
+Get the current setting of the lower_case_table_names global parameter from
+mysqld.cc. This is a deliberately unsynchronized ("dirty") read: first,
+there is no synchronization object for the variable, and second, there is
+little harm in doing so even if we get a torn read.
+@return value of lower_case_table_names, converted to ulint */
+UNIV_INTERN
+ulint
+innobase_get_lower_case_table_names(void)
+/*=====================================*/
+{
+ return(lower_case_table_names);
+}
+
+/** Create a temporary file in the location specified by the parameter
+path. If the path is null, then it will be created in tmpdir.
+
+The descriptor obtained from mysql_tmpfile()/mysql_tmpfile_path() is
+duplicated, and the original is then closed with my_close(); the duplicate
+is what the caller receives.
+@param[in]	path	location for creating temporary file
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(
+	const char*	path)
+{
+#ifdef WITH_INNODB_DISALLOW_WRITES
+	os_event_wait(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+	int	fd2 = -1;
+	File	fd;
+
+	DBUG_EXECUTE_IF(
+		"innobase_tmpfile_creation_failure",
+		return(-1);
+	);
+
+	if (path == NULL) {
+		fd = mysql_tmpfile("ib");
+	} else {
+		fd = mysql_tmpfile_path(path, "ib");
+	}
+
+	if (fd >= 0) {
+		/* Copy the file descriptor, so that the additional resources
+		allocated by create_temp_file() can be freed by invoking
+		my_close().
+
+		Because the file descriptor returned by this function
+		will be passed to fdopen(), it will be closed by invoking
+		fclose(), which in turn will invoke close() instead of
+		my_close(). */
+
+#ifdef _WIN32
+		/* Note that on Windows, the integer returned by mysql_tmpfile
+		has no relation to C runtime file descriptor. Here, we need
+		to call my_get_osfhandle to get the HANDLE and then convert it
+		to C runtime filedescriptor. */
+		{
+			HANDLE	hFile = my_get_osfhandle(fd);
+			HANDLE	hDup;
+			BOOL	bOK = DuplicateHandle(
+				GetCurrentProcess(),
+				hFile, GetCurrentProcess(),
+				&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+			if (bOK) {
+				fd2 = _open_osfhandle((intptr_t) hDup, 0);
+			} else {
+				my_osmaperr(GetLastError());
+				fd2 = -1;
+			}
+		}
+#else
+#ifdef F_DUPFD_CLOEXEC
+		/* Where the platform offers it, duplicate with the
+		close-on-exec flag already set on the new descriptor. */
+		fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
+#else
+		fd2 = dup(fd);
+#endif
+#endif
+		if (fd2 < 0) {
+			DBUG_PRINT("error",("Got error %d on dup",fd2));
+			my_errno=errno;
+			my_error(EE_OUT_OF_FILERESOURCES,
+				 MYF(ME_BELL+ME_WAITTANG),
+				 "ib*", my_errno);
+		}
+		/* The original descriptor is released whether or not the
+		duplication succeeded. */
+		my_close(fd, MYF(MY_WME));
+	}
+	return(fd2);
+}
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function. The untyped buffers are
+cast to char pointers and both byte counts are narrowed to uint32 before
+being handed to copy_and_convert().
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert
+ from */
+ uint* errors) /*!< out: number of errors encountered
+ during the conversion */
+{
+ return(copy_and_convert(
+ (char*) to, (uint32) to_length, to_cs,
+ (const char*) from, (uint32) from_length, from_cs,
+ errors));
+}
+
+/*******************************************************************//**
+Formats raw column data (InnoDB on-disk format) of type
+DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) into "buf". The data is first
+converted from the character set identified by "charset_coll" to
+"system_charset_info" in a fixed-size scratch buffer, and the converted
+bytes are then passed through ut_str_sql_format().
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		charset_coll,	/*!< in: charset collation */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size)	/*!< in: output buffer size
+					in bytes */
+{
+	/* XXX a hard limit is used for the scratch space instead of
+	allocating buf_size bytes from the heap */
+	char		converted[8192];
+	uint		n_errors;
+	CHARSET_INFO*	src_cs = all_charsets[charset_coll];
+
+	const ulint	n_converted = innobase_convert_string(
+		converted, sizeof(converted), system_charset_info,
+		data, data_len, src_cs, &n_errors);
+
+	return(ut_str_sql_format(converted, n_converted, buf, buf_size));
+}
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+
+Summary of the computation as written below: with block = need * step,
+the result is ((current - offset) / step) * step + block + offset when
+current >= offset, and (block - step) + offset when current < offset.
+Whenever one of the range checks fails, the saturation value is returned
+instead. The max_value argument is only sanity-checked (> 0) and is then
+replaced by the largest ulonglong, so that is the saturation value.
+
+NOTE(review): need * step is computed without an overflow check; a
+product that wraps to 0 would trip ut_a(block > 0) -- confirm callers
+cannot pass such values.
+@return the next value */
+UNIV_INTERN
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+{
+ ulonglong next_value;
+ ulonglong block = need * step;
+
+ /* Should never be 0. */
+ ut_a(need > 0);
+ ut_a(block > 0);
+ ut_a(max_value > 0);
+
+ /*
+ Allow auto_increment to go over max_value up to max ulonglong.
+ This allows us to detect that all values are exhausted.
+ If we don't do this, we will return max_value several times
+ and get duplicate key errors instead of auto increment value
+ out of range.
+ Note that this unconditionally overrides the max_value argument.
+ */
+ max_value= (~(ulonglong) 0);
+
+ /* According to MySQL documentation, if the offset is greater than
+ the step then the offset is ignored.
+ NOTE(review): the comparison below is against block (need * step),
+ not against step itself -- confirm this is intended. */
+ if (offset > block) {
+ offset = 0;
+ }
+
+ /* Check for overflow. Current can be > max_value if the value is
+ in reality a negative value. The Visual Studio compilers convert
+ large double values automatically into the maximum value of the
+ unsigned long long datatype. */
+
+ if (block >= max_value
+ || offset > max_value
+ || current >= max_value
+ || max_value - offset <= offset) {
+
+ next_value = max_value;
+ } else {
+ ut_a(max_value > current);
+
+ ulonglong free = max_value - current;
+
+ if (free < offset || free - offset <= block) {
+ next_value = max_value;
+ } else {
+ next_value = 0;
+ }
+ }
+
+ if (next_value == 0) {
+ ulonglong next;
+
+ if (current >= offset) {
+ next = (current - offset) / step;
+ } else {
+ next = 0;
+ block -= step;
+ }
+
+ ut_a(max_value > next);
+ next_value = next * step;
+ /* Check for multiplication overflow. */
+ ut_a(next_value >= next);
+ ut_a(max_value > next_value);
+
+ /* Check for overflow */
+ if (max_value - next_value >= block) {
+
+ next_value += block;
+
+ if (max_value - next_value >= offset) {
+ next_value += offset;
+ } else {
+ next_value = max_value;
+ }
+ } else {
+ next_value = max_value;
+ }
+ }
+
+ ut_a(next_value != 0);
+ ut_a(next_value <= max_value);
+
+ return(next_value);
+}
+
+/*********************************************************************//**
+Refreshes the foreign-key and unique-secondary check flags of an InnoDB
+transaction object from the options of the owning MySQL thread: each flag
+is enabled exactly when the corresponding "skip the check" option is not
+set on the thread. */
+static
+void
+innobase_trx_init(
+/*==============*/
+	THD*	thd,	/*!< in: user thread handle */
+	trx_t*	trx)	/*!< in/out: InnoDB transaction handle */
+{
+	DBUG_ENTER("innobase_trx_init");
+	DBUG_ASSERT(thd == trx->mysql_thd);
+
+	const bool	skip_fk_checks = thd_test_options(
+		thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+	const bool	relaxed_unique = thd_test_options(
+		thd, OPTION_RELAXED_UNIQUE_CHECKS);
+
+	trx->check_foreigns = !skip_fk_checks;
+	trx->check_unique_secondary = !relaxed_unique;
+
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object for DML,
+binds it to the given thread handle and initializes its check flags
+through innobase_trx_init().
+@return InnoDB transaction handle */
+UNIV_INTERN
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+	THD*	thd)	/*!< in: user thread handle */
+{
+	DBUG_ENTER("innobase_trx_allocate");
+	DBUG_ASSERT(thd != NULL);
+	DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+	trx_t*	new_trx = trx_allocate_for_mysql();
+
+	new_trx->mysql_thd = thd;
+	innobase_trx_init(thd, new_trx);
+
+	DBUG_RETURN(new_trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object. If the
+MySQL thread struct does not have one yet, a transaction is allocated and
+stored as the thread's ha_data. An existing handle is validated against
+TRX_MAGIC_N; a mismatch is treated as memory corruption and is fatal.
+In both cases the check flags are refreshed via innobase_trx_init().
+@return InnoDB transaction handle */
+static inline
+trx_t*
+check_trx_exists(
+/*=============*/
+	THD*	thd)	/*!< in: user thread handle */
+{
+	trx_t*&	trx = thd_to_trx(thd);
+
+	if (trx != NULL) {
+		if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
+			mem_analyze_corruption(trx);
+			ut_error;
+		}
+	} else {
+		trx = innobase_trx_allocate(thd);
+		thd_set_ha_data(thd, innodb_hton_ptr, trx);
+	}
+
+	innobase_trx_init(thd, trx);
+
+	return(trx);
+}
+
+/*********************************************************************//**
+Check whether a transaction has been registered with the MySQL 2PC
+coordinator, i.e. whether its is_registered flag equals 1.
+@return true if transaction is registered with MySQL 2PC coordinator */
+static inline
+bool
+trx_is_registered_for_2pc(
+/*=========================*/
+ const trx_t* trx) /* in: transaction */
+{
+ return(trx->is_registered == 1);
+}
+
+/*********************************************************************//**
+Note that innobase_commit_ordered() was run, by setting the
+active_commit_ordered flag. The transaction must already be registered
+for 2PC; this is enforced with ut_a(). */
+static inline
+void
+trx_set_active_commit_ordered(
+/*==============================*/
+ trx_t* trx) /* in: transaction */
+{
+ ut_a(trx_is_registered_for_2pc(trx));
+ trx->active_commit_ordered = 1;
+}
+
+/*********************************************************************//**
+Note that a transaction has been registered with MySQL 2PC coordinator,
+by setting its is_registered flag. A ut_ad() check additionally expects
+active_commit_ordered to be 0 at this point. */
+static inline
+void
+trx_register_for_2pc(
+/*==================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered = 1;
+ ut_ad(trx->active_commit_ordered == 0);
+}
+
+/*********************************************************************//**
+Note that a transaction has been deregistered: clears both the
+is_registered and the active_commit_ordered flags. */
+static inline
+void
+trx_deregister_from_2pc(
+/*====================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered = 0;
+ trx->active_commit_ordered = 0;
+}
+
+/*********************************************************************//**
+Check whether a transaction has active_commit_ordered set.
+@return true if the active_commit_ordered flag equals 1 */
+static inline
+bool
+trx_is_active_commit_ordered(
+/*=========================*/
+ const trx_t* trx) /* in: transaction */
+{
+ return(trx->active_commit_ordered == 1);
+}
+
+/*********************************************************************//**
+Check if transaction is started.
+@return true if the transaction is in any state other than
+TRX_STATE_NOT_STARTED */
+static
+bool
+trx_is_started(
+/*===========*/
+ trx_t* trx) /* in: transaction */
+{
+ return(trx->state != TRX_STATE_NOT_STARTED);
+}
+
+/*********************************************************************//**
+Mirrors the statistics-related table flags of MySQL's HA_CREATE_INFO in an
+InnoDB table object: the persistent-stats on/off options, the
+auto-recalc setting and the number of sample pages.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB so we keep their copies into the
+InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const HA_CREATE_INFO*	create_info)	/*!< in: create info */
+{
+	/* Temp tables do not use persistent stats, so start from "off"
+	and consult the table options only for other tables. */
+	ibool	persistent_on = FALSE;
+	ibool	persistent_off = TRUE;
+
+	if (!dict_table_is_temporary(innodb_table)) {
+		persistent_on = create_info->table_options
+			& HA_OPTION_STATS_PERSISTENT;
+		persistent_off = create_info->table_options
+			& HA_OPTION_NO_STATS_PERSISTENT;
+	}
+
+	dict_stats_set_persistent(innodb_table, persistent_on,
+				  persistent_off);
+
+	const bool	recalc_on =
+		(create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON);
+	const bool	recalc_off =
+		(create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
+
+	innodb_table->stats_sample_pages = create_info->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Mirrors the statistics-related table flags of MySQL's TABLE_SHARE in an
+InnoDB table object: the persistent-stats on/off options, the
+auto-recalc setting and the number of sample pages.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB so we keep their copies into the
+InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const TABLE_SHARE*	table_share)	/*!< in: table share */
+{
+	/* Temp tables do not use persistent stats, so start from "off"
+	and consult the create options only for other tables. */
+	ibool	persistent_on = FALSE;
+	ibool	persistent_off = TRUE;
+
+	if (!dict_table_is_temporary(innodb_table)) {
+		persistent_on = table_share->db_create_options
+			& HA_OPTION_STATS_PERSISTENT;
+		persistent_off = table_share->db_create_options
+			& HA_OPTION_NO_STATS_PERSISTENT;
+	}
+
+	dict_stats_set_persistent(innodb_table, persistent_on,
+				  persistent_off);
+
+	const bool	recalc_on =
+		(table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON);
+	const bool	recalc_off =
+		(table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
+
+	innodb_table->stats_sample_pages = table_share->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Construct ha_innobase handler. The handlerton and table share are passed
+on to the handler base class, int_table_flags is set to the fixed mask of
+HA_* flags listed below, and the start_of_scan and num_write_row members
+are initialized to 0. */
+UNIV_INTERN
+ha_innobase::ha_innobase(
+/*=====================*/
+ handlerton* hton,
+ TABLE_SHARE* table_arg)
+ :handler(hton, table_arg),
+ int_table_flags(HA_REC_NOT_IN_SEQ |
+ HA_NULL_IN_KEY | HA_CAN_VIRTUAL_COLUMNS |
+ HA_CAN_INDEX_BLOBS |
+ HA_CAN_SQL_HANDLER |
+ HA_PRIMARY_KEY_REQUIRED_FOR_POSITION |
+ HA_PRIMARY_KEY_IN_READ_INDEX |
+ HA_BINLOG_ROW_CAPABLE |
+ HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
+ HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT |
+ HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
+ start_of_scan(0),
+ num_write_row(0)
+{}
+
+/*********************************************************************//**
+Destruct ha_innobase handler. The destructor body is empty. */
+UNIV_INTERN
+ha_innobase::~ha_innobase()
+/*======================*/
+{
+}
+
+/*********************************************************************//**
+Makes the given thd the user_thd of this handle. The InnoDB transaction
+of the thd is looked up (and allocated if it does not exist yet) through
+check_trx_exists(); when it differs from the transaction currently
+recorded in the prebuilt struct, the prebuilt struct is switched over to
+it with row_update_prebuilt_trx(). */
+UNIV_INTERN inline
+void
+ha_innobase::update_thd(
+/*====================*/
+	THD*	thd)	/*!< in: thd to use the handle */
+{
+	DBUG_ENTER("ha_innobase::update_thd");
+	DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
+		   user_thd, thd));
+
+	/* The table should have been opened in ha_innobase::open(). */
+	DBUG_ASSERT(prebuilt->table->n_ref_count > 0);
+
+	trx_t*	thd_trx = check_trx_exists(thd);
+
+	if (thd_trx != prebuilt->trx) {
+		row_update_prebuilt_trx(prebuilt, thd_trx);
+	}
+
+	user_thd = thd;
+
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. This overload takes no argument: it obtains the thd from
+ha_thd(), checks with ut_ad() that it is the current thd, and forwards to
+update_thd(THD*). */
+UNIV_INTERN
+void
+ha_innobase::update_thd()
+/*=====================*/
+{
+ THD* thd = ha_thd();
+
+ ut_ad(EQ_CURRENT_THD(thd));
+ update_thd(thd);
+}
+
+/*********************************************************************//**
+Registers an InnoDB transaction with the MySQL 2PC coordinator, so that
+the MySQL XA code knows to call the InnoDB prepare and commit, or rollback
+for the transaction. This MUST be called for every transaction for which
+the user may call commit or rollback. Calling this several times to register
+the same transaction is allowed, too. This function also registers the
+current SQL statement.
+
+trans_register_ha() is always called with FALSE; it is called a second
+time with TRUE only when the transaction is not yet flagged as registered
+and the thd has OPTION_NOT_AUTOCOMMIT or OPTION_BEGIN set. */
+static inline
+void
+innobase_register_trx(
+/*==================*/
+	handlerton*	hton,	/* in: Innobase handlerton */
+	THD*		thd,	/* in: MySQL thd (connection) object */
+	trx_t*		trx)	/* in: transaction to register */
+{
+	trans_register_ha(thd, FALSE, hton);
+
+	const bool	register_all = !trx_is_registered_for_2pc(trx)
+		&& thd_test_options(thd,
+				    OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
+
+	if (register_all) {
+		trans_register_ha(thd, TRUE, hton);
+	}
+
+	trx_register_for_2pc(trx);
+}
+
+/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+ ------------------------------------------------------------
+
+1) The use of the query cache for TBL is disabled when there is an
+uncommitted change to TBL.
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and does only allow such transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
+modification because of an ON DELETE CASCADE, we invalidate the MySQL query
+cache of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID to the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
+invalidation to the transaction commit.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd. Also the full_name which is used as key to search for the table
+object. The full_name is a string containing the normalized path to the
+table in the canonical format.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/******************************************************************//**
+The MySQL query cache uses this to check from InnoDB if the query cache at
+the moment is allowed to operate on an InnoDB table. The SQL query must
+be a non-locking SELECT.
+
+The query cache is allowed to operate on certain query only if this function
+returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of a SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB trx_sys->mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+	THD*	thd,		/*!< in: thd of the user who is trying to
+				store a result to the query cache or
+				retrieve it */
+	char*	full_name,	/*!< in: normalized path to the table */
+	uint	full_name_len,	/*!< in: length of the normalized path
+				to the table */
+	ulonglong *unused)	/*!< unused for this engine */
+{
+	char	norm_name[1000];
+
+	ut_a(full_name_len < 999);
+
+	trx_t*	trx = check_trx_exists(thd);
+
+	/* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+	plain SELECT if AUTOCOMMIT is not on, so the cache is off limits. */
+	if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+
+		return((my_bool)FALSE);
+	}
+
+	if (UNIV_UNLIKELY(trx->has_search_latch)) {
+		sql_print_error("The calling thread is holding the adaptive "
+				"search, latch though calling "
+				"innobase_query_caching_of_table_permitted.");
+		trx_print(stderr, trx, 1024);
+	}
+
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
+
+	const ibool	is_autocommit = thd_test_options(
+		thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) ? FALSE : TRUE;
+
+	if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
+		/* We are going to retrieve the query result from the query
+		cache. This cannot be a store operation to the query cache
+		because then MySQL would have locks on tables already.
+
+		TODO: if the user has used LOCK TABLES to lock the table,
+		then we open a transaction in the call of row_.. below.
+		That trx can stay open until UNLOCK TABLES. The same problem
+		exists even if we do not use the query cache. MySQL should be
+		modified so that it ALWAYS calls some cleanup function when
+		the processing of a query ends!
+
+		We can imagine we instantaneously serialize this consistent
+		read trx to the current trx id counter. If trx2 would have
+		changed the tables of a query result stored in the cache, and
+		trx2 would have already committed, making the result obsolete,
+		then trx2 would have already invalidated the cache. Thus we
+		can trust the result in the cache is ok for this query. */
+
+		return((my_bool)TRUE);
+	}
+
+	/* Normalize the table name to InnoDB format */
+	normalize_table_name(norm_name, full_name);
+
+	innobase_register_trx(innodb_hton_ptr, thd, trx);
+
+	const bool	permitted =
+		row_search_check_if_query_cache_permitted(trx, norm_name);
+
+	return(permitted ? (my_bool)TRUE : (my_bool)FALSE);
+}
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. The query cache key is
+built by decoding the database and table parts of full_name with
+filename_to_tablename(). When HAVE_QUERY_CACHE is not defined the function
+body is empty. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name, /*!< in: concatenation of
+ database name, null char NUL,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+ ulint full_name_len) /*!< in: full name length where
+ also the null chars count
+ (not referenced in the body) */
+{
+ /* Note that the sync0sync.h rank of the query cache mutex is just
+ above the InnoDB trx_sys_t->lock. The caller of this function must
+ not have latches of a lower rank. */
+
+#ifdef HAVE_QUERY_CACHE
+ char qcache_key_name[2 * (NAME_LEN + 1)];
+ size_t tabname_len;
+ size_t dbname_len;
+
+ /* Construct the key("db-name\0table$name\0") for the query cache using
+ the path name("db@002dname\0table@0024name\0") of the table in its
+ canonical form. The table part starts right after the NUL that
+ terminates the database part of full_name. */
+ dbname_len = filename_to_tablename(full_name, qcache_key_name,
+ sizeof(qcache_key_name));
+ tabname_len = filename_to_tablename(full_name + strlen(full_name) + 1,
+ qcache_key_name + dbname_len + 1,
+ sizeof(qcache_key_name)
+ - dbname_len - 1);
+
+ /* Argument TRUE below means we are using transactions; the key
+ length counts both names plus their two NUL bytes. */
+ mysql_query_cache_invalidate4(trx->mysql_thd,
+ qcache_key_name,
+ (dbname_len + tabname_len + 2),
+ TRUE);
+#endif
+}
+
+/*****************************************************************//**
+Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+
+For file_id=TRUE the name is decoded with explain_filename() and copied
+without quoting. Otherwise the quote character is '"' when thd is NULL,
+or whatever get_quote_char_for_identifier() returns; EOF from that call
+means "no quoting". Output that does not fit in buflen bytes is silently
+truncated, and no terminating NUL is written by this function.
+@return pointer to the end of buf, i.e. just past the last byte written
+(buf itself if nothing was written) */
+static
+char*
+innobase_convert_identifier(
+/*========================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool file_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an UTF-8 string */
+{
+ char nz2[MAX_TABLE_NAME_LEN + 1];
+ const char* s = id;
+ int q;
+
+ if (file_id) {
+
+ char nz[MAX_TABLE_NAME_LEN + 1];
+
+ /* Decode the table name. The MySQL function expects
+ a NUL-terminated string. The input and output strings
+ buffers must not be shared. */
+ ut_a(idlen <= MAX_TABLE_NAME_LEN);
+ memcpy(nz, id, idlen);
+ nz[idlen] = 0;
+
+ s = nz2;
+ idlen = explain_filename(thd, nz, nz2, sizeof nz2,
+ EXPLAIN_PARTITIONS_AS_COMMENT);
+ goto no_quote;
+ }
+
+ /* See if the identifier needs to be quoted. */
+ if (UNIV_UNLIKELY(!thd)) {
+ q = '"';
+ } else {
+ q = get_quote_char_for_identifier(thd, s, (int) idlen);
+ }
+
+ if (q == EOF) {
+no_quote:
+ if (UNIV_UNLIKELY(idlen > buflen)) {
+ idlen = buflen;
+ }
+ memcpy(buf, s, idlen);
+ return(buf + idlen);
+ }
+
+ /* Quote the identifier. At least two bytes are needed for the
+ opening and closing quote; with less room nothing is written. */
+ if (buflen < 2) {
+ return(buf);
+ }
+
+ *buf++ = q;
+ buflen--;
+
+ for (; idlen; idlen--) {
+ int c = *s++;
+ if (UNIV_UNLIKELY(c == q)) {
+ if (UNIV_UNLIKELY(buflen < 3)) {
+ break;
+ }
+
+ *buf++ = c;
+ *buf++ = c;
+ buflen -= 2;
+ } else {
+ if (UNIV_UNLIKELY(buflen < 2)) {
+ break;
+ }
+
+ *buf++ = c;
+ buflen--;
+ }
+ }
+
+ *buf++ = q;
+ return(buf);
+}
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+
+A table name containing '/' is emitted as two separately converted parts
+joined by '.'. An index name starting with TEMP_INDEX_PREFIX is converted
+without the prefix and followed by "--temporary--" when that still fits.
+Every other name is converted as a single identifier.
+@return pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	id,	/*!< in: identifier to convert */
+	ulint		idlen,	/*!< in: length of id, in bytes */
+	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
+	ibool		table_id)/*!< in: TRUE=id is a table or database name;
+				FALSE=id is an index name */
+{
+	if (table_id) {
+		const char*	slash = (const char*) memchr(id, '/', idlen);
+
+		if (slash != NULL) {
+			char*		out = buf;
+			const char*	bufend = buf + buflen;
+
+			/* Print the database name and table name
+			separately. */
+			out = innobase_convert_identifier(
+				out, bufend - out, id, slash - id,
+				thd, TRUE);
+
+			if (UNIV_LIKELY(out < bufend)) {
+				*out++ = '.';
+				out = innobase_convert_identifier(
+					out, bufend - out,
+					slash + 1, idlen
+					- (slash - id) - 1,
+					thd, TRUE);
+			}
+
+			return(out);
+		}
+
+		/* No database part: handled as a plain identifier below. */
+	} else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
+		/* Temporary index name (smart ALTER TABLE) */
+		const char temp_index_suffix[]= "--temporary--";
+		const ulint	suffix_len = sizeof temp_index_suffix - 1;
+
+		char*	out = innobase_convert_identifier(
+			buf, buflen, id + 1, idlen - 1, thd, FALSE);
+
+		if (out - buf + suffix_len < buflen) {
+			memcpy(out, temp_index_suffix, suffix_len);
+			out += suffix_len;
+		}
+
+		return(out);
+	}
+
+	return(innobase_convert_identifier(buf, buflen, id, idlen,
+					   thd, table_id));
+}
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(): converts a table or
+index name to the MySQL system_charset_info (UTF-8), quotes it if needed,
+and NUL-terminates the result in buf.
+
+innobase_convert_name() may fill all buflen bytes. The terminator index is
+therefore clamped to buflen - 1, so that a too-small buffer loses its last
+converted byte instead of being written one byte past its end. The ut_ad()
+checks still flag an undersized buffer. With buflen == 0 nothing is
+written. */
+UNIV_INTERN
+void
+innobase_format_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	name,	/*!< in: index or table name to format */
+	ibool		is_index_name) /*!< in: index name */
+{
+	ut_ad(buflen > 0);
+
+	if (buflen == 0) {
+		/* Not even the terminating NUL fits. */
+		return;
+	}
+
+	const char*	bufend = innobase_convert_name(
+		buf, buflen, name, strlen(name), NULL, !is_index_name);
+	ulint		len = (ulint) (bufend - buf);
+
+	ut_ad(len < buflen);
+
+	if (len >= buflen) {
+		/* Keep the terminator inside the buffer. */
+		len = buflen - 1;
+	}
+
+	buf[len] = '\0';
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted,
+i.e. whether thd_kill_level() reports a non-zero level for the MySQL
+thread that owns it. A NULL transaction, or one without a MySQL thread,
+is never considered interrupted.
+@return TRUE if interrupted */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	if (!trx || !trx->mysql_thd) {
+		return(FALSE);
+	}
+
+	return(thd_kill_level(trx->mysql_thd) ? TRUE : FALSE);
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode, i.e.
+whether the strict_mode session variable is set for the MySQL thread that
+owns it. A NULL transaction, or one without a MySQL thread, is never
+considered strict.
+@return TRUE if strict */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	if (!trx || !trx->mysql_thd) {
+		return(FALSE);
+	}
+
+	return(THDVAR(trx->mysql_thd, strict_mode) ? TRUE : FALSE);
+}
+
+/**************************************************************//**
+Resets some fields of a prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing.
+The key-read and FTS-query flags are always cleared; the index condition
+pushdown state and the template type are reset only when an index
+condition is currently set. */
+inline
+void
+ha_innobase::reset_template(void)
+/*=============================*/
+{
+	ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+	ut_ad(prebuilt->magic_n2 == prebuilt->magic_n);
+
+	/* Force table to be freed in close_thread_table(). This must
+	look at in_fts_query before the flag is cleared below. */
+	DBUG_EXECUTE_IF("free_table_in_fts_query",
+		if (prebuilt->in_fts_query) {
+			table->m_needs_reopen = true;
+		}
+	);
+
+	prebuilt->in_fts_query = 0;
+	prebuilt->read_just_key = 0;
+	prebuilt->keep_other_fields_on_keyread = 0;
+
+	if (!prebuilt->idx_cond) {
+		/* No index condition pushdown state to reset. */
+		return;
+	}
+
+	/* Reset index condition pushdown state. */
+	prebuilt->idx_cond = NULL;
+	prebuilt->idx_cond_n_cols = 0;
+	/* Invalidate prebuilt->mysql_template
+	in ha_innobase::write_row(). */
+	prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_idx() etc. Actually, we can let the cursor stay open even
+over a transaction commit! Then you should call this before every operation,
+fetch next etc. This function inits the necessary things even after a
+transaction commit: it binds the handle to the current thd, starts the
+transaction if needed, assigns a read view, registers the transaction and
+sets up the prebuilt struct for non-locking reads of all columns. */
+UNIV_INTERN
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+ /* If current thd does not yet have a trx struct, create one.
+ If the current handle does not yet have a prebuilt struct, create
+ one. Update the trx pointers in the prebuilt struct. Normally
+ this operation is done in external_lock. */
+
+ update_thd(ha_thd());
+
+ /* Initialize the prebuilt struct much like it would be inited in
+ external_lock */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ innobase_srv_conc_force_exit_innodb(prebuilt->trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started_xa(prebuilt->trx);
+
+ /* Assign a read view if the transaction does not have it yet */
+
+ trx_assign_read_view(prebuilt->trx);
+
+ innobase_register_trx(ht, user_thd, prebuilt->trx);
+
+ /* We did the necessary inits in this function, no need to repeat them
+ in row_search_for_mysql */
+
+ prebuilt->sql_stat_start = FALSE;
+
+ /* We always let HANDLER do its reads as consistent reads, even
+ if the trx isolation level has been specified as SERIALIZABLE */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+
+ /* Always fetch all columns in the index record */
+
+ prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+ /* We want always to fetch all columns in the whole row? Or do
+ we???? */
+
+ prebuilt->used_in_HANDLER = TRUE;
+ reset_template();
+}
+
+/****************************************************************//**
+Gives the file extensions of an InnoDB single-table tablespace, as a
+NullS-terminated list: ".ibd" and ".isl". */
+static const char* ha_innobase_exts[] = {
+ ".ibd",
+ ".isl",
+ NullS
+};
+
+/*********************************************************************//**
+Opens an InnoDB database.
+@return 0 on success, error code on failure */
+static
+int
+innobase_init(
+/*==========*/
+ void *p) /*!< in: InnoDB handlerton */
+{
+ static char current_dir[3]; /*!< Set if using current lib */
+ int err;
+ bool ret;
+ char *default_path;
+ uint format_id;
+ ulong num_pll_degree;
+
+ DBUG_ENTER("innobase_init");
+ handlerton *innobase_hton= (handlerton*) p;
+ innodb_hton_ptr = innobase_hton;
+
+ innobase_hton->state = SHOW_OPTION_YES;
+ innobase_hton->db_type= DB_TYPE_INNODB;
+ innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+ innobase_hton->close_connection = innobase_close_connection;
+ innobase_hton->savepoint_set = innobase_savepoint;
+ innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+ innobase_hton->savepoint_rollback_can_release_mdl =
+ innobase_rollback_to_savepoint_can_release_mdl;
+ innobase_hton->savepoint_release = innobase_release_savepoint;
+ innobase_hton->prepare_ordered= NULL;
+ innobase_hton->commit_ordered= innobase_commit_ordered;
+ innobase_hton->commit = innobase_commit;
+ innobase_hton->rollback = innobase_rollback;
+ innobase_hton->prepare = innobase_xa_prepare;
+ innobase_hton->recover = innobase_xa_recover;
+ innobase_hton->commit_by_xid = innobase_commit_by_xid;
+ innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+ innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
+ innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
+ innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
+ innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
+ innobase_hton->create = innobase_create_handler;
+ innobase_hton->drop_database = innobase_drop_database;
+ innobase_hton->panic = innobase_end;
+
+ innobase_hton->start_consistent_snapshot =
+ innobase_start_trx_and_assign_read_view;
+
+ innobase_hton->flush_logs = innobase_flush_logs;
+ innobase_hton->show_status = innobase_show_status;
+ innobase_hton->flags =
+ HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS;
+
+ innobase_hton->release_temporary_latches =
+ innobase_release_temporary_latches;
+#ifdef WITH_WSREP
+ innobase_hton->wsrep_abort_transaction=wsrep_abort_transaction;
+ innobase_hton->wsrep_set_checkpoint=innobase_wsrep_set_checkpoint;
+ innobase_hton->wsrep_get_checkpoint=innobase_wsrep_get_checkpoint;
+ innobase_hton->wsrep_fake_trx_id=wsrep_fake_trx_id;
+#endif /* WITH_WSREP */
+ innobase_hton->kill_query = innobase_kill_query;
+
+ if (srv_file_per_table)
+ innobase_hton->tablefile_extensions = ha_innobase_exts;
+
+ ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
+
+#ifndef DBUG_OFF
+ static const char test_filename[] = "-@";
+ char test_tablename[sizeof test_filename
+ + sizeof(srv_mysql50_table_name_prefix) - 1];
+ if ((sizeof(test_tablename)) - 1
+ != filename_to_tablename(test_filename,
+ test_tablename,
+ sizeof(test_tablename), true)
+ || strncmp(test_tablename,
+ srv_mysql50_table_name_prefix,
+ sizeof(srv_mysql50_table_name_prefix) - 1)
+ || strcmp(test_tablename
+ + sizeof(srv_mysql50_table_name_prefix) - 1,
+ test_filename)) {
+
+ sql_print_error("tablename encoding has been changed");
+
+ goto error;
+ }
+#endif /* DBUG_OFF */
+
+ /* Check that values don't overflow on 32-bit systems. */
+ if (sizeof(ulint) == 4) {
+ if (innobase_buffer_pool_size > UINT_MAX32) {
+ sql_print_error(
+ "innobase_buffer_pool_size can't be over 4GB"
+ " on 32-bit systems");
+
+ goto error;
+ }
+ }
+
+ os_innodb_umask = (ulint) my_umask;
+
+ /* First calculate the default path for innodb_data_home_dir etc.,
+ in case the user has not given any value.
+
+ Note that when using the embedded server, the datadirectory is not
+ necessarily the current directory of this program. */
+
+ if (mysqld_embedded) {
+ default_path = mysql_real_data_home;
+ fil_path_to_mysql_datadir = mysql_real_data_home;
+ } else {
+ /* It's better to use current lib, to keep paths short */
+ current_dir[0] = FN_CURLIB;
+ current_dir[1] = FN_LIBCHAR;
+ current_dir[2] = 0;
+ default_path = current_dir;
+ }
+
+ ut_a(default_path);
+
+ /* Set InnoDB initialization parameters according to the values
+ read from MySQL .cnf file */
+
+ /*--------------- Data files -------------------------*/
+
+ /* The default dir for data files is the datadir of MySQL */
+
+ srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
+ default_path);
+
+ /* Set default InnoDB data file size to 12 MB and let it be
+ auto-extending. Thus users can use InnoDB in >= 4.0 without having
+ to specify any startup options. */
+
+ if (!innobase_data_file_path) {
+ innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
+ }
+
+ /* Since InnoDB edits the argument in the next call, we make another
+ copy of it: */
+
+ internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
+ MYF(MY_FAE));
+
+ ret = (bool) srv_parse_data_file_paths_and_sizes(
+ internal_innobase_data_file_path);
+ if (ret == FALSE) {
+ sql_print_error(
+ "InnoDB: syntax error in innodb_data_file_path"
+ " or size specified is less than 1 megabyte");
+mem_free_and_error:
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path);
+ goto error;
+ }
+
+ /* -------------- All log files ---------------------------*/
+
+ /* The default dir for log files is the datadir of MySQL */
+
+ if (!srv_log_group_home_dir) {
+ srv_log_group_home_dir = default_path;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Since innodb_log_arch_dir has no relevance under MySQL,
+ starting from 4.0.6 we always set it the same as
+ innodb_log_group_home_dir: */
+
+ innobase_log_arch_dir = innobase_log_group_home_dir;
+
+ srv_arch_dir = innobase_log_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ srv_normalize_path_for_win(srv_log_group_home_dir);
+
+ if (strchr(srv_log_group_home_dir, ';')) {
+ sql_print_error("syntax error in innodb_log_group_home_dir");
+ goto mem_free_and_error;
+ }
+
+ if (innobase_mirrored_log_groups == 1) {
+ sql_print_warning(
+ "innodb_mirrored_log_groups is an unimplemented "
+ "feature and the variable will be completely "
+ "removed in a future version.");
+ }
+
+ if (innobase_mirrored_log_groups > 1) {
+ sql_print_error(
+ "innodb_mirrored_log_groups is an unimplemented feature and "
+ "the variable will be completely removed in a future version. "
+ "Using values other than 1 is not supported.");
+ goto mem_free_and_error;
+ }
+
+ if (innobase_mirrored_log_groups == 0) {
+ /* To throw a deprecation warning message when the option is
+ passed, the default was changed to '0' (as a workaround). Since
+ the only value accepted for this option is '1', reset it to 1 */
+ innobase_mirrored_log_groups = 1;
+ }
+
+ /* Validate the file format by animal name */
+ if (innobase_file_format_name != NULL) {
+
+ format_id = innobase_file_format_name_lookup(
+ innobase_file_format_name);
+
+ if (format_id > UNIV_FORMAT_MAX) {
+
+ sql_print_error("InnoDB: wrong innodb_file_format.");
+
+ goto mem_free_and_error;
+ }
+ } else {
+ /* Set it to the default file format id. Though this
+ should never happen. */
+ format_id = 0;
+ }
+
+ srv_file_format = format_id;
+
+ /* Given the type of innobase_file_format_name we have little
+ choice but to cast away the constness from the returned name.
+ innobase_file_format_name is used in the MySQL set variable
+ interface and so can't be const. */
+
+ innobase_file_format_name =
+ (char*) trx_sys_file_format_id_to_name(format_id);
+
+ /* Check innobase_file_format_check variable */
+ if (!innobase_file_format_check) {
+
+ /* Set the value to disable checking. */
+ srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
+
+ } else {
+
+ /* Set the value to the lowest supported format. */
+ srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
+ }
+
+ /* Did the user specify a format name that we support?
+ As a side effect it will update the variable
+ srv_max_file_format_at_startup */
+ if (innobase_file_format_validate_and_set(
+ innobase_file_format_max) < 0) {
+
+ sql_print_error("InnoDB: invalid "
+ "innodb_file_format_max value: "
+ "should be any value up to %s or its "
+ "equivalent numeric id",
+ trx_sys_file_format_id_to_name(
+ UNIV_FORMAT_MAX));
+
+ goto mem_free_and_error;
+ }
+
+ if (innobase_change_buffering) {
+ ulint use;
+
+ for (use = 0;
+ use < UT_ARR_SIZE(innobase_change_buffering_values);
+ use++) {
+ if (!innobase_strcasecmp(
+ innobase_change_buffering,
+ innobase_change_buffering_values[use])) {
+ ibuf_use = (ibuf_use_t) use;
+ goto innobase_change_buffering_inited_ok;
+ }
+ }
+
+ sql_print_error("InnoDB: invalid value "
+ "innodb_change_buffering=%s",
+ innobase_change_buffering);
+ goto mem_free_and_error;
+ }
+
+innobase_change_buffering_inited_ok:
+ ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
+ innobase_change_buffering = (char*)
+ innobase_change_buffering_values[ibuf_use];
+
+ /* Check that interdependent parameters have sane values. */
+ if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+ sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.\n"
+ "InnoDB: Setting"
+ " innodb_max_dirty_pages_pct_lwm to %lf\n",
+ srv_max_buf_pool_modified_pct);
+
+ srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+ }
+
+ if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+ if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+ /* Avoid overflow. */
+ srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+ } else {
+ /* The user has not set the value. We should
+ set it based on innodb_io_capacity. */
+ srv_max_io_capacity = static_cast<ulong>(
+ ut_max(2 * srv_io_capacity, 2000));
+ }
+
+ } else if (srv_max_io_capacity < srv_io_capacity) {
+ sql_print_warning("InnoDB: innodb_io_capacity"
+ " cannot be set higher than"
+ " innodb_io_capacity_max.\n"
+ "InnoDB: Setting"
+ " innodb_io_capacity to %lu\n",
+ srv_max_io_capacity);
+
+ srv_io_capacity = srv_max_io_capacity;
+ }
+
+ if (!is_filename_allowed(srv_buf_dump_filename,
+ strlen(srv_buf_dump_filename), FALSE)) {
+ sql_print_error("InnoDB: innodb_buffer_pool_filename"
+ " cannot have colon (:) in the file name.");
+ goto mem_free_and_error;
+ }
+
+ /* --------------------------------------------------*/
+
+ srv_file_flush_method_str = innobase_file_flush_method;
+
+ srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
+
+#ifdef UNIV_LOG_ARCHIVE
+ srv_log_archive_on = (ulint) innobase_log_archive;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* Check that the value of system variable innodb_page_size was
+ set correctly. Its value was put into srv_page_size. If valid,
+ return the associated srv_page_size_shift.*/
+ srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+ if (!srv_page_size_shift) {
+ sql_print_error("InnoDB: Invalid page size=%lu.\n",
+ srv_page_size);
+ goto mem_free_and_error;
+ }
+ if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: innodb-page-size has been changed"
+ " from the default value %d to %lu.\n",
+ UNIV_PAGE_SIZE_DEF, srv_page_size);
+ }
+
+ srv_log_buffer_size = (ulint) innobase_log_buffer_size;
+
+ if (innobase_buffer_pool_instances == 0) {
+ innobase_buffer_pool_instances = 8;
+
+#if defined(__WIN__) && !defined(_WIN64)
+ if (innobase_buffer_pool_size > 1331 * 1024 * 1024) {
+ innobase_buffer_pool_instances
+ = ut_min(MAX_BUFFER_POOLS,
+ (long) (innobase_buffer_pool_size
+ / (128 * 1024 * 1024)));
+ }
+#endif /* defined(__WIN__) && !defined(_WIN64) */
+ }
+ srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
+ srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
+
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+ if (innobase_additional_mem_pool_size
+ != 8*1024*1024L /* the default */ ) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Using "
+ "innodb_additional_mem_pool_size is DEPRECATED. "
+ "This option may be removed in future releases, "
+ "together with the option innodb_use_sys_malloc "
+ "and with the InnoDB's internal memory "
+ "allocator.\n");
+ }
+
+ if (!srv_use_sys_malloc ) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Setting "
+ "innodb_use_sys_malloc to FALSE is DEPRECATED. "
+ "This option may be removed in future releases, "
+ "together with the InnoDB's internal memory "
+ "allocator.\n");
+ }
+
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
+
+ if (!innobase_use_checksums) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Setting "
+ "innodb_checksums to OFF is DEPRECATED. "
+ "This option may be removed in future releases. "
+ "You should set innodb_checksum_algorithm=NONE "
+ "instead.\n");
+ srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
+ }
+
+#ifdef HAVE_LARGE_PAGES
+ if ((os_use_large_pages = (ibool) my_use_large_pages)) {
+ os_large_page_size = (ulint) opt_large_page_size;
+ }
+#endif
+
+ row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
+
+ srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+ if (innobase_locks_unsafe_for_binlog) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Using "
+ "innodb_locks_unsafe_for_binlog is DEPRECATED. "
+ "This option may be removed in future releases. "
+ "Please use READ COMMITTED transaction isolation "
+ "level instead, see " REFMAN "set-transaction.html.\n");
+ }
+
+ if (innobase_open_files < 10) {
+ innobase_open_files = 300;
+ if (srv_file_per_table && tc_size > 300) {
+ innobase_open_files = tc_size;
+ }
+ }
+
+ if (innobase_open_files > (long) tc_size) {
+ fprintf(stderr,
+ "innodb_open_files should not be greater"
+ " than the open_files_limit.\n");
+ innobase_open_files = tc_size;
+ }
+
+ srv_max_n_open_files = (ulint) innobase_open_files;
+ srv_innodb_status = (ibool) innobase_create_status_file;
+
+ srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+ /* Round up fts_sort_pll_degree to nearest power of 2 number */
+ for (num_pll_degree = 1;
+ num_pll_degree < fts_sort_pll_degree;
+ num_pll_degree <<= 1) {
+
+ /* No op */
+ }
+
+ fts_sort_pll_degree = num_pll_degree;
+
+ /* Store the default charset-collation number of this MySQL
+ installation */
+
+ data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+
+ ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
+ my_charset_latin1.number);
+ ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
+
+ /* Store the latin1_swedish_ci character ordering table to InnoDB. For
+ non-latin1_swedish_ci charsets we use the MySQL comparison functions,
+ and consequently we do not need to know the ordering internally in
+ InnoDB. */
+
+ srv_latin1_ordering = my_charset_latin1.sort_order;
+
+ innobase_commit_concurrency_init_default();
+
+#ifdef HAVE_POSIX_FALLOCATE
+ srv_use_posix_fallocate = (ibool) innobase_use_fallocate;
+#endif
+ srv_use_atomic_writes = (ibool) innobase_use_atomic_writes;
+
+ if (innobase_use_atomic_writes) {
+ fprintf(stderr, "InnoDB: using atomic writes.\n");
+
+ /* Force doublewrite buffer off, atomic writes replace it. */
+ if (srv_use_doublewrite_buf) {
+ fprintf(stderr, "InnoDB: Switching off doublewrite buffer "
+ "because of atomic writes.\n");
+ innobase_use_doublewrite = srv_use_doublewrite_buf = FALSE;
+ }
+
+ /* Force O_DIRECT on Unixes (on Windows writes are always unbuffered)*/
+#ifndef _WIN32
+ if(!innobase_file_flush_method ||
+ !strstr(innobase_file_flush_method, "O_DIRECT")) {
+ innobase_file_flush_method =
+ srv_file_flush_method_str = (char*)"O_DIRECT";
+ fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
+ }
+#endif
+#ifdef HAVE_POSIX_FALLOCATE
+ /* Due to a bug in directFS, using atomics needs
+ * posix_fallocate to extend the file
+ * pwrite() past end of the file won't work
+ */
+ srv_use_posix_fallocate = TRUE;
+#endif
+ }
+
+#ifdef HAVE_PSI_INTERFACE
+ /* Register keys with MySQL performance schema */
+ int count;
+
+ count = array_elements(all_pthread_mutexes);
+ mysql_mutex_register("innodb", all_pthread_mutexes, count);
+
+# ifdef UNIV_PFS_MUTEX
+ count = array_elements(all_innodb_mutexes);
+ mysql_mutex_register("innodb", all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ count = array_elements(all_innodb_rwlocks);
+ mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+ count = array_elements(all_innodb_threads);
+ mysql_thread_register("innodb", all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+ count = array_elements(all_innodb_files);
+ mysql_file_register("innodb", all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+
+ count = array_elements(all_innodb_conds);
+ mysql_cond_register("innodb", all_innodb_conds, count);
+#endif /* HAVE_PSI_INTERFACE */
+
+ /* Since we in this module access directly the fields of a trx
+ struct, and due to different headers and flags it might happen that
+ ib_mutex_t has a different size in this module and in InnoDB
+ modules, we check at run time that the size is the same in
+ these compilation modules. */
+
+ err = innobase_start_or_create_for_mysql();
+
+ if (err != DB_SUCCESS) {
+ goto mem_free_and_error;
}
- trx = thd_to_trx(thd);
+ /* Adjust the innodb_undo_logs config object */
+ innobase_undo_logs_init_default_max();
- if (trx != NULL) {
- trx_search_latch_release_if_reserved(trx);
+ innobase_old_blocks_pct = static_cast<uint>(
+ buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE));
+
+ ibuf_max_size_update(innobase_change_buffer_max_size);
+
+ innobase_open_tables = hash_create(200);
+ mysql_mutex_init(innobase_share_mutex_key,
+ &innobase_share_mutex,
+ MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(commit_cond_mutex_key,
+ &commit_cond_m, MY_MUTEX_INIT_FAST);
+ mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+ mysql_mutex_init(pending_checkpoint_mutex_key,
+ &pending_checkpoint_mutex,
+ MY_MUTEX_INIT_FAST);
+ innodb_inited= 1;
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ if (innobase_hton != p) {
+ innobase_hton = reinterpret_cast<handlerton*>(p);
+ *innobase_hton = *innodb_hton_ptr;
}
+#endif /* MYSQL_DYNAMIC_PLUGIN */
- return(0);
+ /* Get the current high water mark format. */
+ innobase_file_format_max = (char*) trx_sys_file_format_max_get();
+
+ /* Currently, monitor counter information are not persistent. */
+ memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
+
+ memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+ /* Do this as late as possible so that the server has fully started up,
+ since we might get some initial stats if the user chose to turn
+ on some counters at startup */
+ if (innobase_enable_monitor_counter) {
+ innodb_enable_monitor_at_startup(
+ innobase_enable_monitor_counter);
+ }
+
+ /* Turn on monitor counters that are default on */
+ srv_mon_default_on();
+
+ DBUG_RETURN(FALSE);
+error:
+ DBUG_RETURN(TRUE);
}
-#ifdef WITH_WSREP
-static int
-wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd,
- my_bool signal);
-static void
-wsrep_fake_trx_id(handlerton* hton, THD *thd);
-static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid);
-static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid);
-#endif
-/********************************************************************//**
-Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
-time calls srv_active_wake_master_thread. This function should be used
-when a single database operation may introduce a small need for
-server utility activity, like checkpointing. */
-static inline
+/** Shut down the InnoDB storage engine; does nothing unless innodb_inited is set.
+@return 0 always */
+static
+int
+innobase_end(handlerton*, ha_panic_function)
+{
+ DBUG_ENTER("innobase_end");
+
+ if (innodb_inited) {
+
+ THD *thd= current_thd;
+ if (thd) { // may be UNINSTALL PLUGIN statement
+ trx_t* trx = thd_to_trx(thd);
+ if (trx) {
+ trx_free_for_mysql(trx); /* free the trx attached to the calling THD */
+ }
+ }
+
+ srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+
+ innodb_inited = 0; /* a later call will skip this whole branch */
+ hash_table_free(innobase_open_tables);
+ innobase_open_tables = NULL;
+ innodb_shutdown();
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path); /* my_strdup() copy made in innobase_init */
+ mysql_mutex_destroy(&innobase_share_mutex); /* this and the three objects below are initialized in innobase_init */
+ mysql_mutex_destroy(&commit_cond_m);
+ mysql_cond_destroy(&commit_cond);
+ mysql_mutex_destroy(&pending_checkpoint_mutex);
+ }
+
+ DBUG_RETURN(0);
+}
+
+/****************************************************************//**
+Flushes the InnoDB log buffer to disk unless srv_read_only_mode is set. The
+original note says a checkpoint is made too; only the log flush is visible here.
+@return TRUE if error; as written, result is never changed from 0 */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton) /*!< in: InnoDB handlerton; only compared with innodb_hton_ptr in a debug assertion */
+{
+ bool result = 0;
+
+ DBUG_ENTER("innobase_flush_logs");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!srv_read_only_mode) {
+ log_buffer_flush_to_disk();
+ }
+
+ DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
void
-innobase_active_small(void)
-/*=======================*/
+innobase_commit_low(
+/*================*/
+ trx_t* trx) /*!< in: transaction handle */
{
- innobase_active_counter++;
+#ifdef WITH_WSREP
+ THD* thd = (THD*)trx->mysql_thd;
+ const char* tmp = 0;
+ if (wsrep_on((void*)thd)) {
+#ifdef WSREP_PROC_INFO
+ char info[64];
+ info[sizeof(info) - 1] = '\0';
+ snprintf(info, sizeof(info) - 1,
+ "innobase_commit_low():trx_commit_for_mysql(%lld)",
+ (long long) wsrep_thd_trx_seqno(thd));
+ tmp = thd_proc_info(thd, info);
- if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
- srv_active_wake_master_thread();
+#else
+ tmp = thd_proc_info(thd, "innobase_commit_low()");
+#endif /* WSREP_PROC_INFO */
+ }
+#endif /* WITH_WSREP */
+ if (trx_is_started(trx)) {
+
+ trx_commit_for_mysql(trx);
}
+#ifdef WITH_WSREP
+ if (wsrep_on((void*)thd)) { thd_proc_info(thd, tmp); }
+#endif /* WITH_WSREP */
}
-/********************************************************************//**
-Converts an InnoDB error code to a MySQL error code and also tells to MySQL
-about a possible transaction rollback inside InnoDB caused by a lock wait
-timeout or a deadlock.
-@return MySQL error code */
-extern "C" UNIV_INTERN
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
int
-convert_error_code_to_mysql(
-/*========================*/
- int error, /*!< in: InnoDB error code */
- ulint flags, /*!< in: InnoDB table flags, or 0 */
- THD* thd) /*!< in: user thread handle or NULL */
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd) /*!< in: MySQL thread handle of the user for
+ whom the transaction should be started */
{
- switch (error) {
- case DB_SUCCESS:
- return(0);
+ trx_t* trx;
- case DB_INTERRUPTED:
- return(HA_ERR_ABORTED_BY_USER);
+ DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
- case DB_FOREIGN_EXCEED_MAX_CASCADE:
- push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
- HA_ERR_ROW_IS_REFERENCED,
- "InnoDB: Cannot delete/update "
- "rows with cascading foreign key "
- "constraints that exceed max "
- "depth of %d. Please "
- "drop extra constraints and try "
- "again", DICT_FK_MAX_RECURSIVE_LOAD);
+ /* Create a new trx struct for thd, if it does not yet have one */
- /* fall through */
+ trx = check_trx_exists(thd);
- case DB_ERROR:
- default:
- return(-1); /* unspecified error */
+ /* This is just to play safe: release a possible FIFO ticket and
+ search latch. Since we can potentially reserve the trx_sys->mutex,
+ we have to release the search system latch first to obey the latching
+ order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started_xa(trx);
+
+ /* Assign a read view if the transaction does not have it yet.
+ Do this only if transaction is using REPEATABLE READ isolation
+ level. */
+ trx->isolation_level = innobase_map_isolation_level(
+ thd_get_trx_isolation(thd));
+
+ if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
+ trx_assign_read_view(trx);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: WITH CONSISTENT SNAPSHOT "
+ "was ignored because this phrase "
+ "can only be used with "
+ "REPEATABLE READ isolation level.");
+ }
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+
+ innobase_register_trx(hton, current_thd, trx);
+
+ DBUG_RETURN(0);
+}
+
+static
+void
+innobase_commit_ordered_2(
+/*============*/
+ trx_t* trx, /*!< in/out: Innodb transaction; its binlog position and flush_log_later fields are written */
+ THD* thd) /*!< in: MySQL thread handle */
+{
+ DBUG_ENTER("innobase_commit_ordered_2");
+
+ /* We need current binlog position for mysqlbackup to work.
+ Note, the position is current because commit_ordered is guaranteed
+ to be called in same sequence as writing to binlog. */
+
+retry:
+ if (innobase_commit_concurrency > 0) { /* throttle only when a limit is configured */
+ mysql_mutex_lock(&commit_cond_m);
+ commit_threads++; /* tentatively count this thread as committing */
+
+ if (commit_threads > innobase_commit_concurrency) {
+ commit_threads--; /* over the limit: undo the count, wait on commit_cond, then try again */
+ mysql_cond_wait(&commit_cond,
+ &commit_cond_m);
+ mysql_mutex_unlock(&commit_cond_m);
+ goto retry;
+ }
+ else {
+ mysql_mutex_unlock(&commit_cond_m);
+ }
+ }
+
+ unsigned long long pos;
+ thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos); /* record the binlog file name and offset in the trx */
+ trx->mysql_log_offset= static_cast<ib_int64_t>(pos);
+ /* Don't do write + flush right now. For group commit
+ to work we want to do the flush in the innobase_commit()
+ method, which runs without holding any locks. */
+ trx->flush_log_later = TRUE;
+ innobase_commit_low(trx);
+ trx->flush_log_later = FALSE;
+
+ if (innobase_commit_concurrency > 0) {
+ mysql_mutex_lock(&commit_cond_m);
+ commit_threads--; /* done committing: drop the count and signal a waiter on commit_cond */
+ mysql_cond_signal(&commit_cond);
+ mysql_mutex_unlock(&commit_cond_m);
+ }
+
+ DBUG_VOID_RETURN;
+}
- case DB_DUPLICATE_KEY:
- /* Be cautious with returning this error, since
- mysql could re-enter the storage layer to get
- duplicated key info, the operation requires a
- valid table handle and/or transaction information,
- which might not always be available in the error
- handling stage. */
- return(HA_ERR_FOUND_DUPP_KEY);
+/*****************************************************************//**
+Perform the first, fast part of InnoDB commit.
- case DB_FOREIGN_DUPLICATE_KEY:
- return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+Doing it in this call ensures that we get the same commit order here
+as in binlog and any other participating transactional storage engines.
- case DB_MISSING_HISTORY:
- return(HA_ERR_TABLE_DEF_CHANGED);
+Note that we want to do as little as really needed here, as we run
+under a global mutex. The expensive fsync() is done later, in
+innobase_commit(), without a lock so group commit can take place.
- case DB_RECORD_NOT_FOUND:
- return(HA_ERR_NO_ACTIVE_RECORD);
+Note also that this method can be called from a different thread than
+the one handling the rest of the transaction. */
+static
+void
+innobase_commit_ordered(
+/*============*/
+ handlerton *hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the user for whom
+ the transaction should be committed */
+ bool all) /*!< in: TRUE - commit transaction
+ FALSE - the current SQL statement ended */
+{
+ trx_t* trx;
+ DBUG_ENTER("innobase_commit_ordered");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
- case DB_DEADLOCK:
- /* Since we rolled back the whole transaction, we must
- tell it also to MySQL so that MySQL knows to empty the
- cached binlog for this transaction */
+ trx = check_trx_exists(thd);
- if (thd) {
- thd_mark_transaction_to_rollback(thd, TRUE);
- }
+ /* Since we will reserve the kernel mutex, we must not be holding the
+ search system latch, or we will disobey the latching order. But we
+ already released it in innobase_xa_prepare() (if not before), so just
+ have an assert here.*/
+ ut_ad(!trx->has_search_latch);
- return(HA_ERR_LOCK_DEADLOCK);
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+ /* We cannot throw error here; instead we will catch this error
+ again in innobase_commit() and report it from there. */
+ DBUG_VOID_RETURN;
+ }
- case DB_LOCK_WAIT_TIMEOUT:
- /* Starting from 5.0.13, we let MySQL just roll back the
- latest SQL statement in a lock wait timeout. Previously, we
- rolled back the whole transaction. */
+ /* commit_ordered is only called when committing the whole transaction
+ (or an SQL statement when autocommit is on). */
+ DBUG_ASSERT(all ||
+ (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
- if (thd) {
- thd_mark_transaction_to_rollback(
- thd, (bool)row_rollback_on_timeout);
- }
+ innobase_commit_ordered_2(trx, thd);
- return(HA_ERR_LOCK_WAIT_TIMEOUT);
+ trx_set_active_commit_ordered(trx);
- case DB_NO_REFERENCED_ROW:
- return(HA_ERR_NO_REFERENCED_ROW);
+ DBUG_VOID_RETURN;
+}
- case DB_ROW_IS_REFERENCED:
- return(HA_ERR_ROW_IS_REFERENCED);
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ bool commit_trx) /*!< in: true - commit transaction
+ false - the current SQL statement
+ ended */
+{
+ trx_t* trx;
- case DB_CANNOT_ADD_CONSTRAINT:
- case DB_CHILD_NO_INDEX:
- case DB_PARENT_NO_INDEX:
- return(HA_ERR_CANNOT_ADD_FOREIGN);
+ DBUG_ENTER("innobase_commit");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("ending transaction"));
- case DB_CANNOT_DROP_CONSTRAINT:
+ trx = check_trx_exists(thd);
- return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
- misleading, a new MySQL error
- code should be introduced */
+ /* Since we will reserve the trx_sys->mutex, we have to release
+ the search system latch first to obey the latching order. */
- case DB_CORRUPTION:
- return(HA_ERR_CRASHED);
+ if (trx->has_search_latch && !trx_is_active_commit_ordered(trx)) {
+ trx_search_latch_release_if_reserved(trx);
+ }
- case DB_OUT_OF_FILE_SPACE:
- return(HA_ERR_RECORD_FILE_FULL);
+ /* Transaction is deregistered only in a commit or a rollback. If
+ it is deregistered we know there cannot be resources to be freed
+ and we could return immediately. For the time being, we play safe
+ and do the cleanup though there should be nothing to clean up. */
- case DB_TABLE_IN_FK_CHECK:
- return(HA_ERR_TABLE_IN_FK_CHECK);
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
- case DB_TABLE_IS_BEING_USED:
- return(HA_ERR_WRONG_COMMAND);
+ sql_print_error("Transaction not registered for MySQL 2PC, "
+ "but transaction is active");
+ }
- case DB_TABLE_NOT_FOUND:
- return(HA_ERR_NO_SUCH_TABLE);
+ if (commit_trx
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
- case DB_TOO_BIG_RECORD: {
- /* If prefix is true then a 768-byte prefix is stored
- locally for BLOB fields. Refer to dict_table_get_format() */
- bool prefix = ((flags & DICT_TF_FORMAT_MASK)
- >> DICT_TF_FORMAT_SHIFT) < UNIV_FORMAT_B;
- my_printf_error(ER_TOO_BIG_ROWSIZE,
- "Row size too large (> %lu). Changing some columns "
- "to TEXT or BLOB %smay help. In current row "
- "format, BLOB prefix of %d bytes is stored inline.",
- MYF(0),
- page_get_free_space_of_empty(flags &
- DICT_TF_COMPACT) / 2,
- prefix ? "or using ROW_FORMAT=DYNAMIC "
- "or ROW_FORMAT=COMPRESSED ": "",
- prefix ? DICT_MAX_FIXED_COL_LEN : 0);
- return(HA_ERR_TO_BIG_ROW);
- }
+ /* Run the fast part of commit if we did not already. */
+ if (!trx_is_active_commit_ordered(trx)) {
+ innobase_commit_ordered_2(trx, thd);
+ }
- case DB_TOO_BIG_INDEX_COL:
- my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
- DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
- return(HA_ERR_INDEX_COL_TOO_LONG);
+ /* We were instructed to commit the whole transaction, or
+ this is an SQL statement end and autocommit is on */
- case DB_NO_SAVEPOINT:
- return(HA_ERR_NO_SAVEPOINT);
+ /* At this point commit order is fixed and transaction is
+ visible to others. So we can wakeup other commits waiting for
+ this one, to allow them to group commit with us. */
+ thd_wakeup_subsequent_commits(thd, 0);
- case DB_LOCK_TABLE_FULL:
- /* Since we rolled back the whole transaction, we must
- tell it also to MySQL so that MySQL knows to empty the
- cached binlog for this transaction */
+ /* We did the first part already in innobase_commit_ordered().
+ Now finish by doing a write + flush of logs. */
+ trx_commit_complete_for_mysql(trx);
+ trx_deregister_from_2pc(trx);
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction commit */
- if (thd) {
- thd_mark_transaction_to_rollback(thd, TRUE);
- }
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
- return(HA_ERR_LOCK_TABLE_FULL);
+ lock_unlock_table_autoinc(trx);
- case DB_PRIMARY_KEY_IS_NULL:
- return(ER_PRIMARY_CANT_HAVE_NULL);
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
- case DB_TOO_MANY_CONCURRENT_TRXS:
- /* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only
- available in 5.1.38 and later, but the plugin should still
- work with previous versions of MySQL. */
-#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS
- return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
-#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
- return(HA_ERR_RECORD_FILE_FULL);
-#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
- case DB_UNSUPPORTED:
- return(HA_ERR_UNSUPPORTED);
- case DB_INDEX_CORRUPT:
- return(HA_ERR_INDEX_CORRUPT);
- case DB_UNDO_RECORD_TOO_BIG:
- return(HA_ERR_UNDO_REC_TOO_BIG);
- case DB_OUT_OF_MEMORY:
- return(HA_ERR_OUT_OF_MEM);
- case DB_IDENTIFIER_TOO_LONG:
- return(HA_ERR_INTERNAL_ERROR);
+ trx_mark_sql_stat_end(trx);
}
-}
-/*************************************************************//**
-Prints info of a THD object (== user session thread) to the given file. */
-extern "C" UNIV_INTERN
-void
-innobase_mysql_print_thd(
-/*=====================*/
- FILE* f, /*!< in: output stream */
- void* thd, /*!< in: pointer to a MySQL THD object */
- uint max_query_len) /*!< in: max query length to print, or 0 to
- use the default max length */
-{
- char buffer[1024];
+ trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
- fputs(thd_security_context((THD*) thd, buffer, sizeof buffer,
- max_query_len), f);
- putc('\n', f);
+ /* This is a statement level variable. */
+ trx->fts_next_doc_id = 0;
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ DBUG_RETURN(0);
}
-/******************************************************************//**
-Get the variable length bounds of the given character set. */
-extern "C" UNIV_INTERN
-void
-innobase_get_cset_width(
-/*====================*/
- ulint cset, /*!< in: MySQL charset-collation code */
- ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
- ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement.
+@return 0 or error number */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back */
+ bool rollback_trx) /*!< in: TRUE - rollback entire
+ transaction FALSE - rollback the current
+ statement only */
{
- CHARSET_INFO* cs;
- ut_ad(cset < 256);
- ut_ad(mbminlen);
- ut_ad(mbmaxlen);
-
- cs = all_charsets[cset];
- if (cs) {
- *mbminlen = cs->mbminlen;
- *mbmaxlen = cs->mbmaxlen;
- ut_ad(*mbminlen < DATA_MBMAX);
- ut_ad(*mbmaxlen < DATA_MBMAX);
- } else {
- THD* thd = current_thd;
+ dberr_t error;
+ trx_t* trx;
- if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+ DBUG_ENTER("innobase_rollback");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("aborting transaction"));
- /* Fix bug#46256: allow tables to be dropped if the
- collation is not found, but issue a warning. */
- if ((global_system_variables.log_warnings)
- && (cset != 0)){
+ trx = check_trx_exists(thd);
- sql_print_warning(
- "Unknown collation #%lu.", cset);
- }
- } else {
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
- ut_a(cset == 0);
- }
+ trx_search_latch_release_if_reserved(trx);
- *mbminlen = *mbmaxlen = 0;
- }
-}
+ innobase_srv_conc_force_exit_innodb(trx);
-/******************************************************************//**
-Converts an identifier to a table name. */
-extern "C" UNIV_INTERN
-void
-innobase_convert_from_table_id(
-/*===========================*/
- struct charset_info_st* cs, /*!< in: the 'from' character set */
- char* to, /*!< out: converted identifier */
- const char* from, /*!< in: identifier to convert */
- ulint len) /*!< in: length of 'to', in bytes */
-{
- uint errors;
+ trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
- strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
-}
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
-/**********************************************************************
-Check if the length of the identifier exceeds the maximum allowed.
-return true when length of identifier is too long. */
-extern "C"
-my_bool
-innobase_check_identifier_length(
-/*=============================*/
- const char* id) /* in: FK identifier to check excluding the
- database portion. */
-{
- int well_formed_error = 0;
- CHARSET_INFO *cs = system_charset_info;
- DBUG_ENTER("innobase_check_identifier_length");
+ lock_unlock_table_autoinc(trx);
- uint res = cs->cset->well_formed_len(cs, id, id + strlen(id),
- NAME_CHAR_LEN,
- &well_formed_error);
+ /* This is a statement level variable. */
+ trx->fts_next_doc_id = 0;
- if (well_formed_error || res == NAME_CHAR_LEN) {
- my_error(ER_TOO_LONG_IDENT, MYF(0), id);
- DBUG_RETURN(true);
+ if (rollback_trx
+ || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ error = trx_rollback_for_mysql(trx);
+ trx_deregister_from_2pc(trx);
+ } else {
+ error = trx_rollback_last_sql_stat_for_mysql(trx);
}
- DBUG_RETURN(false);
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
-/******************************************************************//**
-Converts an identifier to UTF-8. */
-extern "C" UNIV_INTERN
-void
-innobase_convert_from_id(
-/*=====================*/
- struct charset_info_st* cs, /*!< in: the 'from' character set */
- char* to, /*!< out: converted identifier */
- const char* from, /*!< in: identifier to convert */
- ulint len) /*!< in: length of 'to', in bytes */
+/*****************************************************************//**
+Rolls back a transaction
+@return 0 or error number */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
{
- uint errors;
+ dberr_t error = DB_SUCCESS;
- strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
-}
+ DBUG_ENTER("innobase_rollback_trx");
+ DBUG_PRINT("trans", ("aborting transaction"));
-/**********************************************************************
-Converts an identifier from my_charset_filename to UTF-8 charset.
-@return result string length, as returned by strconvert() */
-extern "C"
-uint
-innobase_convert_to_system_charset(
-/*===============================*/
- char* to, /* out: converted identifier */
- const char* from, /* in: identifier to convert */
- ulint len, /* in: length of 'to', in bytes */
- uint* errors) /* out: error return */
-{
- CHARSET_INFO* cs1 = &my_charset_filename;
- CHARSET_INFO* cs2 = system_charset_info;
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
- return(strconvert(cs1, from, cs2, to, len, errors));
-}
+ trx_search_latch_release_if_reserved(trx);
-/******************************************************************//**
-Compares NUL-terminated UTF-8 strings case insensitively.
-@return 0 if a=b, <0 if a<b, >1 if a>b */
-extern "C" UNIV_INTERN
-int
-innobase_strcasecmp(
-/*================*/
- const char* a, /*!< in: first string to compare */
- const char* b) /*!< in: second string to compare */
-{
- return(my_strcasecmp(system_charset_info, a, b));
-}
+ innobase_srv_conc_force_exit_innodb(trx);
-/******************************************************************//**
-Strip dir name from a full path name and return only the file name
-@return file name or "null" if no file name */
-extern "C" UNIV_INTERN
-const char*
-innobase_basename(
-/*==============*/
- const char* path_name) /*!< in: full path name */
-{
- const char* name = base_name(path_name);
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
- return((name) ? name : "null");
+ lock_unlock_table_autoinc(trx);
+
+ if (!trx->read_only) {
+ error = trx_rollback_for_mysql(trx);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
-/******************************************************************//**
-Makes all characters in a NUL-terminated UTF-8 string lower case. */
-extern "C" UNIV_INTERN
+
+struct pending_checkpoint {
+ struct pending_checkpoint *next;
+ handlerton *hton;
+ void *cookie;
+ ib_uint64_t lsn;
+};
+static struct pending_checkpoint *pending_checkpoint_list;
+static struct pending_checkpoint *pending_checkpoint_list_end;
+
+/*****************************************************************//**
+Handle a commit checkpoint request from server layer.
+We put the request in a queue, so that we can notify upper layer about
+checkpoint complete when we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately.*/
+static
void
-innobase_casedn_str(
-/*================*/
- char* a) /*!< in/out: string to put in lower case */
+innobase_checkpoint_request(
+ handlerton *hton,
+ void *cookie)
{
- my_casedn_str(system_charset_info, a);
-}
+ ib_uint64_t lsn;
+ ib_uint64_t flush_lsn;
+ struct pending_checkpoint * entry;
+
+ /* Do the allocation outside of lock to reduce contention. The normal
+ case is that not everything is flushed, so we will need to enqueue. */
+ entry = static_cast<struct pending_checkpoint *>
+ (my_malloc(sizeof(*entry), MYF(MY_WME)));
+ if (!entry) {
+ sql_print_error("Failed to allocate %u bytes."
+ " Commit checkpoint will be skipped.",
+ static_cast<unsigned>(sizeof(*entry)));
+ return;
+ }
-/**********************************************************************//**
-Determines the connection character set.
-@return connection character set */
-extern "C" UNIV_INTERN
-struct charset_info_st*
-innobase_get_charset(
-/*=================*/
- void* mysql_thd) /*!< in: MySQL thread handle */
-{
- return(thd_charset((THD*) mysql_thd));
+ entry->next = NULL;
+ entry->hton = hton;
+ entry->cookie = cookie;
+
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ lsn = log_get_lsn();
+ flush_lsn = log_get_flush_lsn();
+ if (lsn > flush_lsn) {
+ /* Put the request in queue.
+ When the log gets flushed past the lsn, we will remove the
+ entry from the queue and notify the upper layer. */
+ entry->lsn = lsn;
+ if (pending_checkpoint_list_end) {
+ pending_checkpoint_list_end->next = entry;
+ /* There is no need to order the entries in the list
+ by lsn. The upper layer can accept notifications in
+ any order, and short delays in notifications do not
+ significantly impact performance. */
+ } else {
+ pending_checkpoint_list = entry;
+ }
+ pending_checkpoint_list_end = entry;
+ entry = NULL;
+ }
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+ if (entry) {
+ /* We are already flushed. Notify the checkpoint immediately. */
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+ my_free(entry);
+ }
}
-/**********************************************************************//**
-Determines the current SQL statement.
-@return SQL statement string */
-extern "C" UNIV_INTERN
-const char*
-innobase_get_stmt(
-/*==============*/
- void* mysql_thd, /*!< in: MySQL thread handle */
- size_t* length) /*!< out: length of the SQL statement */
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+ ib_uint64_t write_lsn, /*!< in: LSN written to log file */
+ ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */
{
- LEX_STRING* stmt;
+ struct pending_checkpoint * pending;
+ struct pending_checkpoint * entry;
+ struct pending_checkpoint * last_ready;
+
+ /* It is safe to do a quick check for NULL first without lock.
+ Even if we should race, we will at most skip one checkpoint and
+ take the next one, which is harmless. */
+ if (!pending_checkpoint_list)
+ return;
- stmt = thd_query_string((THD*) mysql_thd);
- *length = stmt->length;
- return(stmt->str);
-}
+ mysql_mutex_lock(&pending_checkpoint_mutex);
+ pending = pending_checkpoint_list;
+ if (!pending)
+ {
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
+ return;
+ }
-/**********************************************************************//**
-Get the current setting of the lower_case_table_names global parameter from
-mysqld.cc. We do a dirty read because for one there is no synchronization
-object and secondly there is little harm in doing so even if we get a torn
-read.
-@return value of lower_case_table_names */
-extern "C" UNIV_INTERN
-ulint
-innobase_get_lower_case_table_names(void)
-/*=====================================*/
-{
- return(lower_case_table_names);
-}
+ last_ready = NULL;
+ for (entry = pending; entry != NULL; entry = entry -> next)
+ {
+ /* Notify checkpoints up until the first entry that has not
+ been fully flushed to the redo log. Since we do not maintain
+ the list ordered, in principle there could be more entries
+ later that were also flushed. But there is no harm in
+ delaying notifications for those a bit. And in practice, the
+ list is unlikely to have more than one element anyway, as we
+ flush the redo log at least once every second. */
+ if (entry->lsn > flush_lsn)
+ break;
+ last_ready = entry;
+ }
-/*********************************************************************//**
-Creates a temporary file.
-@return temporary file descriptor, or < 0 on error */
-extern "C" UNIV_INTERN
-int
-innobase_mysql_tmpfile(void)
-/*========================*/
-{
-#ifdef WITH_INNODB_DISALLOW_WRITES
- os_event_wait(srv_allow_writes_event);
-#endif /* WITH_INNODB_DISALLOW_WRITES */
- int fd2 = -1;
- File fd;
+ if (last_ready)
+ {
+ /* We found some pending checkpoints that are now flushed to
+ disk. So remove them from the list. */
+ pending_checkpoint_list = entry;
+ if (!entry)
+ pending_checkpoint_list_end = NULL;
+ }
- DBUG_EXECUTE_IF(
- "innobase_tmpfile_creation_failure",
- return(-1);
- );
+ mysql_mutex_unlock(&pending_checkpoint_mutex);
- fd = mysql_tmpfile("ib");
+ if (!last_ready)
+ return;
- if (fd >= 0) {
- /* Copy the file descriptor, so that the additional resources
- allocated by create_temp_file() can be freed by invoking
- my_close().
+ /* Now that we have released the lock, notify upper layer about all
+ commit checkpoints that have now completed. */
+ for (;;) {
+ entry = pending;
+ pending = pending->next;
- Because the file descriptor returned by this function
- will be passed to fdopen(), it will be closed by invoking
- fclose(), which in turn will invoke close() instead of
- my_close(). */
+ commit_checkpoint_notify_ha(entry->hton, entry->cookie);
-#ifdef _WIN32
- /* Note that on Windows, the integer returned by mysql_tmpfile
- has no relation to C runtime file descriptor. Here, we need
- to call my_get_osfhandle to get the HANDLE and then convert it
- to C runtime filedescriptor. */
- {
- HANDLE hFile = my_get_osfhandle(fd);
- HANDLE hDup;
- BOOL bOK =
- DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
- &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
- if(bOK) {
- fd2 = _open_osfhandle((intptr_t)hDup,0);
- }
- else {
- my_osmaperr(GetLastError());
- fd2 = -1;
- }
- }
-#else
-#ifdef F_DUPFD_CLOEXEC
- fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
-#else
- fd2 = dup(fd);
-#endif
-#endif
- if (fd2 < 0) {
- DBUG_PRINT("error",("Got error %d on dup",fd2));
- my_errno=errno;
- my_error(EE_OUT_OF_FILERESOURCES,
- MYF(ME_BELL+ME_WAITTANG),
- "ib*", my_errno);
- }
- my_close(fd, MYF(MY_WME));
+ my_free(entry);
+ if (entry == last_ready)
+ break;
}
- return(fd2);
-}
-
-/*********************************************************************//**
-Wrapper around MySQL's copy_and_convert function.
-@return number of bytes copied to 'to' */
-extern "C" UNIV_INTERN
-ulint
-innobase_convert_string(
-/*====================*/
- void* to, /*!< out: converted string */
- ulint to_length, /*!< in: number of bytes reserved
- for the converted string */
- CHARSET_INFO* to_cs, /*!< in: character set to convert to */
- const void* from, /*!< in: string to convert */
- ulint from_length, /*!< in: number of bytes to convert */
- CHARSET_INFO* from_cs, /*!< in: character set to convert from */
- uint* errors) /*!< out: number of errors encountered
- during the conversion */
-{
- return(copy_and_convert((char*)to, (uint32) to_length, to_cs,
- (const char*)from, (uint32) from_length, from_cs,
- errors));
}
-/*******************************************************************//**
-Formats the raw data in "data" (in InnoDB on-disk format) that is of
-type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
-the result to "buf". The result is converted to "system_charset_info".
-Not more than "buf_size" bytes are written to "buf".
-The result is always NUL-terminated (provided buf_size > 0) and the
-number of bytes that were written to "buf" is returned (including the
-terminating NUL).
-@return number of bytes that were written */
-extern "C" UNIV_INTERN
-ulint
-innobase_raw_format(
-/*================*/
- const char* data, /*!< in: raw data */
- ulint data_len, /*!< in: raw data length
- in bytes */
- ulint charset_coll, /*!< in: charset collation */
- char* buf, /*!< out: output buffer */
- ulint buf_size) /*!< in: output buffer size
- in bytes */
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back to savepoint */
+ void* savepoint) /*!< in: savepoint data */
{
- /* XXX we use a hard limit instead of allocating
- but_size bytes from the heap */
- CHARSET_INFO* data_cs;
- char buf_tmp[8192];
- ulint buf_tmp_used;
- uint num_errors;
+ ib_int64_t mysql_binlog_cache_pos;
+ dberr_t error;
+ trx_t* trx;
+ char name[64];
- data_cs = all_charsets[charset_coll];
+ DBUG_ENTER("innobase_rollback_to_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
- buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
- system_charset_info,
- data, data_len, data_cs,
- &num_errors);
+ trx = check_trx_exists(thd);
- return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
+ longlong2str((ulint) savepoint, name, 36);
+
+ error = trx_rollback_to_savepoint_for_mysql(
+ trx, name, &mysql_binlog_cache_pos);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_rollback(trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
-/*********************************************************************//**
-Compute the next autoinc value.
+/*****************************************************************//**
+Check whether InnoDB state allows MDL locks to be safely released after
+rollback to savepoint.
+When binlog is on, MDL locks acquired after savepoint unit are not
+released if there are any locks held in InnoDB.
+@return true if it is safe, false if it is not safe. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd) /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back to savepoint */
+{
+ trx_t* trx;
-For MySQL replication the autoincrement values can be partitioned among
-the nodes. The offset is the start or origin of the autoincrement value
-for a particular node. For n nodes the increment will be n and the offset
-will be in the interval [1, n]. The formula tries to allocate the next
-value for a particular node.
+ DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
-Note: This function is also called with increment set to the number of
-values we want to reserve for multi-value inserts e.g.,
+ trx = check_trx_exists(thd);
+ ut_ad(trx);
- INSERT INTO T VALUES(), (), ();
+ /* If transaction has not acquired any locks then it is safe
+ to release MDL after rollback to savepoint */
+ if (!(UT_LIST_GET_LEN(trx->lock.trx_locks))) {
+ DBUG_RETURN(true);
+ }
-innobase_next_autoinc() will be called with increment set to 3 where
-autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
-the multi-value INSERT above.
-@return the next value */
+ DBUG_RETURN(false);
+}
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
static
-ulonglong
-innobase_next_autoinc(
-/*==================*/
- ulonglong current, /*!< in: Current value */
- ulonglong need, /*!< in: count of values needed */
- ulonglong step, /*!< in: AUTOINC increment step */
- ulonglong offset, /*!< in: AUTOINC offset */
- ulonglong max_value) /*!< in: max value for type */
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in: handlerton for Innodb */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction's
+ savepoint should be released */
+ void* savepoint) /*!< in: savepoint data */
{
- ulonglong next_value;
- ulonglong block = need * step;
+ dberr_t error;
+ trx_t* trx;
+ char name[64];
- /* Should never be 0. */
- ut_a(need > 0);
- ut_a(block > 0);
- ut_a(max_value > 0);
+ DBUG_ENTER("innobase_release_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
- /*
- Allow auto_increment to go over max_value up to max ulonglong.
- This allows us to detect that all values are exhausted.
- If we don't do this, we will return max_value several times
- and get duplicate key errors instead of auto increment value
- out of range.
- */
- max_value= (~(ulonglong) 0);
+ trx = check_trx_exists(thd);
- /* According to MySQL documentation, if the offset is greater than
- the step then the offset is ignored. */
- if (offset > block) {
- offset = 0;
+ if (trx->state == TRX_STATE_NOT_STARTED) {
+ trx_start_if_not_started(trx);
}
- /* Check for overflow. Current can be > max_value if the value is
- in reality a negative value.The visual studio compilers converts
- large double values automatically into unsigned long long datatype
- maximum value */
- if (block >= max_value
- || offset > max_value
- || current >= max_value
- || max_value - offset <= offset) {
+ /* TODO: use provided savepoint data area to store savepoint data */
- next_value = max_value;
- } else {
- ut_a(max_value > current);
+ longlong2str((ulint) savepoint, name, 36);
- ulonglong free = max_value - current;
+ error = trx_release_savepoint_for_mysql(trx, name);
- if (free < offset || free - offset <= block) {
- next_value = max_value;
- } else {
- next_value = 0;
- }
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_release(trx, name);
}
- if (next_value == 0) {
- ulonglong next;
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
- if (current >= offset) {
- next = (current - offset) / step;
- } else {
- next = 0;
- block -= step;
- }
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in: handle to the Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread */
+ void* savepoint) /*!< in: savepoint data */
+{
+ dberr_t error;
+ trx_t* trx;
- ut_a(max_value > next);
- next_value = next * step;
- /* Check for multiplication overflow. */
- ut_a(next_value >= next);
- ut_a(max_value > next_value);
+ DBUG_ENTER("innobase_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
- /* Check for overflow */
- if (max_value - next_value >= block) {
+ /* In the autocommit mode there is no sense to set a savepoint
+ (unless we are in sub-statement), so SQL layer ensures that
+ this method is never called in such a situation. */
- next_value += block;
+ trx = check_trx_exists(thd);
- if (max_value - next_value >= offset) {
- next_value += offset;
- } else {
- next_value = max_value;
- }
- } else {
- next_value = max_value;
- }
- }
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
- ut_a(next_value != 0);
- ut_a(next_value <= max_value);
+ trx_search_latch_release_if_reserved(trx);
- return(next_value);
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* Cannot happen outside of transaction */
+ DBUG_ASSERT(trx_is_registered_for_2pc(trx));
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+ char name[64];
+ longlong2str((ulint) savepoint,name,36);
+
+ error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_take(trx, trx->fts_trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
}
-/*********************************************************************//**
-Initializes some fields in an InnoDB transaction object. */
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
static
-void
-innobase_trx_init(
-/*==============*/
- THD* thd, /*!< in: user thread handle */
- trx_t* trx) /*!< in/out: InnoDB transaction handle */
+int
+innobase_close_connection(
+/*======================*/
+ handlerton* hton, /*!< in: innobase handlerton */
+ THD* thd) /*!< in: handle to the MySQL thread of the user
+ whose resources should be free'd */
{
- DBUG_ENTER("innobase_trx_init");
- DBUG_ASSERT(thd == trx->mysql_thd);
+ trx_t* trx;
- trx->check_foreigns = !thd_test_options(
- thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+ DBUG_ENTER("innobase_close_connection");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ trx = thd_to_trx(thd);
- trx->check_unique_secondary = !thd_test_options(
- thd, OPTION_RELAXED_UNIQUE_CHECKS);
+ ut_a(trx);
- DBUG_VOID_RETURN;
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+ sql_print_error("Transaction not registered for MySQL 2PC, "
+ "but transaction is active");
+ }
+
+ if (trx_is_started(trx) && global_system_variables.log_warnings) {
+
+ sql_print_warning(
+ "MySQL is closing a connection that has an active "
+ "InnoDB transaction. " TRX_ID_FMT " row modifications "
+ "will roll back.",
+ trx->undo_no);
+ }
+
+ innobase_rollback_trx(trx);
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(0);
}
-/*********************************************************************//**
-Allocates an InnoDB transaction for a MySQL handler object.
-@return InnoDB transaction handle */
-extern "C" UNIV_INTERN
-trx_t*
-innobase_trx_allocate(
-/*==================*/
- THD* thd) /*!< in: user thread handle */
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+UNIV_INTERN
+int
+innobase_close_thd(
+/*===============*/
+ THD* thd) /*!< in: handle to the MySQL thread of the user
+ whose resources should be free'd */
+{
+ trx_t* trx = thd_to_trx(thd);
+
+ if (!trx) {
+ return(0);
+ }
+
+ return(innobase_close_connection(innodb_hton_ptr, thd));
+}
+
+UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock);
+
+/*****************************************************************//**
+Cancel any pending lock request associated with the current THD. */
+static
+void
+innobase_kill_query(
+/*======================*/
+ handlerton* hton, /*!< in: innobase handlerton */
+ THD* thd, /*!< in: MySQL thread being killed */
+ enum thd_kill_levels level) /*!< in: kill level */
{
trx_t* trx;
@@@ -7149,995 -3773,862 +7153,993 @@@ build_template_field
}
}
- if (col_type != mtype) {
- /* Column Type mismatches */
- DBUG_RETURN(FALSE);
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Looking for field %lu name %s from table %s",
+ i,
+ (tb_col_name ? tb_col_name : "NULL"),
+ clust_index->table->name);
+
+
+ for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+ dict_field_t* ifield = &(clust_index->fields[j]);
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "InnoDB Table %s field %lu name %s",
+ clust_index->table->name,
+ j,
+ (ifield ? ifield->name : "NULL"));
}
- innodb_idx_fld++;
+ for(ulint j=0; j < table->s->stored_fields; j++) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "MySQL table %s field %lu name %s",
+ table->s->table_name.str,
+ j,
+ table->field[j]->field_name);
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Clustered record field for column %lu"
+ " not found table n_user_defined %d"
+ " index n_user_defined %d"
+ " InnoDB table %s field name %s"
+ " MySQL table %s field name %s n_fields %d"
+ " query %s",
+ i,
+ clust_index->n_user_defined_cols,
+ clust_index->table->n_cols - DATA_N_SYS_COLS,
+ clust_index->table->name,
+ (field ? field->name : "NULL"),
+ table->s->table_name.str,
+ (tb_col_name ? tb_col_name : "NULL"),
+ table->s->stored_fields,
+ innobase_get_stmt(current_thd, &size));
+
+ ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+ }
+
+ if (dict_index_is_clust(index)) {
+ templ->rec_field_no = templ->clust_rec_field_no;
+ } else {
+ templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
}
- DBUG_RETURN(TRUE);
-}
+ if (field->real_maybe_null()) {
+ templ->mysql_null_byte_offset =
+ field->null_offset();
-/*******************************************************************//**
-This function builds a translation table in INNOBASE_SHARE
-structure for fast index location with mysql array number from its
-table->key_info structure. This also provides the necessary translation
-between the key order in mysql key_info and Innodb ib_table->indexes if
-they are not fully matched with each other.
-Note we do not have any mutex protecting the translation table
-building based on the assumption that there is no concurrent
-index creation/drop and DMLs that requires index lookup. All table
-handle will be closed before the index creation/drop.
-@return TRUE if index translation table built successfully */
-static
-ibool
-innobase_build_index_translation(
-/*=============================*/
- const TABLE* table, /*!< in: table in MySQL data
- dictionary */
- dict_table_t* ib_table, /*!< in: table in Innodb data
- dictionary */
- INNOBASE_SHARE* share) /*!< in/out: share structure
- where index translation table
- will be constructed in. */
-{
- ulint mysql_num_index;
- ulint ib_num_index;
- dict_index_t** index_mapping;
- ibool ret = TRUE;
+ templ->mysql_null_bit_mask = (ulint) field->null_bit;
+ } else {
+ templ->mysql_null_bit_mask = 0;
+ }
- DBUG_ENTER("innobase_build_index_translation");
+ templ->mysql_col_offset = (ulint) get_field_offset(table, field);
- mutex_enter(&dict_sys->mutex);
+ templ->mysql_col_len = (ulint) field->pack_length();
+ templ->type = col->mtype;
+ templ->mysql_type = (ulint) field->type();
- mysql_num_index = table->s->keys;
- ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ templ->mysql_length_bytes = (ulint)
+ (((Field_varstring*) field)->length_bytes);
+ }
- index_mapping = share->idx_trans_tbl.index_mapping;
+ templ->charset = dtype_get_charset_coll(col->prtype);
+ templ->mbminlen = dict_col_get_mbminlen(col);
+ templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+ templ->is_unsigned = col->prtype & DATA_UNSIGNED;
- /* If there exists inconsistency between MySQL and InnoDB dictionary
- (metadata) information, the number of index defined in MySQL
- could exceed that in InnoDB, do not build index translation
- table in such case */
- if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
- ret = FALSE;
- goto func_exit;
+ if (!dict_index_is_clust(index)
+ && templ->rec_field_no == ULINT_UNDEFINED) {
+ prebuilt->need_to_access_clustered = TRUE;
}
- /* If index entry count is non-zero, nothing has
- changed since last update, directly return TRUE */
- if (share->idx_trans_tbl.index_count) {
- /* Index entry count should still match mysql_num_index */
- ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
- goto func_exit;
+ if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
+ + templ->mysql_col_len) {
+ prebuilt->mysql_prefix_len = templ->mysql_col_offset
+ + templ->mysql_col_len;
}
- /* The number of index increased, rebuild the mapping table */
- if (mysql_num_index > share->idx_trans_tbl.array_size) {
- index_mapping = (dict_index_t**) my_realloc(index_mapping,
- mysql_num_index *
- sizeof(*index_mapping),
- MYF(MY_ALLOW_ZERO_PTR));
+ if (templ->type == DATA_BLOB) {
+ prebuilt->templ_contains_blob = TRUE;
+ }
+
+ return(templ);
+}
+
+/**************************************************************//**
+Builds a 'template' to the prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+UNIV_INTERN
+void
+ha_innobase::build_template(
+/*========================*/
+ bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW,
+ false=ROW_MYSQL_REC_FIELDS */
+{
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ ulint n_stored_fields;
+ ibool fetch_all_in_key = FALSE;
+ ibool fetch_primary_key_cols = FALSE;
+ ulint i, sql_idx;
+
+ if (prebuilt->select_lock_type == LOCK_X) {
+ /* We always retrieve the whole clustered index record if we
+ use exclusive row level locks, for example, if the read is
+ done in an UPDATE statement. */
+
+ whole_row = true;
+ } else if (!whole_row) {
+ if (prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_ALL_COLS) {
+
+ /* We know we must at least fetch all columns in the
+ key, or all columns in the table */
+
+ if (prebuilt->read_just_key) {
+ /* MySQL has instructed us that it is enough
+ to fetch the columns in the key; looks like
+ MySQL can set this flag also when there is
+ only a prefix of the column in the key: in
+ that case we retrieve the whole column from
+ the clustered index */
+
+ fetch_all_in_key = TRUE;
+ } else {
+ whole_row = true;
+ }
+ } else if (prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_PRIMARY_KEY) {
+ /* We must at least fetch all primary key cols. Note
+ that if the clustered index was internally generated
+ by InnoDB on the row id (no primary key was
+ defined), then row_search_for_mysql() will always
+ retrieve the row id to a special buffer in the
+ prebuilt struct. */
- if (!index_mapping) {
- /* Report an error if index_mapping continues to be
- NULL and mysql_num_index is a non-zero value */
- sql_print_error("InnoDB: fail to allocate memory for "
- "index translation table. Number of "
- "Index:%lu, array size:%lu",
- mysql_num_index,
- share->idx_trans_tbl.array_size);
- ret = FALSE;
- goto func_exit;
+ fetch_primary_key_cols = TRUE;
}
-
- share->idx_trans_tbl.array_size = mysql_num_index;
}
- /* For each index in the mysql key_info array, fetch its
- corresponding InnoDB index pointer into index_mapping
- array. */
- for (ulint count = 0; count < mysql_num_index; count++) {
+ clust_index = dict_table_get_first_index(prebuilt->table);
- /* Fetch index pointers into index_mapping according to mysql
- index sequence */
- index_mapping[count] = dict_table_get_index_on_name(
- ib_table, table->key_info[count].name);
+ index = whole_row ? clust_index : prebuilt->index;
- if (!index_mapping[count]) {
- sql_print_error("Cannot find index %s in InnoDB "
- "index dictionary.",
- table->key_info[count].name);
- ret = FALSE;
- goto func_exit;
- }
+ prebuilt->need_to_access_clustered = (index == clust_index);
- /* Double check fetched index has the same
- column info as those in mysql key_info. */
- if (!innobase_match_index_columns(&table->key_info[count],
- index_mapping[count])) {
- sql_print_error("Found index %s whose column info "
- "does not match that of MySQL.",
- table->key_info[count].name);
- ret = FALSE;
- goto func_exit;
- }
- }
+ /* Either prebuilt->index should be a secondary index, or it
+ should be the clustered index. */
+ ut_ad(dict_index_is_clust(index) == (index == clust_index));
- /* Successfully built the translation table */
- share->idx_trans_tbl.index_count = mysql_num_index;
+ /* Below we check column by column if we need to access
+ the clustered index. */
-func_exit:
- if (!ret) {
- /* Build translation table failed. */
- my_free(index_mapping);
+ n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
- share->idx_trans_tbl.array_size = 0;
- share->idx_trans_tbl.index_count = 0;
- index_mapping = NULL;
+ if (!prebuilt->mysql_template) {
+ prebuilt->mysql_template = (mysql_row_templ_t*)
+ mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
}
- share->idx_trans_tbl.index_mapping = index_mapping;
+ prebuilt->template_type = whole_row
+ ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
+ prebuilt->null_bitmap_len = table->s->null_bytes;
- mutex_exit(&dict_sys->mutex);
+ /* Prepare to build prebuilt->mysql_template[]. */
+ prebuilt->templ_contains_blob = FALSE;
+ prebuilt->mysql_prefix_len = 0;
+ prebuilt->n_template = 0;
+ prebuilt->idx_cond_n_cols = 0;
- DBUG_RETURN(ret);
-}
+ /* Note that in InnoDB, i is the column number in the table.
+ MySQL calls columns 'fields'. */
-/*******************************************************************//**
-This function uses index translation table to quickly locate the
-requested index structure.
-Note we do not have mutex protection for the index translatoin table
-access, it is based on the assumption that there is no concurrent
-translation table rebuild (fter create/drop index) and DMLs that
-require index lookup.
-@return dict_index_t structure for requested index. NULL if
-fail to locate the index structure. */
-static
-dict_index_t*
-innobase_index_lookup(
-/*==================*/
- INNOBASE_SHARE* share, /*!< in: share structure for index
- translation table. */
- uint keynr) /*!< in: index number for the requested
- index */
-{
- if (!share->idx_trans_tbl.index_mapping
- || keynr >= share->idx_trans_tbl.index_count) {
- return(NULL);
- }
+ if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
+ /* Push down an index condition or an end_range check. */
+ for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
- return(share->idx_trans_tbl.index_mapping[keynr]);
-}
+ while (!table->field[sql_idx]->stored_in_db) {
+ sql_idx++;
+ }
-/************************************************************************
-Set the autoinc column max value. This should only be called once from
-ha_innobase::open(). Therefore there's no need for a covering lock. */
-UNIV_INTERN
-void
-ha_innobase::innobase_initialize_autoinc()
-/*======================================*/
-{
- ulonglong auto_inc;
- const Field* field = table->found_next_number_field;
+ const ibool index_contains
+ = dict_index_contains_col_or_prefix(index, i);
+
+ /* Test if an end_range or an index condition
+ refers to the field. Note that "index" and
+ "index_contains" may refer to the clustered index.
+ Index condition pushdown is relative to prebuilt->index
+ (the index that is being looked up first). */
+
+ /* When join_read_always_key() invokes this
+ code via handler::ha_index_init() and
+ ha_innobase::index_init(), end_range is not
+ yet initialized. Because of that, we must
+ always check for index_contains, instead of
+ the subset
+ field->part_of_key.is_set(active_index)
+ which would be acceptable if end_range==NULL. */
+ if (build_template_needs_field_in_icp(
+ index, prebuilt, index_contains, i)) {
+ /* Needed in ICP */
+ const Field* field;
+ mysql_row_templ_t* templ;
+
+ if (whole_row) {
+ field = table->field[sql_idx];
+ } else {
+ field = build_template_needs_field(
+ index_contains,
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, sql_idx);
+ if (!field) {
+ continue;
+ }
+ }
- if (field != NULL) {
- auto_inc = innobase_get_int_col_max_value(field);
- } else {
- /* We have no idea what's been passed in to us as the
- autoinc column. We set it to the 0, effectively disabling
- updates to the table. */
- auto_inc = 0;
+ templ = build_template_field(
+ prebuilt, clust_index, index,
+ table, field, i);
+ prebuilt->idx_cond_n_cols++;
+ ut_ad(prebuilt->idx_cond_n_cols
+ == prebuilt->n_template);
+
+ if (index == prebuilt->index) {
+ templ->icp_rec_field_no
+ = templ->rec_field_no;
+ } else {
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_pos(
+ prebuilt->index, i);
+ }
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: Unable to determine the AUTOINC "
- "column name\n");
- }
+ if (dict_index_is_clust(prebuilt->index)) {
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+ /* If the primary key includes
+ a column prefix, use it in
+ index condition pushdown,
+ because the condition is
+ evaluated before fetching any
+ off-page (externally stored)
+ columns. */
+ if (templ->icp_rec_field_no
+ < prebuilt->index->n_uniq) {
+ /* This is a key column;
+ all set. */
+ continue;
+ }
+ } else if (templ->icp_rec_field_no
+ != ULINT_UNDEFINED) {
+ continue;
+ }
- if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
- /* If the recovery level is set so high that writes
- are disabled we force the AUTOINC counter to 0
- value effectively disabling writes to the table.
- Secondly, we avoid reading the table in case the read
- results in failure due to a corrupted table/index.
+ /* This is a column prefix index.
+ The column prefix can be used in
+ an end_range comparison. */
+
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_or_prefix_pos(
+ prebuilt->index, i, TRUE);
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+
+ /* Index condition pushdown can be used on
+ all columns of a secondary index, and on
+ the PRIMARY KEY columns. On the clustered
+ index, it must never be used on other than
+ PRIMARY KEY columns, because those columns
+ may be stored off-page, and we will not
+ fetch externally stored columns before
+ checking the index condition. */
+ /* TODO: test the above with an assertion
+ like this. Note that index conditions are
+ currently pushed down as part of the
+ "optimizer phase" while end_range is done
+ as part of the execution phase. Therefore,
+ we were unable to use an accurate condition
+ for end_range in the "if" condition above,
+ and the following assertion would fail.
+ ut_ad(!dict_index_is_clust(prebuilt->index)
+ || templ->rec_field_no
+ < prebuilt->index->n_uniq);
+ */
+ }
+ }
- We will not return an error to the client, so that the
- tables can be dumped with minimal hassle. If an error
- were returned in this case, the first attempt to read
- the table would fail and subsequent SELECTs would succeed. */
- auto_inc = 0;
- } else if (field == NULL) {
- /* This is a far more serious error, best to avoid
- opening the table and return failure. */
- my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ ut_ad(prebuilt->idx_cond_n_cols > 0);
+ ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
+
+ /* Include the fields that are not needed in index condition
+ pushdown. */
+ for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+
+ while (!table->field[sql_idx]->stored_in_db) {
+ sql_idx++;
+ }
+
+ const ibool index_contains
+ = dict_index_contains_col_or_prefix(index, i);
+
+ if (!build_template_needs_field_in_icp(
+ index, prebuilt, index_contains, i)) {
+ /* Not needed in ICP */
+ const Field* field;
+
+ if (whole_row) {
+ field = table->field[sql_idx];
+ } else {
+ field = build_template_needs_field(
+ index_contains,
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, sql_idx);
+ if (!field) {
+ continue;
+ }
+ }
+
+ build_template_field(prebuilt,
+ clust_index, index,
+ table, field, i);
+ }
+ }
+
+ prebuilt->idx_cond = this;
} else {
- dict_index_t* index;
- const char* col_name;
- ulonglong read_auto_inc;
- ulint err;
+ /* No index condition pushdown */
+ prebuilt->idx_cond = NULL;
- update_thd(ha_thd());
+ for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+ const Field* field;
- ut_a(prebuilt->trx == thd_to_trx(user_thd));
+ while (!table->field[sql_idx]->stored_in_db) {
+ sql_idx++;
+ }
- col_name = field->field_name;
- index = innobase_get_index(table->s->next_number_index);
+ if (whole_row) {
+ field = table->field[sql_idx];
+ } else {
+ field = build_template_needs_field(
+ dict_index_contains_col_or_prefix(
+ index, i),
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, sql_idx);
+ if (!field) {
+ continue;
+ }
+ }
- /* Execute SELECT MAX(col_name) FROM TABLE; */
- err = row_search_max_autoinc(index, col_name, &read_auto_inc);
+ build_template_field(prebuilt, clust_index, index,
+ table, field, i);
+ }
+ }
- switch (err) {
- case DB_SUCCESS: {
- ulonglong col_max_value;
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+ /* Change rec_field_no's to correspond to the clustered index
+ record */
+ for (i = 0; i < prebuilt->n_template; i++) {
- col_max_value = innobase_get_int_col_max_value(field);
+ mysql_row_templ_t* templ
+ = &prebuilt->mysql_template[i];
- /* At the this stage we do not know the increment
- nor the offset, so use a default increment of 1. */
+ templ->rec_field_no = templ->clust_rec_field_no;
+ }
+ }
+}
- auto_inc = innobase_next_autoinc(
- read_auto_inc, 1, 1, 0, col_max_value);
+/********************************************************************//**
+This special handling is really to overcome the limitations of MySQL's
+binlogging. We need to eliminate the non-determinism that will arise in
+INSERT ... SELECT type of statements, since MySQL binlog only stores the
+min value of the autoinc interval. Once that is fixed we can get rid of
+the special lock handling.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_lock_autoinc(void)
+/*====================================*/
+{
+ DBUG_ENTER("ha_innobase::innobase_lock_autoinc");
+ dberr_t error = DB_SUCCESS;
- break;
+ ut_ad(!srv_read_only_mode);
+
+ switch (innobase_autoinc_lock_mode) {
+ case AUTOINC_NO_LOCKING:
+ /* Acquire only the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ break;
+
+ case AUTOINC_NEW_STYLE_LOCKING:
+ /* For simple (single/multi) row INSERTs/REPLACEs and RBR
+ events, we fallback to the old style only if another
+ transaction has already acquired the AUTOINC lock on
+ behalf of a LOAD FILE or INSERT ... SELECT etc. type of
+ statement. */
+ if (thd_sql_command(user_thd) == SQLCOM_INSERT
+ || thd_sql_command(user_thd) == SQLCOM_REPLACE
+ || thd_sql_command(user_thd) == SQLCOM_END // RBR event
+ ) {
+ dict_table_t* ib_table = prebuilt->table;
+
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(ib_table);
+
+ /* We need to check that another transaction isn't
+ already holding the AUTOINC lock on the table. */
+ if (ib_table->n_waiting_or_granted_auto_inc_locks) {
+ /* Release the mutex to avoid deadlocks and
+ fall back to old style locking. */
+ dict_table_autoinc_unlock(ib_table);
+ } else {
+ /* Do not fall back to old style locking. */
+ break;
+ }
}
- case DB_RECORD_NOT_FOUND:
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: MySQL and InnoDB data "
- "dictionaries are out of sync.\n"
- "InnoDB: Unable to find the AUTOINC column "
- "%s in the InnoDB table %s.\n"
- "InnoDB: We set the next AUTOINC column "
- "value to 0,\n"
- "InnoDB: in effect disabling the AUTOINC "
- "next value generation.\n"
- "InnoDB: You can either set the next "
- "AUTOINC value explicitly using ALTER TABLE\n"
- "InnoDB: or fix the data dictionary by "
- "recreating the table.\n",
- col_name, index->table->name);
+ /* Use old style locking. */
+ /* fall through */
+ case AUTOINC_OLD_STYLE_LOCKING:
+ DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used",
+ ut_ad(0););
+ error = row_lock_table_autoinc_for_mysql(prebuilt);
- /* This will disable the AUTOINC generation. */
- auto_inc = 0;
+ if (error == DB_SUCCESS) {
- /* We want the open to succeed, so that the user can
- take corrective action. ie. reads should succeed but
- updates should fail. */
- err = DB_SUCCESS;
- break;
- default:
- /* row_search_max_autoinc() should only return
- one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
- ut_error;
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
}
+ break;
+
+ default:
+ ut_error;
}
- dict_table_autoinc_initialize(prebuilt->table, auto_inc);
+ DBUG_RETURN(error);
}
-/*****************************************************************//**
-Creates and opens a handle to a table which already exists in an InnoDB
-database.
-@return 1 if error, 0 if success */
-UNIV_INTERN
-int
-ha_innobase::open(
-/*==============*/
- const char* name, /*!< in: table name */
- int mode, /*!< in: not used */
- uint test_if_locked) /*!< in: not used */
+/********************************************************************//**
+Reset the autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+ ulonglong autoinc) /*!< in: value to store */
{
- dict_table_t* ib_table;
- char norm_name[1000];
- THD* thd;
- char* is_part = NULL;
- ibool par_case_name_set = FALSE;
- char par_case_name[MAX_FULL_NAME_LEN + 1];
- dict_err_ignore_t ignore_err = DICT_ERR_IGNORE_NONE;
+ dberr_t error;
- DBUG_ENTER("ha_innobase::open");
+ error = innobase_lock_autoinc();
- UT_NOT_USED(mode);
- UT_NOT_USED(test_if_locked);
+ if (error == DB_SUCCESS) {
- thd = ha_thd();
+ dict_table_autoinc_initialize(prebuilt->table, autoinc);
- /* Under some cases MySQL seems to call this function while
- holding btr_search_latch. This breaks the latching order as
- we acquire dict_sys->mutex below and leads to a deadlock. */
- if (thd != NULL) {
- innobase_release_temporary_latches(ht, thd);
+ dict_table_autoinc_unlock(prebuilt->table);
}
- normalize_table_name(norm_name, name);
-
- user_thd = NULL;
+ return(error);
+}
- if (!(share=get_share(name))) {
+/********************************************************************//**
+Store the autoinc value in the table. The autoinc value is only set if
+it's greater than the existing autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+ ulonglong auto_inc) /*!< in: value to store */
+{
+ dberr_t error;
- DBUG_RETURN(1);
- }
+ error = innobase_lock_autoinc();
- /* Will be allocated if it is needed in ::update_row() */
- upd_buf = NULL;
- upd_buf_size = 0;
+ if (error == DB_SUCCESS) {
- /* We look for pattern #P# to see if the table is partitioned
- MySQL table. */
-#ifdef __WIN__
- is_part = strstr(norm_name, "#p#");
-#else
- is_part = strstr(norm_name, "#P#");
-#endif /* __WIN__ */
+ dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
- /* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table
- can be opened even if some FK indexes are missing. If not, the table
- can't be opened in the same situation */
- if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) {
- ignore_err = DICT_ERR_IGNORE_FK_NOKEY;
+ dict_table_autoinc_unlock(prebuilt->table);
}
- /* Get pointer to a table object in InnoDB dictionary cache */
- ib_table = dict_table_get(norm_name, TRUE, ignore_err);
+ return(error);
+}
- if (NULL == ib_table) {
- if (is_part) {
- /* MySQL partition engine hard codes the file name
- separator as "#P#". The text case is fixed even if
- lower_case_table_names is set to 1 or 2. This is true
- for sub-partition names as well. InnoDB always
- normalises file names to lower case on Windows, this
- can potentially cause problems when copying/moving
- tables between platforms.
+/********************************************************************//**
+Stores a row in an InnoDB database, to the table specified in this
+handle.
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::write_row(
+/*===================*/
+ uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+ int error_result= 0;
+ ibool auto_inc_used= FALSE;
+#ifdef WITH_WSREP
+ ibool auto_inc_inserted= FALSE; /* if NULL was inserted */
+#endif
+ ulint sql_command;
+ trx_t* trx = thd_to_trx(user_thd);
- 1) If boot against an installation from Windows
- platform, then its partition table name could
- be all be in lower case in system tables. So we
- will need to check lower case name when load table.
+ DBUG_ENTER("ha_innobase::write_row");
- 2) If we boot an installation from other case
- sensitive platform in Windows, we might need to
- check the existence of table name without lowering
- case them in the system table. */
- if (innobase_get_lower_case_table_names() == 1) {
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (prebuilt->trx != trx) {
+ sql_print_error("The transaction object for the table handle "
+ "is at %p, but for the current thread it is at "
+ "%p",
+ (const void*) prebuilt->trx, (const void*) trx);
- if (!par_case_name_set) {
-#ifndef __WIN__
- /* Check for the table using lower
- case name, including the partition
- separator "P" */
- memcpy(par_case_name, norm_name,
- strlen(norm_name));
- par_case_name[strlen(norm_name)] = 0;
- innobase_casedn_str(par_case_name);
-#else
- /* On Windows platfrom, check
- whether there exists table name in
- system table whose name is
- not being normalized to lower case */
- normalize_table_name_low(
- par_case_name, name, FALSE);
-#endif
- par_case_name_set = TRUE;
- }
+ fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
+ ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
+ fputs("\n"
+ "InnoDB: Dump of 200 bytes around ha_data: ",
+ stderr);
+ ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
+ putc('\n', stderr);
+ ut_error;
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
- ib_table = dict_table_get(
- par_case_name, TRUE, ignore_err);
- }
- if (ib_table) {
-#ifndef __WIN__
- sql_print_warning("Partition table %s opened "
- "after converting to lower "
- "case. The table may have "
- "been moved from a case "
- "in-sensitive file system. "
- "Please recreate table in "
- "the current file system\n",
- norm_name);
-#else
- sql_print_warning("Partition table %s opened "
- "after skipping the step to "
- "lower case the table name. "
- "The table may have been "
- "moved from a case sensitive "
- "file system. Please "
- "recreate table in the "
- "current file system\n",
- norm_name);
-#endif
- goto table_opened;
- }
- }
+ ha_statistic_increment(&SSV::ha_write_count);
- if (is_part) {
- sql_print_error("Failed to open table %s.\n",
- norm_name);
- }
+ sql_command = thd_sql_command(user_thd);
- sql_print_error("Cannot find or open table %s from\n"
- "the internal data dictionary of InnoDB "
- "though the .frm file for the\n"
- "table exists. Maybe you have deleted and "
- "recreated InnoDB data\n"
- "files but have forgotten to delete the "
- "corresponding .frm files\n"
- "of InnoDB tables, or you have moved .frm "
- "files to another database?\n"
- "or, the table contains indexes that this "
- "version of the engine\n"
- "doesn't support.\n"
- "See " REFMAN "innodb-troubleshooting.html\n"
- "how you can resolve the problem.\n",
- norm_name);
- free_share(share);
- my_errno = ENOENT;
+ if ((sql_command == SQLCOM_ALTER_TABLE
+ || sql_command == SQLCOM_OPTIMIZE
+ || sql_command == SQLCOM_CREATE_INDEX
+#ifdef WITH_WSREP
+ || (wsrep_on(user_thd) && wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(
+ user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+#endif /* WITH_WSREP */
+ || sql_command == SQLCOM_DROP_INDEX)
+ && num_write_row >= 10000) {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) {
+ WSREP_DEBUG("forced trx split for LOAD: %s",
+ wsrep_thd_query(user_thd));
+ }
+#endif /* WITH_WSREP */
+ /* ALTER TABLE is COMMITted at every 10000 copied rows.
+ The IX table lock for the original table has to be re-issued.
+ As this method will be called on a temporary table where the
+ contents of the original table is being copied to, it is
+ a bit tricky to determine the source table. The cursor
+ position in the source table need not be adjusted after the
+ intermediate COMMIT, since writes by other transactions are
+ being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
- DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
- }
+ dict_table_t* src_table;
+ enum lock_mode mode;
-table_opened:
+ num_write_row = 0;
- if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
- sql_print_error("MySQL is trying to open a table handle but "
- "the .ibd file for\ntable %s does not exist.\n"
- "Have you deleted the .ibd file from the "
- "database directory under\nthe MySQL datadir, "
- "or have you used DISCARD TABLESPACE?\n"
- "See " REFMAN "innodb-troubleshooting.html\n"
- "how you can resolve the problem.\n",
- norm_name);
- free_share(share);
- my_errno = ENOENT;
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
- dict_table_decrement_handle_count(ib_table, FALSE);
- DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
- }
+ /* Altering an InnoDB table */
+ /* Get the source table. */
+ src_table = lock_get_src_table(
+ prebuilt->trx, prebuilt->table, &mode);
+ if (!src_table) {
+no_commit:
+ /* Unknown situation: do not commit */
+ /*
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ALTER TABLE is holding lock"
+ " on %lu tables!\n",
+ prebuilt->trx->mysql_n_tables_locked);
+ */
+ ;
+ } else if (src_table == prebuilt->table) {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) &&
+ wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(user_thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ {
+ switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
+ {
+ case WSREP_TRX_OK:
+ break;
+ case WSREP_TRX_SIZE_EXCEEDED:
+ case WSREP_TRX_CERT_FAIL:
+ case WSREP_TRX_ERROR:
+ DBUG_RETURN(1);
+ }
- prebuilt = row_create_prebuilt(ib_table, table->s->reclength);
+ if (binlog_hton->commit(binlog_hton, user_thd, 1))
+ DBUG_RETURN(1);
+ wsrep_post_commit(user_thd, TRUE);
+ }
+#endif /* WITH_WSREP */
+ /* Source table is not in InnoDB format:
+ no need to re-acquire locks on it. */
- prebuilt->default_rec = table->s->default_values;
- ut_ad(prebuilt->default_rec);
+ /* Altering to InnoDB format */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ trx_register_for_2pc(prebuilt->trx);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ } else {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) &&
+ wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(user_thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ {
+ switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
+ {
+ case WSREP_TRX_OK:
+ break;
+ case WSREP_TRX_SIZE_EXCEEDED:
+ case WSREP_TRX_CERT_FAIL:
+ case WSREP_TRX_ERROR:
+ DBUG_RETURN(1);
+ }
- /* Looks like MySQL-3.23 sometimes has primary key number != 0 */
+ if (binlog_hton->commit(binlog_hton, user_thd, 1))
+ DBUG_RETURN(1);
+ wsrep_post_commit(user_thd, TRUE);
+ }
+#endif /* WITH_WSREP */
+ /* Ensure that there are no other table locks than
+ LOCK_IX and LOCK_AUTO_INC on the destination table. */
- primary_key = table->s->primary_key;
- key_used_on_scan = primary_key;
+ if (!lock_is_table_exclusive(prebuilt->table,
+ prebuilt->trx)) {
+ goto no_commit;
+ }
- if (!innobase_build_index_translation(table, ib_table, share)) {
- sql_print_error("Build InnoDB index translation table for"
- " Table %s failed", name);
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ trx_register_for_2pc(prebuilt->trx);
+ /* Re-acquire the table lock on the source table. */
+ row_lock_table_for_mysql(prebuilt, src_table, mode);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ }
}
- /* Allocate a buffer for a 'row reference'. A row reference is
- a string of bytes of length ref_length which uniquely specifies
- a row in our table. Note that MySQL may also compare two row
- references for equality by doing a simple memcmp on the strings
- of length ref_length! */
-
- if (!row_table_got_default_clust_index(ib_table)) {
-
- prebuilt->clust_index_was_generated = FALSE;
+ num_write_row++;
- if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
- sql_print_error("Table %s has a primary key in "
- "InnoDB data dictionary, but not "
- "in MySQL!", name);
+ /* This is the case where the table has an auto-increment column */
+ if (table->next_number_field && record == table->record[0]) {
- /* This mismatch could cause further problems
- if not attended, bring this to the user's attention
- by printing a warning in addition to log a message
- in the errorlog */
- push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
- ER_NO_SUCH_INDEX,
- "InnoDB: Table %s has a "
- "primary key in InnoDB data "
- "dictionary, but not in "
- "MySQL!", name);
+ /* Reset the error code before calling
+ innobase_get_auto_increment(). */
+ prebuilt->autoinc_error = DB_SUCCESS;
- /* If primary_key >= MAX_KEY, its (primary_key)
- value could be out of bound if continue to index
- into key_info[] array. Find InnoDB primary index,
- and assign its key_length to ref_length.
- In addition, since MySQL indexes are sorted starting
- with primary index, unique index etc., initialize
- ref_length to the first index key length in
- case we fail to find InnoDB cluster index.
+#ifdef WITH_WSREP
+ auto_inc_inserted= (table->next_number_field->val_int() == 0);
+#endif
- Please note, this will not resolve the primary
- index mismatch problem, other side effects are
- possible if users continue to use the table.
- However, we allow this table to be opened so
- that user can adopt necessary measures for the
- mismatch while still being accessible to the table
- date. */
- ref_length = table->key_info[0].key_length;
+ if ((error_result = update_auto_increment())) {
+ /* We don't want to mask autoinc overflow errors. */
- /* Find correspoinding cluster index
- key length in MySQL's key_info[] array */
- for (ulint i = 0; i < table->s->keys; i++) {
- dict_index_t* index;
- index = innobase_get_index(i);
- if (dict_index_is_clust(index)) {
- ref_length =
- table->key_info[i].key_length;
- }
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (prebuilt->autoinc_error != DB_SUCCESS) {
+ error = prebuilt->autoinc_error;
+ goto report_error;
}
- } else {
- /* MySQL allocates the buffer for ref.
- key_info->key_length includes space for all key
- columns + one byte for each column that may be
- NULL. ref_length must be as exact as possible to
- save space, because all row reference buffers are
- allocated based on ref_length. */
-
- ref_length = table->key_info[primary_key].key_length;
- }
- } else {
- if (primary_key != MAX_KEY) {
- sql_print_error(
- "Table %s has no primary key in InnoDB data "
- "dictionary, but has one in MySQL! If you "
- "created the table with a MySQL version < "
- "3.23.54 and did not define a primary key, "
- "but defined a unique key with all non-NULL "
- "columns, then MySQL internally treats that "
- "key as the primary key. You can fix this "
- "error by dump + DROP + CREATE + reimport "
- "of the table.", name);
- /* This mismatch could cause further problems
- if not attended, bring this to the user attention
- by printing a warning in addition to log a message
- in the errorlog */
- push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
- ER_NO_SUCH_INDEX,
- "InnoDB: Table %s has no "
- "primary key in InnoDB data "
- "dictionary, but has one in "
- "MySQL!", name);
+ /* MySQL errors are passed straight back. */
+ goto func_exit;
}
- prebuilt->clust_index_was_generated = TRUE;
-
- ref_length = DATA_ROW_ID_LEN;
-
- /* If we automatically created the clustered index, then
- MySQL does not know about it, and MySQL must NOT be aware
- of the index used on scan, to make it avoid checking if we
- update the column of the index. That is why we assert below
- that key_used_on_scan is the undefined value MAX_KEY.
- The column is the row id in the automatical generation case,
- and it will never be updated anyway. */
-
- if (key_used_on_scan != MAX_KEY) {
- sql_print_warning(
- "Table %s key_used_on_scan is %lu even "
- "though there is no primary key inside "
- "InnoDB.", name, (ulong) key_used_on_scan);
- }
+ auto_inc_used = TRUE;
}
- /* Index block size in InnoDB: used by MySQL in query optimization */
- stats.block_size = 16 * 1024;
-
- /* Init table lock structure */
- thr_lock_data_init(&share->lock,&lock,(void*) 0);
+ if (prebuilt->mysql_template == NULL
+ || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
- if (prebuilt->table) {
- /* We update the highest file format in the system table
- space, if this table has higher file format setting. */
+ /* Build the template used in converting quickly between
+ the two database formats */
- trx_sys_file_format_max_upgrade(
- (const char**) &innobase_file_format_max,
- dict_table_get_format(prebuilt->table));
+ build_template(true);
}
- /* Only if the table has an AUTOINC column. */
- if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
- dict_table_autoinc_lock(prebuilt->table);
-
- /* Since a table can already be "open" in InnoDB's internal
- data dictionary, we only init the autoinc counter once, the
- first time the table is loaded. We can safely reuse the
- autoinc value from a previous MySQL open. */
- if (dict_table_autoinc_read(prebuilt->table) == 0) {
-
- innobase_initialize_autoinc();
- }
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
- dict_table_autoinc_unlock(prebuilt->table);
- }
+ error = row_insert_for_mysql((byte*) record, prebuilt);
+ DEBUG_SYNC(user_thd, "ib_after_row_insert");
- info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ /* Handle duplicate key errors */
+ if (auto_inc_used) {
+ ulonglong auto_inc;
+ ulonglong col_max_value;
- DBUG_RETURN(0);
-}
+ /* Note the number of rows processed for this statement, used
+ by get_auto_increment() to determine the number of AUTO-INC
+ values to reserve. This is only useful for a multi-value INSERT
+ and is a statement level counter.*/
+ if (trx->n_autoinc_rows > 0) {
+ --trx->n_autoinc_rows;
+ }
-UNIV_INTERN
-handler*
-ha_innobase::clone(
-/*===============*/
- const char* name, /*!< in: table name */
- MEM_ROOT* mem_root) /*!< in: memory context */
-{
- ha_innobase* new_handler;
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
- DBUG_ENTER("ha_innobase::clone");
+ /* Get the value that MySQL attempted to store in the table.*/
+ auto_inc = table->next_number_field->val_uint();
- new_handler = static_cast<ha_innobase*>(handler::clone(name,
- mem_root));
- if (new_handler) {
- DBUG_ASSERT(new_handler->prebuilt != NULL);
- DBUG_ASSERT(new_handler->user_thd == user_thd);
- DBUG_ASSERT(new_handler->prebuilt->trx == prebuilt->trx);
+ switch (error) {
+ case DB_DUPLICATE_KEY:
- new_handler->prebuilt->select_lock_type
- = prebuilt->select_lock_type;
- }
+ /* A REPLACE command and LOAD DATA INFILE REPLACE
+ handle a duplicate key error themselves, but we
+ must update the autoinc counter if we are performing
+ those statements. */
- DBUG_RETURN(new_handler);
-}
+ switch (sql_command) {
+ case SQLCOM_LOAD:
+ if (trx->duplicates) {
-UNIV_INTERN
-uint
-ha_innobase::max_supported_key_part_length() const
-{
- /* A table format specific index column length check will be performed
- at ha_innobase::add_index() and row_create_index_for_mysql() */
- return(innobase_large_prefix
- ? REC_VERSION_56_MAX_INDEX_COL_LEN
- : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1);
-}
+ goto set_max_autoinc;
+ }
+ break;
-/******************************************************************//**
-Closes a handle to an InnoDB table.
-@return 0 */
-UNIV_INTERN
-int
-ha_innobase::close(void)
-/*====================*/
-{
- THD* thd;
+ case SQLCOM_REPLACE:
+ case SQLCOM_INSERT_SELECT:
+ case SQLCOM_REPLACE_SELECT:
+ goto set_max_autoinc;
- DBUG_ENTER("ha_innobase::close");
+#ifdef WITH_WSREP
+ /* workaround for LP bug #355000, retrying the insert */
+ case SQLCOM_INSERT:
- thd = ha_thd();
- if (thd != NULL) {
- innobase_release_temporary_latches(ht, thd);
- }
+ WSREP_DEBUG("DUPKEY error for autoinc\n"
+ "THD %ld, value %llu, off %llu inc %llu",
+ wsrep_thd_thread_id(current_thd),
+ auto_inc,
+ prebuilt->autoinc_offset,
+ prebuilt->autoinc_increment);
- row_prebuilt_free(prebuilt, FALSE);
+ if (wsrep_on(current_thd) &&
+ auto_inc_inserted &&
+ wsrep_drupal_282555_workaround &&
+ wsrep_thd_retry_counter(current_thd) == 0 &&
+ !thd_test_options(current_thd,
+ OPTION_NOT_AUTOCOMMIT |
+ OPTION_BEGIN)) {
+ WSREP_DEBUG(
+ "retrying insert: %s",
+ (*wsrep_thd_query(current_thd)) ?
+ wsrep_thd_query(current_thd) :
+ (char *)"void");
+ error= DB_SUCCESS;
+ wsrep_thd_set_conflict_state(
+ current_thd, MUST_ABORT);
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+ /* jump straight to func exit over
+ * later wsrep hooks */
+ goto func_exit;
+ }
+ break;
+#endif /* WITH_WSREP */
- if (upd_buf != NULL) {
- ut_ad(upd_buf_size != 0);
- my_free(upd_buf);
- upd_buf = NULL;
- upd_buf_size = 0;
- }
+ default:
+ break;
+ }
- free_share(share);
+ break;
- /* Tell InnoDB server that there might be work for
- utility threads: */
+ case DB_SUCCESS:
+ /* If the actual value inserted is greater than
+ the upper limit of the interval, then we try and
+ update the table upper limit. Note: last_value
+ will be 0 if get_auto_increment() was not called.*/
- srv_active_wake_master_thread();
+ if (auto_inc >= prebuilt->autoinc_last_value) {
+set_max_autoinc:
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (auto_inc <= col_max_value) {
+ ut_a(prebuilt->autoinc_increment > 0);
- DBUG_RETURN(0);
-}
+ ulonglong offset;
+ ulonglong increment;
+ dberr_t err;
-/* The following accessor functions should really be inside MySQL code! */
+ offset = prebuilt->autoinc_offset;
+ increment = prebuilt->autoinc_increment;
-/**************************************************************//**
-Gets field offset for a field in a table.
-@return offset */
-static inline
-uint
-get_field_offset(
-/*=============*/
- TABLE* table, /*!< in: MySQL table object */
- Field* field) /*!< in: MySQL field object */
-{
- return((uint) (field->ptr - table->record[0]));
-}
+ auto_inc = innobase_next_autoinc(
+ auto_inc,
+ 1, increment, offset,
+ col_max_value);
-/**************************************************************//**
-Checks if a field in a record is SQL NULL. Uses the record format
-information in table to track the null bit in record.
-@return 1 if NULL, 0 otherwise */
-static inline
-uint
-field_in_record_is_null(
-/*====================*/
- TABLE* table, /*!< in: MySQL table object */
- Field* field, /*!< in: MySQL field object */
- char* record) /*!< in: a row in MySQL format */
-{
- int null_offset;
+ err = innobase_set_max_autoinc(
+ auto_inc);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
- if (!field->null_ptr) {
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
- return(0);
+report_error:
+ if (error == DB_TABLESPACE_DELETED) {
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
}
- null_offset = (uint) ((char*) field->null_ptr
- - (char*) table->record[0]);
+ error_result = convert_error_code_to_mysql(error,
+ prebuilt->table->flags,
+ user_thd);
- if (record[null_offset] & field->null_bit) {
+#ifdef WITH_WSREP
- if (!error_result &&
- wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
- wsrep_on(user_thd) &&
- !wsrep_consistency_check(user_thd) &&
- !wsrep_thd_skip_append_keys(user_thd))
- {
- if (wsrep_append_keys(user_thd, false, record, NULL))
- {
++ if (!error_result
++ && wsrep_on(user_thd)
++ && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE
++ && !wsrep_consistency_check(user_thd)
++ && !wsrep_thd_skip_append_keys(user_thd)) {
++ if (wsrep_append_keys(user_thd, false, record, NULL)) {
+ DBUG_PRINT("wsrep", ("row key failed"));
+ error_result = HA_ERR_INTERNAL_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif /* WITH_WSREP */
- return(1);
+ if (error_result == HA_FTS_INVALID_DOCID) {
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
}
- return(0);
+func_exit:
+ innobase_active_small();
+
+ DBUG_RETURN(error_result);
}
-/*************************************************************//**
-InnoDB uses this function to compare two data fields for which the data type
-is such that we must use MySQL code to compare them. NOTE that the prototype
-of this function is in rem0cmp.c in InnoDB source code! If you change this
-function, remember to update the prototype there!
-@return 1, 0, -1, if a is greater, equal, less than b, respectively */
-extern "C" UNIV_INTERN
-int
-innobase_mysql_cmp(
-/*===============*/
- int mysql_type, /*!< in: MySQL type */
- uint charset_number, /*!< in: number of the charset */
- const unsigned char* a, /*!< in: data field */
- unsigned int a_length, /*!< in: data field length,
- not UNIV_SQL_NULL */
- const unsigned char* b, /*!< in: data field */
- unsigned int b_length) /*!< in: data field length,
- not UNIV_SQL_NULL */
+/**********************************************************************//**
+Checks which fields have changed in a row and stores information
+of them to an update vector.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+calc_row_difference(
+/*================*/
+ upd_t* uvect, /*!< in/out: update vector */
+ uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row, /*!< in: new row in MySQL format */
+ TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ uchar* upd_buff, /*!< in: buffer to use */
+ ulint buff_len, /*!< in: buffer length */
+ row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */
+ THD* thd) /*!< in: user thread */
{
- CHARSET_INFO* charset;
- enum_field_types mysql_tp;
- int ret;
+ uchar* original_upd_buff = upd_buff;
+ Field* field;
+ enum_field_types field_mysql_type;
+ uint n_fields;
+ ulint o_len;
+ ulint n_len;
+ ulint col_pack_len;
+ const byte* new_mysql_row_col;
+ const byte* o_ptr;
+ const byte* n_ptr;
+ byte* buf;
+ upd_field_t* ufield;
+ ulint col_type;
+ ulint n_changed = 0;
+ dfield_t dfield;
+ dict_index_t* clust_index;
+ uint sql_idx, innodb_idx= 0;
+ ibool changes_fts_column = FALSE;
+ ibool changes_fts_doc_col = FALSE;
+ trx_t* trx = thd_to_trx(thd);
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
- DBUG_ASSERT(a_length != UNIV_SQL_NULL);
- DBUG_ASSERT(b_length != UNIV_SQL_NULL);
+ ut_ad(!srv_read_only_mode);
- mysql_tp = (enum_field_types) mysql_type;
+ n_fields = table->s->fields;
+ clust_index = dict_table_get_first_index(prebuilt->table);
- switch (mysql_tp) {
+ /* We use upd_buff to convert changed fields */
+ buf = (byte*) upd_buff;
- case MYSQL_TYPE_BIT:
- case MYSQL_TYPE_STRING:
- case MYSQL_TYPE_VAR_STRING:
- case MYSQL_TYPE_TINY_BLOB:
- case MYSQL_TYPE_MEDIUM_BLOB:
- case MYSQL_TYPE_BLOB:
- case MYSQL_TYPE_LONG_BLOB:
- case MYSQL_TYPE_VARCHAR:
- /* Use the charset number to pick the right charset struct for
- the comparison. Since the MySQL function get_charset may be
- slow before Bar removes the mutex operation there, we first
- look at 2 common charsets directly. */
+ for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
+ field = table->field[sql_idx];
+ if (!field->stored_in_db)
+ continue;
- if (charset_number == default_charset_info->number) {
- charset = default_charset_info;
- } else if (charset_number == my_charset_latin1.number) {
- charset = &my_charset_latin1;
- } else {
- charset = get_charset(charset_number, MYF(MY_WME));
+ o_ptr = (const byte*) old_row + get_field_offset(table, field);
+ n_ptr = (const byte*) new_row + get_field_offset(table, field);
- if (charset == NULL) {
- sql_print_error("InnoDB needs charset %lu for doing "
- "a comparison, but MySQL cannot "
- "find that charset.",
- (ulong) charset_number);
- ut_a(0);
- }
- }
+ /* Use new_mysql_row_col and col_pack_len to save the values */
- /* Starting from 4.1.3, we use strnncollsp() in comparisons of
- non-latin1_swedish_ci strings. NOTE that the collation order
- changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users
- having indexes on such data need to rebuild their tables! */
+ new_mysql_row_col = n_ptr;
+ col_pack_len = field->pack_length();
- ret = charset->coll->strnncollsp(charset,
- a, a_length,
- b, b_length, 0);
- if (ret < 0) {
- return(-1);
- } else if (ret > 0) {
- return(1);
- } else {
- return(0);
- }
- default:
- ut_error;
- }
+ o_len = col_pack_len;
+ n_len = col_pack_len;
- return(0);
-}
-#ifdef WITH_WSREP
-extern "C" UNIV_INTERN
-int
-wsrep_innobase_mysql_sort(
-/*===============*/
- /* out: str contains sort string */
- int mysql_type, /* in: MySQL type */
- uint charset_number, /* in: number of the charset */
- unsigned char* str, /* in: data field */
- unsigned int str_length, /* in: data field length,
- not UNIV_SQL_NULL */
- unsigned int buf_length) /* in: total str buffer length */
+ /* We use o_ptr and n_ptr to dig up the actual data for
+ comparison. */
-{
- CHARSET_INFO* charset;
- enum_field_types mysql_tp;
- int ret_length = str_length;
+ field_mysql_type = field->type();
- DBUG_ASSERT(str_length != UNIV_SQL_NULL);
+ col_type = prebuilt->table->cols[innodb_idx].mtype;
- mysql_tp = (enum_field_types) mysql_type;
+ switch (col_type) {
- switch (mysql_tp) {
+ case DATA_BLOB:
+ o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
+ n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
- case MYSQL_TYPE_BIT:
- case MYSQL_TYPE_STRING:
- case MYSQL_TYPE_VAR_STRING:
- case MYSQL_TYPE_TINY_BLOB:
- case MYSQL_TYPE_MEDIUM_BLOB:
- case MYSQL_TYPE_BLOB:
- case MYSQL_TYPE_LONG_BLOB:
- case MYSQL_TYPE_VARCHAR:
- {
- uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN];
- uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
+ break;
- /* Use the charset number to pick the right charset struct for
- the comparison. Since the MySQL function get_charset may be
- slow before Bar removes the mutex operation there, we first
- look at 2 common charsets directly. */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
- if (charset_number == default_charset_info->number) {
- charset = default_charset_info;
- } else if (charset_number == my_charset_latin1.number) {
- charset = &my_charset_latin1;
- } else {
- charset = get_charset(charset_number, MYF(MY_WME));
+ o_ptr = row_mysql_read_true_varchar(
+ &o_len, o_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+
+ n_ptr = row_mysql_read_true_varchar(
+ &n_len, n_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+ }
+
+ break;
+ default:
+ ;
+ }
- if (charset == NULL) {
- sql_print_error("InnoDB needs charset %lu for doing "
- "a comparison, but MySQL cannot "
- "find that charset.",
- (ulong) charset_number);
- ut_a(0);
+ if (field_mysql_type == MYSQL_TYPE_LONGLONG
+ && prebuilt->table->fts
+ && innobase_strcasecmp(
+ field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
+ doc_id = (doc_id_t) mach_read_from_n_little_endian(
+ n_ptr, 8);
+ if (doc_id == 0) {
+ return(DB_FTS_INVALID_DOCID);
}
}
@@@ -13725,277 -9359,313 +13727,278 @@@ ha_innobase::get_parent_foreign_key_lis
trx_search_latch_release_if_reserved(prebuilt->trx);
- ib_table = prebuilt->table;
-
- if (flag & HA_STATUS_TIME) {
- if (called_from_analyze || innobase_stats_on_metadata) {
- /* In sql_show we call with this flag: update
- then statistics so that they are up-to-date */
-
- prebuilt->trx->op_info = "updating table statistics";
+ mutex_enter(&(dict_sys->mutex));
- DEBUG_SYNC_C("info_before_stats_update");
+ for (dict_foreign_set::iterator it
+ = prebuilt->table->referenced_set.begin();
+ it != prebuilt->table->referenced_set.end();
+ ++it) {
- dict_update_statistics(
- ib_table,
- FALSE, /* update even if initialized */
- FALSE /* update even if not changed too much */);
+ foreign = *it;
- prebuilt->trx->op_info = "returning various info to MySQL";
+ pf_key_info = get_foreign_key_info(thd, foreign);
+ if (pf_key_info) {
+ f_key_list->push_back(pf_key_info);
}
-
}
- if (flag & HA_STATUS_VARIABLE) {
-
- ulint page_size;
-
- dict_table_stats_lock(ib_table, RW_S_LATCH);
+ mutex_exit(&(dict_sys->mutex));
- n_rows = ib_table->stat_n_rows;
+ prebuilt->trx->op_info = "";
- /* Because we do not protect stat_n_rows by any mutex in a
- delete, it is theoretically possible that the value can be
- smaller than zero! TODO: fix this race.
+ return(0);
+}
- The MySQL optimizer seems to assume in a left join that n_rows
- is an accurate estimate if it is zero. Of course, it is not,
- since we do not have any locks on the rows yet at this phase.
- Since SHOW TABLE STATUS seems to call this function with the
- HA_STATUS_TIME flag set, while the left join optimizer does not
- set that flag, we add one to a zero value if the flag is not
- set. That way SHOW TABLE STATUS will show the best estimate,
- while the optimizer never sees the table empty. */
+/*****************************************************************//**
+Checks if ALTER TABLE may change the storage engine of the table.
+Changing storage engines is not allowed for tables for which there
+are foreign key constraints (parent or child tables).
+@return TRUE if can switch engines */
+UNIV_INTERN
+bool
+ha_innobase::can_switch_engines(void)
+/*=================================*/
+{
+ bool can_switch;
- if (n_rows < 0) {
- n_rows = 0;
- }
+ DBUG_ENTER("ha_innobase::can_switch_engines");
+ update_thd();
- if (n_rows == 0 && !(flag & HA_STATUS_TIME)) {
- n_rows++;
- }
+ prebuilt->trx->op_info =
+ "determining if there are foreign key constraints";
+ row_mysql_freeze_data_dictionary(prebuilt->trx);
- /* Fix bug#40386: Not flushing query cache after truncate.
- n_rows can not be 0 unless the table is empty, set to 1
- instead. The original problem of bug#29507 is actually
- fixed in the server code. */
- if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) {
+ can_switch = prebuilt->table->referenced_set.empty()
+ && prebuilt->table->foreign_set.empty();
- n_rows = 1;
+ row_mysql_unfreeze_data_dictionary(prebuilt->trx);
+ prebuilt->trx->op_info = "";
- /* We need to reset the prebuilt value too, otherwise
- checks for values greater than the last value written
- to the table will fail and the autoinc counter will
- not be updated. This will force write_row() into
- attempting an update of the table's AUTOINC counter. */
+ DBUG_RETURN(can_switch);
+}
- prebuilt->autoinc_last_value = 0;
- }
+/*******************************************************************//**
+Checks if a table is referenced by a foreign key. The MySQL manual states that
+a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a
+delete is then allowed internally to resolve a duplicate key conflict in
+REPLACE, not an update.
+@return > 0 if referenced by a FOREIGN KEY */
+UNIV_INTERN
+uint
+ha_innobase::referenced_by_foreign_key(void)
+/*========================================*/
+{
+ if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) {
- page_size = dict_table_zip_size(ib_table);
- if (page_size == 0) {
- page_size = UNIV_PAGE_SIZE;
- }
+ return(1);
+ }
- stats.records = (ha_rows)n_rows;
- stats.deleted = 0;
- stats.data_file_length
- = ((ulonglong) ib_table->stat_clustered_index_size)
- * page_size;
- stats.index_file_length =
- ((ulonglong) ib_table->stat_sum_of_other_index_sizes)
- * page_size;
+ return(0);
+}
- dict_table_stats_unlock(ib_table, RW_S_LATCH);
+/*******************************************************************//**
+Frees the foreign key create info for a table stored in InnoDB, if it is
+non-NULL. */
+UNIV_INTERN
+void
+ha_innobase::free_foreign_key_create_info(
+/*======================================*/
+ char* str) /*!< in, own: create info string to free */
+{
+ if (str) {
+ my_free(str);
+ }
+}
- /* Since fsp_get_available_space_in_free_extents() is
- acquiring latches inside InnoDB, we do not call it if we
- are asked by MySQL to avoid locking. Another reason to
- avoid the call is that it uses quite a lot of CPU.
- See Bug#38185. */
- if (flag & HA_STATUS_NO_LOCK
- || !(flag & HA_STATUS_VARIABLE_EXTRA)) {
- /* We do not update delete_length if no
- locking is requested so the "old" value can
- remain. delete_length is initialized to 0 in
- the ha_statistics' constructor. Also we only
- need delete_length to be set when
- HA_STATUS_VARIABLE_EXTRA is set */
- } else if (UNIV_UNLIKELY
- (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) {
- /* Avoid accessing the tablespace if
- innodb_crash_recovery is set to a high value. */
- stats.delete_length = 0;
- } else {
- ullint avail_space;
+/*******************************************************************//**
+Tells something additional to the handler about how to do things.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::extra(
+/*===============*/
+ enum ha_extra_function operation)
+ /*!< in: HA_EXTRA_FLUSH or some other flag */
+{
+ check_trx_exists(ha_thd());
- avail_space = fsp_get_available_space_in_free_extents(
- ib_table->space);
+ /* Warning: since it is not guaranteed that MySQL calls external_lock
+ before calling this function, the trx field in prebuilt can be
+ obsolete! */
- if (avail_space == ULLINT_UNDEFINED) {
- THD* thd;
+ switch (operation) {
+ case HA_EXTRA_FLUSH:
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+ break;
+ case HA_EXTRA_RESET_STATE:
+ reset_template();
+ thd_to_trx(ha_thd())->duplicates = 0;
+ break;
+ case HA_EXTRA_NO_KEYREAD:
+ prebuilt->read_just_key = 0;
+ break;
+ case HA_EXTRA_KEYREAD:
+ prebuilt->read_just_key = 1;
+ break;
+ case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ prebuilt->keep_other_fields_on_keyread = 1;
+ break;
- thd = ha_thd();
+ /* IMPORTANT: prebuilt->trx can be obsolete in
+ this method, because it is not guaranteed that MySQL
+ calls external_lock before this method with the
+ parameters below. We must not invoke update_thd()
+ either, because the calling threads may change.
+ CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
+ case HA_EXTRA_INSERT_WITH_UPDATE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_NO_IGNORE_DUP_KEY:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_WRITE_CAN_REPLACE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+ break;
+ case HA_EXTRA_WRITE_CANNOT_REPLACE:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+ break;
+ default:/* Do nothing */
+ ;
+ }
- push_warning_printf(
- thd,
- MYSQL_ERROR::WARN_LEVEL_WARN,
- ER_CANT_GET_STAT,
- "InnoDB: Trying to get the free "
- "space for table %s but its "
- "tablespace has been discarded or "
- "the .ibd file is missing. Setting "
- "the free space to zero.",
- ib_table->name);
+ return(0);
+}
- stats.delete_length = 0;
- } else {
- stats.delete_length = avail_space * 1024;
- }
- }
+/******************************************************************//**
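+Resets the handler state: frees the blob heap if one exists, resets the row template, calls ds_mrr.dsmrr_close() and zeroes the statement-level autoinc_last_value. @return 0 */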
+UNIV_INTERN
+int
+ha_innobase::reset()
+/*================*/
+{
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
- stats.check_time = 0;
- stats.mrr_length_per_rec= ref_length + 8; // 8 = max(sizeof(void *));
+ reset_template();
+ ds_mrr.dsmrr_close();
+ /* TODO: This should really be reset in reset_template() but for now
+ it's safer to do it explicitly here. */
- if (stats.records == 0) {
- stats.mean_rec_length = 0;
- } else {
- stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records);
- }
- }
+ /* This is a statement level counter. */
+ prebuilt->autoinc_last_value = 0;
- if (flag & HA_STATUS_CONST) {
- ulong i;
- /* Verify the number of index in InnoDB and MySQL
- matches up. If prebuilt->clust_index_was_generated
- holds, InnoDB defines GEN_CLUST_INDEX internally */
- ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
- - prebuilt->clust_index_was_generated;
+ return(0);
+}
- if (table->s->keys != num_innodb_index) {
- sql_print_error("Table %s contains %lu "
- "indexes inside InnoDB, which "
- "is different from the number of "
- "indexes %u defined in the MySQL ",
- ib_table->name, num_innodb_index,
- table->s->keys);
- }
+/******************************************************************//**
+MySQL calls this function at the start of each SQL statement inside LOCK
+TABLES. Inside LOCK TABLES the ::external_lock method does not work to
+mark SQL statement borders. Note also a special case: if a temporary table
+is created inside LOCK TABLES, MySQL has not called external_lock() at all
+on that table.
+MySQL-5.0 also calls this before each statement in an execution of a stored
+procedure. To make the execution more deterministic for binlogging, MySQL-5.0
+locks all tables involved in a stored procedure with full explicit table
+locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
+procedure.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::start_stmt(
+/*====================*/
+ THD* thd, /*!< in: handle to the user thread */
+ thr_lock_type lock_type)
+{
+ trx_t* trx;
+ DBUG_ENTER("ha_innobase::start_stmt");
- dict_table_stats_lock(ib_table, RW_S_LATCH);
+ update_thd(thd);
- for (i = 0; i < table->s->keys; i++) {
- ulong j;
- rec_per_key = 1;
- /* We could get index quickly through internal
- index mapping with the index translation table.
- The identity of index (match up index name with
- that of table->key_info[i]) is already verified in
- innobase_get_index(). */
- index = innobase_get_index(i);
+ trx = prebuilt->trx;
- if (index == NULL) {
- sql_print_error("Table %s contains fewer "
- "indexes inside InnoDB than "
- "are defined in the MySQL "
- ".frm file. Have you mixed up "
- ".frm files from different "
- "installations? See "
- REFMAN
- "innodb-troubleshooting.html\n",
- ib_table->name);
- break;
- }
+ /* Here we release the search latch and the InnoDB thread FIFO ticket
+ if they were reserved. They should have been released already at the
+ end of the previous statement, but because inside LOCK TABLES the
+ lock count method does not work to mark the end of a SELECT statement,
+ that may not be the case. We MUST release the search latch before an
+ INSERT, for example. */
- for (j = 0; j < table->key_info[i].key_parts; j++) {
+ trx_search_latch_release_if_reserved(trx);
- if (j + 1 > index->n_uniq) {
- sql_print_error(
-"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking "
-"statistics for %lu columns. Have you mixed up .frm files from different "
-"installations? "
-"See " REFMAN "innodb-troubleshooting.html\n",
- index->name,
- ib_table->name,
- (unsigned long)
- index->n_uniq, j + 1);
- break;
- }
+ innobase_srv_conc_force_exit_innodb(trx);
- rec_per_key = innodb_rec_per_key(
- index, j, stats.records);
+ /* Reset the AUTOINC statement level counter for multi-row INSERTs. */
+ trx->n_autoinc_rows = 0;
- /* Since MySQL seems to favor table scans
- too much over index searches, we pretend
- index selectivity is 2 times better than
- our estimate: */
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+ reset_template();
- rec_per_key = rec_per_key / 2;
+ if (dict_table_is_temporary(prebuilt->table)
+ && prebuilt->mysql_has_locked
+ && prebuilt->select_lock_type == LOCK_NONE) {
+ dberr_t error;
- if (rec_per_key == 0) {
- rec_per_key = 1;
- }
+ switch (thd_sql_command(thd)) {
+ case SQLCOM_INSERT:
+ case SQLCOM_UPDATE:
+ case SQLCOM_DELETE:
++ case SQLCOM_REPLACE:
+ init_table_handle_for_HANDLER();
+ prebuilt->select_lock_type = LOCK_X;
+ prebuilt->stored_select_lock_type = LOCK_X;
+ error = row_lock_table_for_mysql(prebuilt, NULL, 1);
- table->key_info[i].rec_per_key[j]=
- rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
- (ulong) rec_per_key;
+ if (error != DB_SUCCESS) {
+ int st = convert_error_code_to_mysql(
+ error, 0, thd);
+ DBUG_RETURN(st);
}
-
- KEY *key_info= table->key_info+i;
- key_part_map ext_key_part_map=
- key_info->ext_key_part_map;
-
- if (key_info->key_parts != key_info->ext_key_parts) {
-
- KEY *pk_key_info= key_info+
- table->s->primary_key;
- uint k = key_info->key_parts;
- ha_rows k_rec_per_key = rec_per_key;
- uint pk_parts = pk_key_info->key_parts;
-
- index= innobase_get_index(
- table->s->primary_key);
-
- n_rows= ib_table->stat_n_rows;
-
- for (j = 0; j < pk_parts; j++) {
-
- if (ext_key_part_map & 1<<j) {
-
- rec_per_key =
- innodb_rec_per_key(index,
- j, stats.records);
-
- if (rec_per_key == 0) {
- rec_per_key = 1;
- }
- else if (rec_per_key > 1) {
- rec_per_key =
- (ha_rows)
- (k_rec_per_key *
- (double)rec_per_key /
- n_rows);
- }
-
- key_info->rec_per_key[k++]=
- rec_per_key >= ~(ulong) 0 ?
- ~(ulong) 0 :
- (ulong) rec_per_key;
-
- }
- }
- }
+ break;
}
+ }
- dict_table_stats_unlock(ib_table, RW_S_LATCH);
-
- my_snprintf(path, sizeof(path), "%s/%s%s",
- mysql_data_home,
- table->s->normalized_path.str,
- reg_ext);
+ if (!prebuilt->mysql_has_locked) {
+ /* This handle is for a temporary table created inside
+ this same LOCK TABLES; since MySQL does NOT call external_lock
+ in this case, we must use x-row locks inside InnoDB to be
+ prepared for an update of a row */
- unpack_filename(path,path);
+ prebuilt->select_lock_type = LOCK_X;
- /* Note that we do not know the access time of the table,
- nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
+ } else if (trx->isolation_level != TRX_ISO_SERIALIZABLE
+ && thd_sql_command(thd) == SQLCOM_SELECT
+ && lock_type == TL_READ) {
- if (os_file_get_status(path,&stat_info)) {
- stats.create_time = (ulong) stat_info.ctime;
- }
- }
+ /* For other than temporary tables, we obtain
+ no lock for consistent read (plain SELECT). */
- if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ prebuilt->select_lock_type = LOCK_NONE;
+ } else {
+ /* Not a consistent read: restore the
+ select_lock_type value. The value of
+ stored_select_lock_type was decided in:
+ 1) ::store_lock(),
+ 2) ::external_lock(),
+ 3) ::init_table_handle_for_HANDLER(), and
+ 4) ::transactional_table_lock(). */
- goto func_exit;
+ ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET);
+ prebuilt->select_lock_type = prebuilt->stored_select_lock_type;
}
- if (flag & HA_STATUS_ERRKEY) {
- const dict_index_t* err_index;
-
- ut_a(prebuilt->trx);
- ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
-
- err_index = trx_get_error_info(prebuilt->trx);
+ *trx->detailed_error = 0;
- if (err_index) {
- errkey = innobase_get_mysql_key_number_for_index(
- share, table, ib_table, err_index);
- } else {
- errkey = (unsigned int) prebuilt->trx->error_key_num;
- }
- }
+ innobase_register_trx(ht, thd, trx);
- if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
- stats.auto_increment_value = innobase_peek_autoinc();
+ if (!trx_is_started(trx)) {
+ ++trx->will_lock;
}
-func_exit:
- prebuilt->trx->op_info = (char*)"";
-
DBUG_RETURN(0);
}
@@@ -17689,22 -12815,15 +17692,24 @@@ wsrep_innobase_kill_one_trx
wsrep_thd_thread_id(thd),
victim_trx->id);
- WSREP_DEBUG("Aborting query: %s",
- (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void");
+ WSREP_DEBUG("Aborting query: %s conf %d trx: %lu",
+ (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void",
+ wsrep_thd_conflict_state(thd),
+ wsrep_thd_ws_handle(thd)->trx_id);
wsrep_thd_LOCK(thd);
+ DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock",
+ {
+ const char act[]=
+ "now "
+ "wait_for signal.wsrep_after_BF_victim_lock";
+ DBUG_ASSERT(!debug_sync_set_action(bf_thd,
+ STRING_WITH_LEN(act)));
+ };);
+
if (wsrep_thd_query_state(thd) == QUERY_EXITING) {
- WSREP_DEBUG("kill trx EXITING for %llu", victim_trx->id);
+ WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id);
wsrep_thd_UNLOCK(thd);
DBUG_RETURN(0);
}
@@@ -17751,15 -12870,13 +17756,15 @@@
} else {
rcode = wsrep->abort_pre_commit(
wsrep, bf_seqno,
- (wsrep_trx_id_t)victim_trx->id
+ (wsrep_trx_id_t)wsrep_thd_ws_handle(thd)->trx_id
);
+
switch (rcode) {
case WSREP_WARNING:
- WSREP_DEBUG("cancel commit warning: %llu",
+ WSREP_DEBUG("cancel commit warning: %lu",
victim_trx->id);
wsrep_thd_UNLOCK(thd);
+ wsrep_thd_awake(thd, signal);
DBUG_RETURN(1);
break;
case WSREP_OK:
@@@ -17871,40 -12988,34 +17876,42 @@@
break;
}
default:
- WSREP_WARN("bad wsrep query state: %d",
+ WSREP_WARN("bad wsrep query state: %d",
wsrep_thd_query_state(thd));
+ wsrep_thd_UNLOCK(thd);
break;
}
- wsrep_thd_UNLOCK(thd);
-
+
DBUG_RETURN(0);
}
-static int
-wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd,
- my_bool signal)
+
+static
+int
+wsrep_abort_transaction(
+ handlerton* hton,
+ THD *bf_thd,
+ THD *victim_thd,
+ my_bool signal)
{
DBUG_ENTER("wsrep_innobase_abort_thd");
- trx_t* victim_trx = thd_to_trx(victim_thd);
- trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL;
+
+ trx_t* victim_trx = thd_to_trx(victim_thd);
+ trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL;
- WSREP_DEBUG("abort transaction: BF: %s victim: %s",
- wsrep_thd_query(bf_thd),
- wsrep_thd_query(victim_thd));
+ WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d",
+ wsrep_thd_query(bf_thd),
+ wsrep_thd_query(victim_thd),
+ wsrep_thd_conflict_state(victim_thd));
- ut_ad(!mutex_own(&kernel_mutex));
-
- if (victim_trx)
- {
- int rcode = wsrep_innobase_kill_one_trx(
- bf_thd, bf_trx, victim_trx, signal, FALSE);
+ if (victim_trx) {
+ lock_mutex_enter();
+ trx_mutex_enter(victim_trx);
+ victim_trx->abort_type = TRX_WSREP_ABORT;
+ int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx,
+ victim_trx, signal);
+ trx_mutex_exit(victim_trx);
+ lock_mutex_exit();
+ victim_trx->abort_type = TRX_SERVER_ABORT;
wsrep_srv_conc_cancel_wait(victim_trx);
DBUG_RETURN(rcode);
} else {
diff --cc storage/innobase/os/os0file.cc
index df096dcc6fd,00000000000..d4b8e82b0d8
mode 100644,000000..100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@@ -1,5785 -1,0 +1,5785 @@@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2018, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.cc
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+
+#ifdef UNIV_NONINL
+#include "os0file.ic"
+#endif
+
+#include "ut0mem.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "srv0mon.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+# include "os0thread.h"
+#else /* !UNIV_HOTBACKUP */
+# ifdef __WIN__
+/* Add includes for the _stat() call to compile on Windows */
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <errno.h>
+# endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(LINUX_NATIVE_AIO)
+#include <libaio.h>
+#endif
+
+/** Insert buffer segment id */
+static const ulint IO_IBUF_SEGMENT = 0;
+
+/** Log segment id */
+static const ulint IO_LOG_SEGMENT = 1;
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = 0;
+#endif /* __WIN__ */
+
+#ifndef UNIV_HOTBACKUP
+/* We use these mutexes to protect lseek + file i/o operation, if the
+OS does not provide an atomic pread or pwrite, or similar */
+#define OS_FILE_N_SEEK_MUTEXES 16
+UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
+
+/* In simulated aio, merge at most this many consecutive i/os */
+#define OS_AIO_MERGE_N_CONSECUTIVE 64
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
+#else
+#define WAIT_ALLOW_WRITES() do { } while (0)
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+/**********************************************************************
+
+InnoDB AIO Implementation:
+=========================
+
+We support native AIO for Windows and Linux. For the rest of the platforms
+we simulate AIO by special io-threads servicing the IO-requests.
+
+Simulated AIO:
+==============
+
+On platforms where we 'simulate' AIO, the following is a rough explanation
+of the high-level design.
+There are four io-threads (for ibuf, log, read, write).
+All synchronous IO requests are serviced by the calling thread using
+os_file_write/os_file_read. The Asynchronous requests are queued up
+in an array (there are four such arrays) by the calling thread.
+Later these requests are picked up by the io-thread and are serviced
+synchronously.
+
+Windows native AIO:
+==================
+
+If srv_use_native_aio is not set then Windows follows the same
+code path as simulated AIO. If the flag is set then the native AIO
+interface is used. On Windows, one of the limitations is that if a file
+is opened for AIO no synchronous IO can be done on it. Therefore we have
+an extra fifth array to queue up synchronous IO requests.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO. No thread is
+required for the sync array.
+If a synchronous IO request is made, it is first queued in the sync
+array. Then the calling thread itself waits on the request, thus
+making the call synchronous.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the requests. The helper thread then collects
+the completed IO request and calls completion routine on it.
+
+Linux native AIO:
+=================
+
+If we have libaio installed on the system and innodb_use_native_aio
+is set to TRUE we follow the code path of native AIO, otherwise we
+do simulated AIO.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO.
+If a synchronous IO request is made, it is handled by calling
+os_file_write/os_file_read.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the requests. The helper thread then collects
+the completed IO request and calls completion routine on it.
+
+**********************************************************************/
+
+/** Flag: enable debug printout for asynchronous i/o */
+UNIV_INTERN ibool os_aio_print_debug = FALSE;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
+#endif /* UNIV_PFS_IO */
+
+/** The asynchronous i/o array slot structure */
+struct os_aio_slot_t{
+ ibool is_read; /*!< TRUE if a read operation */
+ ulint pos; /*!< index of the slot in the aio
+ array */
+ ibool reserved; /*!< TRUE if this slot is reserved */
+ time_t reservation_time;/*!< time when reserved */
+ ulint len; /*!< length of the block to read or
+ write */
+ byte* buf; /*!< buffer used in i/o */
+ ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
+ os_offset_t offset; /*!< file offset in bytes */
+ pfs_os_file_t file; /*!< file where to read or write */
+ const char* name; /*!< file name or path */
+ ibool io_already_done;/*!< used only in simulated aio:
+ TRUE if the physical i/o already
+ made and only the slot message
+ needs to be passed to the caller
+ of os_aio_simulated_handle */
+ fil_node_t* message1; /*!< first message given by the requester */
+ void* message2; /*!< second message given by the requester
+ of an aio operation; the messages can
+ be used to identify which pending aio
+ operation was completed */
+#ifdef WIN_ASYNC_IO
+ HANDLE handle; /*!< handle object we need in the
+ OVERLAPPED struct */
+ OVERLAPPED control; /*!< Windows control block for the
+ aio request */
+#elif defined(LINUX_NATIVE_AIO)
+ struct iocb control; /* Linux control block for aio */
+ int n_bytes; /* bytes written/read. */
+ int ret; /* AIO return code */
+#endif /* WIN_ASYNC_IO */
+};
+
+/** The asynchronous i/o array structure */
+struct os_aio_array_t{
+ os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */
+ os_event_t not_full;
+ /*!< The event which is set to the
+ signaled state when there is space in
+ the aio outside the ibuf segment;
+ os_event_set() and os_event_reset()
+ are protected by os_aio_array_t::mutex */
+ os_event_t is_empty;
+ /*!< The event which is set to the
+ signaled state when there are no
+ pending i/os in this array;
+ os_event_set() and os_event_reset()
+ are protected by os_aio_array_t::mutex */
+ ulint n_slots;/*!< Total number of slots in the aio
+ array. This must be divisible by
+ n_threads. */
+ ulint n_segments;
+ /*!< Number of segments in the aio
+ array of pending aio requests. A
+ thread can wait separately for any one
+ of the segments. */
+ ulint cur_seg;/*!< We reserve IO requests in round
+ robin fashion to different segments.
+ This points to the segment that is to
+ be used to service next IO request. */
+ ulint n_reserved;
+ /*!< Number of reserved slots in the
+ aio array outside the ibuf segment */
+ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
+#ifdef __WIN__
+ HANDLE* handles;
+ /*!< Pointer to an array of OS native
+ event handles where we copied the
+ handles from slots, in the same
+ order. This can be used in
+ WaitForMultipleObjects; used only in
+ Windows */
+#endif /* __WIN__ */
+
+#if defined(LINUX_NATIVE_AIO)
+ io_context_t* aio_ctx;
+ /* completion queue for IO. There is
+ one such queue per segment. Each thread
+ will work on one ctx exclusively. */
+ struct io_event* aio_events;
+ /* The array to collect completed IOs.
+ There is one such event for each
+ possible pending IO. The size of the
+ array is equal to n_slots. */
+#endif /* LINUX_NATIVE_AIO */
+};
+
+#if defined(LINUX_NATIVE_AIO)
+/** timeout for each io_getevents() call = 500ms. */
+#define OS_AIO_REAP_TIMEOUT (500000000UL)
+
+/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
+#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
+
+/** number of attempts before giving up on io_setup(). */
+#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
+#endif
+
+/** Array of events used in simulated aio. */
+static os_event_t* os_aio_segment_wait_events;
+
+/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+are NULL when the module has not yet been initialized. @{ */
+static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
+static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
+static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
+static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
+static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
+/* @} */
+
+/** Number of asynchronous I/O segments. Set by os_aio_init(). */
+static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+/** If the following is TRUE, read i/o handler threads try to
+wait until a batch of new read requests have been posted */
+static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+UNIV_INTERN ulint os_n_file_reads = 0;
+UNIV_INTERN ulint os_bytes_read_since_printout = 0;
+UNIV_INTERN ulint os_n_file_writes = 0;
+UNIV_INTERN ulint os_n_fsyncs = 0;
+UNIV_INTERN ulint os_n_file_reads_old = 0;
+UNIV_INTERN ulint os_n_file_writes_old = 0;
+UNIV_INTERN ulint os_n_fsyncs_old = 0;
+UNIV_INTERN time_t os_last_printout;
+
+UNIV_INTERN ibool os_has_said_disk_full = FALSE;
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Validates the consistency of the aio system some of the time.
+@return TRUE if ok or the check was skipped */
+UNIV_INTERN
+ibool
+os_aio_validate_skip(void)
+/*======================*/
+{
+/** Try os_aio_validate() every this many times */
+# define OS_AIO_VALIDATE_SKIP 13
+
+ /** The os_aio_validate() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly os_aio_validate()
+ check in debug builds. */
+ if (--os_aio_validate_count > 0) {
+ return(TRUE);
+ }
+
+ os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+ return(os_aio_validate());
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+
+#ifdef __WIN__
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7. */
+UNIV_INTERN
+ulint
+os_get_os_version(void)
+/*===================*/
+{
+ OSVERSIONINFO os_info;
+
+ os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+ ut_a(GetVersionEx(&os_info));
+
+ if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
+ return(OS_WIN31);
+ } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
+ return(OS_WIN95);
+ } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+ switch (os_info.dwMajorVersion) {
+ case 3:
+ case 4:
+ return(OS_WINNT);
+ case 5:
+ return (os_info.dwMinorVersion == 0)
+ ? OS_WIN2000 : OS_WINXP;
+ case 6:
+ return (os_info.dwMinorVersion == 0)
+ ? OS_WINVISTA : OS_WIN7;
+ default:
+ return(OS_WIN7);
+ }
+ } else {
+ ut_error;
+ return(0);
+ }
+}
+#endif /* __WIN__ */
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+static
+ulint
+os_file_get_last_error_low(
+/*=======================*/
+ bool report_all_errors, /*!< in: TRUE if we want an error
+ message printed of all errors */
+ bool on_error_silent) /*!< in: TRUE then don't print any
+ diagnostic to the log */
+{
+#ifdef __WIN__
+
+ ulint err = (ulint) GetLastError();
+ if (err == ERROR_SUCCESS) {
+ return(0);
+ }
+
+ if (report_all_errors
+ || (!on_error_silent
+ && err != ERROR_DISK_FULL
+ && err != ERROR_FILE_EXISTS)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %lu"
+ " in a file operation.\n", (ulong) err);
+
+ if (err == ERROR_PATH_NOT_FOUND) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == ERROR_ACCESS_DENIED) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory. It may also be"
+ " you have created a subdirectory\n"
+ "InnoDB: of the same name as a data file.\n");
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ fprintf(stderr,
+ "InnoDB: The error means that another program"
+ " is using InnoDB's files.\n"
+ "InnoDB: This might be a backup or antivirus"
+ " software or another instance\n"
+ "InnoDB: of MySQL."
+ " Please close it to get rid of this error.\n");
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ fprintf(stderr,
+ "InnoDB: The error means that there are no"
+ " sufficient system resources or quota to"
+ " complete the operation.\n");
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ fprintf(stderr,
+ "InnoDB: The error means that the I/O"
+ " operation has been aborted\n"
+ "InnoDB: because of either a thread exit"
+ " or an application request.\n"
+ "InnoDB: Retry attempt is made.\n");
+ } else {
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ if (err == ERROR_FILE_NOT_FOUND) {
+ return(OS_FILE_NOT_FOUND);
+ } else if (err == ERROR_DISK_FULL) {
+ return(OS_FILE_DISK_FULL);
+ } else if (err == ERROR_FILE_EXISTS) {
+ return(OS_FILE_ALREADY_EXISTS);
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ return(OS_FILE_SHARING_VIOLATION);
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ return(OS_FILE_INSUFFICIENT_RESOURCE);
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ return(OS_FILE_OPERATION_ABORTED);
+ } else if (err == ERROR_ACCESS_DENIED) {
+ return(OS_FILE_ACCESS_VIOLATION);
+ } else if (err == ERROR_BUFFER_OVERFLOW) {
+ return(OS_FILE_NAME_TOO_LONG);
+ } else {
+ return(OS_FILE_ERROR_MAX + err);
+ }
+#else
+ int err = errno;
+ if (err == 0) {
+ return(0);
+ }
+
+ if (report_all_errors
+ || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %d"
+ " in a file operation.\n", err);
+
+ if (err == ENOENT) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == EACCES) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory.\n");
+ } else {
+ if (strerror(err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d"
+ " means '%s'.\n",
+ err, strerror(err));
+ }
+
+
+ fprintf(stderr,
+ "InnoDB: Some operating system"
+ " error numbers are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ switch (err) {
+ case ENOSPC:
+ return(OS_FILE_DISK_FULL);
+ case ENOENT:
+ return(OS_FILE_NOT_FOUND);
+ case EEXIST:
+ return(OS_FILE_ALREADY_EXISTS);
+ case ENAMETOOLONG:
+ return(OS_FILE_NAME_TOO_LONG);
+ case EXDEV:
+ case ENOTDIR:
+ case EISDIR:
+ return(OS_FILE_PATH_ERROR);
+ case EAGAIN:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_RESOURCES_RESERVED);
+ }
+ break;
+ case EINTR:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_INTERRUPTED);
+ }
+ break;
+ case EACCES:
+ return(OS_FILE_ACCESS_VIOLATION);
+ }
+ return(OS_FILE_ERROR_MAX + err);
+#endif
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+ bool report_all_errors) /*!< in: TRUE if we want an error
+ message printed of all errors */
+{
+ return(os_file_get_last_error_low(report_all_errors, false));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+Conditionally exits (calling exit(3)) based on the should_exit value and the
+error type. If should_exit is TRUE then on_error_silent is ignored.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_cond_exit(
+/*===========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool should_exit, /*!< in: call exit(3) if unknown error
+ and this parameter is TRUE */
+ ibool on_error_silent)/*!< in: if TRUE then don't print
+ any message to the log iff it is
+ an unknown non-fatal error */
+{
+ ulint err;
+
+ err = os_file_get_last_error_low(false, on_error_silent);
+
+ switch (err) {
+ case OS_FILE_DISK_FULL:
+ /* We only print a warning about disk full once */
+
+ if (os_has_said_disk_full) {
+
+ return(FALSE);
+ }
+
+ /* Disk full error is reported irrespective of the
+ on_error_silent setting. */
+
+ if (name) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Encountered a problem with"
+ " file %s\n", name);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Disk is full. Try to clean the disk"
+ " to free space.\n");
+
+ os_has_said_disk_full = TRUE;
+
+ fflush(stderr);
+ ut_error;
+ return(FALSE);
+
+ case OS_FILE_AIO_RESOURCES_RESERVED:
+ case OS_FILE_AIO_INTERRUPTED:
+
+ return(TRUE);
+
+ case OS_FILE_PATH_ERROR:
+ case OS_FILE_ALREADY_EXISTS:
+ case OS_FILE_ACCESS_VIOLATION:
+
+ return(FALSE);
+
+ case OS_FILE_SHARING_VIOLATION:
+
+ os_thread_sleep(10000000); /* 10 sec */
+ return(TRUE);
+
+ case OS_FILE_OPERATION_ABORTED:
+ case OS_FILE_INSUFFICIENT_RESOURCE:
+
+ os_thread_sleep(100000); /* 100 ms */
+ return(TRUE);
+
+ default:
+
+ /* If it is an operation that can crash on error then it
+ is better to ignore on_error_silent and print an error message
+ to the log. */
+
+ if (should_exit || !on_error_silent) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
+ "error " ULINTPF ".%s", name ? name : "(unknown)",
+ operation, err, should_exit
+ ? " Cannot continue operation" : "");
+ }
+
+ if (should_exit) {
+ exit(1);
+ }
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error(
+/*=================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation) /*!< in: operation */
+{
+ /* exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool on_error_silent)/*!< in: if TRUE then don't print
+ any message to the log. */
+{
+ /* don't exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(
+ name, operation, FALSE, on_error_silent));
+}
+
+#undef USE_FILE_LOCK
+#define USE_FILE_LOCK
+#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
+/* InnoDB Hot Backup does not lock the data files.
+ * On Windows, mandatory locking is used.
+ */
+# undef USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/****************************************************************//**
+Obtain an exclusive lock on a file.
+@return 0 on success */
+static
+int
+os_file_lock(
+/*=========*/
+ int fd, /*!< in: file descriptor */
+ const char* name) /*!< in: file name */
+{
+ struct flock lk;
+
+ ut_ad(!srv_read_only_mode);
+
+ lk.l_type = F_WRLCK;
+ lk.l_whence = SEEK_SET;
+ lk.l_start = lk.l_len = 0;
+
+ if (fcntl(fd, F_SETLK, &lk) == -1) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to lock %s, error: %d", name, errno);
+
+ if (errno == EAGAIN || errno == EACCES) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Check that you do not already have "
+ "another mysqld process using the "
+ "same InnoDB data or log files.");
+ }
+
+ return(-1);
+ }
+
+ return(0);
+}
+#endif /* USE_FILE_LOCK */
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void)
+/*===================*/
+{
+ for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
+ os_file_seek_mutexes[i] = os_mutex_create();
+ }
+}
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the location given by the path parameter.
+If the path is null then the file is created in the location named by the
+MySQL server configuration parameter (--tmpdir).
+@param[in] path location for creating temporary file
+@return temporary file handle, or NULL on error */
+UNIV_INTERN
+FILE*
+os_file_create_tmpfile(
+ const char* path)
+{
+ FILE* file = NULL;
+ WAIT_ALLOW_WRITES();
+ int fd = innobase_mysql_tmpfile(path);
+
+ ut_ad(!srv_read_only_mode);
+
+ if (fd >= 0) {
+ file = fdopen(fd, "w+b");
+ }
+
+ if (!file) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: unable to create temporary file;"
+ " errno: %d\n", errno);
+ if (fd >= 0) {
+ close(fd);
+ }
+ }
+
+ return(file);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ const char* dirname, /*!< in: directory name; it must not
+ contain a trailing '\' or '/' */
+ ibool error_is_fatal) /*!< in: TRUE if we should treat an
+ error as a fatal error; if we try to
+ open symlinks then we do not wish a
+ fatal error if it happens not to be
+ a directory */
+{
+ os_file_dir_t dir;
+#ifdef __WIN__
+ LPWIN32_FIND_DATA lpFindFileData;
+ char path[OS_FILE_MAX_PATH + 3];
+
+ ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
+
+ strcpy(path, dirname);
+ strcpy(path + strlen(path), "\\*");
+
+ /* Note that in Windows opening the 'directory stream' also retrieves
+ the first entry in the directory. Since it is '.', that is no problem,
+ as we will skip over the '.' and '..' entries anyway. */
+
+ lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+ ut_malloc(sizeof(WIN32_FIND_DATA)));
+
+ dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
+
+ ut_free(lpFindFileData);
+
+ if (dir == INVALID_HANDLE_VALUE) {
+
+ if (error_is_fatal) {
+ os_file_handle_error(dirname, "opendir");
+ }
+
+ return(NULL);
+ }
+
+ return(dir);
+#else
+ dir = opendir(dirname);
+
+ if (dir == NULL && error_is_fatal) {
+ os_file_handle_error(dirname, "opendir");
+ }
+
+ return(dir);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Closes a directory stream.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+ os_file_dir_t dir) /*!< in: directory stream */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ret = FindClose(dir);
+
+ if (!ret) {
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+
+ return(-1);
+ }
+
+ return(0);
+#else
+ int ret;
+
+ ret = closedir(dir);
+
+ if (ret) {
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+ }
+
+ return(ret);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
+{
+#ifdef __WIN__
+ LPWIN32_FIND_DATA lpFindFileData;
+ BOOL ret;
+
+ lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+ ut_malloc(sizeof(WIN32_FIND_DATA)));
+next_file:
+ ret = FindNextFile(dir, lpFindFileData);
+
+ if (ret) {
+ ut_a(strlen((char*) lpFindFileData->cFileName)
+ < OS_FILE_MAX_PATH);
+
+ if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
+ || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
+
+ goto next_file;
+ }
+
+ strcpy(info->name, (char*) lpFindFileData->cFileName);
+
+ info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
+ + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
+ << 32);
+
+ if (lpFindFileData->dwFileAttributes
+ & FILE_ATTRIBUTE_REPARSE_POINT) {
+ /* TODO: test Windows symlinks */
+ /* TODO: MySQL has apparently its own symlink
+ implementation in Windows, dbname.sym can
+ redirect a database directory:
+ REFMAN "windows-symbolic-links.html" */
+ info->type = OS_FILE_TYPE_LINK;
+ } else if (lpFindFileData->dwFileAttributes
+ & FILE_ATTRIBUTE_DIRECTORY) {
+ info->type = OS_FILE_TYPE_DIR;
+ } else {
+ /* It is probably safest to assume that all other
+ file types are normal. Better to check them rather
+ than blindly skip them. */
+
+ info->type = OS_FILE_TYPE_FILE;
+ }
+ }
+
+ ut_free(lpFindFileData);
+
+ if (ret) {
+ return(0);
+ } else if (GetLastError() == ERROR_NO_MORE_FILES) {
+
+ return(1);
+ } else {
+ os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
+ return(-1);
+ }
+#else
+ struct dirent* ent;
+ char* full_path;
+ int ret;
+ struct stat statinfo;
+#ifdef HAVE_READDIR_R
+ char dirent_buf[sizeof(struct dirent)
+ + _POSIX_PATH_MAX + 100];
+ /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
+ the max file name len; but in most standards, the
+ length is NAME_MAX; we add 100 to be even safer */
+#endif
+
+next_file:
+
+#ifdef HAVE_READDIR_R
+ ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
+
+ if (ret != 0
+#ifdef UNIV_AIX
+ /* On AIX, a failed readdir_r() call is indicated only by a
+ non-NULL 'ent' (result) value together with a non-zero 'ret'
+ (return) value. A NULL 'ent' with a non-zero 'ret'
+ indicates that the "end of the directory" is reached. */
+ && ent != NULL
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: cannot read directory %s, error %lu\n",
+ dirname, (ulong) ret);
+
+ return(-1);
+ }
+
+ if (ent == NULL) {
+ /* End of directory */
+
+ return(1);
+ }
+
+ ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
+#else
+ ent = readdir(dir);
+
+ if (ent == NULL) {
+
+ return(1);
+ }
+#endif
+ ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
+
+ if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
+
+ goto next_file;
+ }
+
+ strcpy(info->name, ent->d_name);
+
+ full_path = static_cast<char*>(
+ ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
+
+ sprintf(full_path, "%s/%s", dirname, ent->d_name);
+
+ ret = stat(full_path, &statinfo);
+
+ if (ret) {
+
+ if (errno == ENOENT) {
+ /* readdir() returned a file that does not exist,
+ it must have been deleted in the meantime. Do what
+ would have happened if the file was deleted before
+ readdir() - ignore and go to the next entry.
+ If this is the last entry then info->name will still
+ contain the name of the deleted file when this
+ function returns, but this is not an issue since the
+ caller shouldn't be looking at info when end of
+ directory is returned. */
+
+ ut_free(full_path);
+
+ goto next_file;
+ }
+
+ os_file_handle_error_no_exit(full_path, "stat", FALSE);
+
+ ut_free(full_path);
+
+ return(-1);
+ }
+
+ info->size = (ib_int64_t) statinfo.st_size;
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_FILE;
+ } else {
+ info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ ut_free(full_path);
+
+ return(0);
+#endif
+}
+
+/*****************************************************************//**
+Creates the directory 'pathname' with default permissions; on Unix the
+mode handed to mkdir() is 0770, i.e. the result is (0770 & ~umask).
+A directory that is already there counts as success, unless
+fail_if_exists is set. Every other failure is reported through
+os_file_handle_error_no_exit() and FALSE is returned.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+	const char*	pathname,	/*!< in: directory name as
+					null-terminated string */
+	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
+					is treated as an error. */
+{
+#ifdef __WIN__
+	if (CreateDirectory((LPCTSTR) pathname, NULL) != 0) {
+
+		return(TRUE);
+	}
+
+	/* The call failed: only a tolerated "already exists" is still
+	a success. */
+	if (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error_no_exit(pathname, "CreateDirectory", FALSE);
+
+	return(FALSE);
+#else
+	WAIT_ALLOW_WRITES();
+
+	if (mkdir(pathname, 0770) == 0) {
+
+		return(TRUE);
+	}
+
+	/* The call failed: only a tolerated "already exists" is still
+	a success. */
+	if (errno == EEXIST && !fail_if_exists) {
+
+		return (TRUE);
+	}
+
+	os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
+
+	return(FALSE);
+#endif /* __WIN__ */
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+Accepted create modes: OS_FILE_OPEN opens an existing file;
+OS_FILE_CREATE creates a new file and fails if it already exists
+(CREATE_NEW on Windows, O_CREAT | O_EXCL elsewhere); OS_FILE_CREATE_PATH
+first calls os_file_create_subdirs_if_needed() and then proceeds like
+OS_FILE_CREATE. The OS_FILE_ON_ERROR_SILENT and OS_FILE_ON_ERROR_NO_EXIT
+bits must not be set (asserted with ut_a).
+When srv_read_only_mode is set, the file is always opened as an existing
+file with read-only access, whatever create_mode and access_type say.
+A failed open/create is passed to os_file_handle_error(), and the
+operation is repeated for as long as that function asks for a retry.
+Outside Windows, when USE_FILE_LOCK is defined and the server is not in
+read-only mode, a file opened with OS_FILE_READ_WRITE is also locked with
+os_file_lock(); if locking fails the descriptor is closed and the call
+fails.
+An unknown create mode (on Windows also an unknown access type) or a
+failure to create the subdirectories is logged and (os_file_t) -1 is
+returned with *success left FALSE.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_func(
+/*=======================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ os_file_t file;
+ ibool retry;
+
+ *success = FALSE; /* the early returns below leave it FALSE */
+#ifdef __WIN__
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ ut_a(!srv_read_only_mode);
+
+ /* Create subdirs along the path if needed */
+ *success = os_file_create_subdirs_if_needed(name);
+
+ if (!*success) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create subdirectories '%s'",
+ name);
+
+ return((os_file_t) -1);
+ }
+
+ create_flag = CREATE_NEW;
+ create_mode = OS_FILE_CREATE;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (srv_read_only_mode) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "read only mode set. Unable to "
+ "open file '%s' in RW mode, trying RO mode", name);
+
+ access = GENERIC_READ;
+
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file access type (%lu) for file '%s'",
+ access_type, name);
+
+ return((os_file_t) -1);
+ }
+
+ do {
+ /* Use default security attributes and no template file. */
+
+ file = CreateFile(
+ (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
+ create_flag, attributes, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+
+ *success = FALSE;
+
+ retry = os_file_handle_error(
+ name, create_mode == OS_FILE_OPEN ?
+ "open" : "create");
+
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+#else /* __WIN__ */
+ int create_flag;
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ create_flag = O_RDONLY;
+ } else if (srv_read_only_mode) {
+ create_flag = O_RDONLY;
+ } else {
+ create_flag = O_RDWR;
+ }
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ /* Create subdirs along the path if needed */
+
+ *success = os_file_create_subdirs_if_needed(name);
+
+ if (!*success) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create subdirectories '%s'",
+ name);
+
+ return((os_file_t) -1);
+ }
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ create_mode = OS_FILE_CREATE;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ do {
- file = ::open(name, create_flag, os_innodb_umask);
++ file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file == -1) {
+ *success = FALSE;
+
+ retry = os_file_handle_error(
+ name,
+ create_mode == OS_FILE_OPEN
+ ? "open" : "create");
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+#ifdef USE_FILE_LOCK
+ if (!srv_read_only_mode
+ && *success
+ && access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) { /* non-zero result is treated
+ as a locking failure */
+
+ *success = FALSE;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+Unlike os_file_create_simple_func(), a failed open is neither passed to
+os_file_handle_error() nor retried here: the failure is only visible in
+*success and in the returned handle.
+Accepted create modes are OS_FILE_OPEN and OS_FILE_CREATE (create a new
+file, fail if it exists); any other mode is logged and rejected, unless
+srv_read_only_mode is set, in which case the file is simply opened as an
+existing file with read-only access. The OS_FILE_ON_ERROR_SILENT and
+OS_FILE_ON_ERROR_NO_EXIT bits must not be set (asserted with ut_a).
+On Windows, OS_FILE_READ_ALLOW_DELETE opens for reading and additionally
+shares the file for delete and write; elsewhere it is opened like
+OS_FILE_READ_WRITE (O_RDWR).
+Outside Windows and outside read-only mode: after a successful open, OS
+caching is switched off with os_file_set_nocache() when the flush method
+is SRV_UNIX_O_DIRECT or SRV_UNIX_O_DIRECT_NO_FSYNC, and, if USE_FILE_LOCK
+is defined, a file opened with OS_FILE_READ_WRITE is locked with
+os_file_lock(); a failed lock closes the descriptor and fails the call.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+/*=========================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ pfs_os_file_t file;
+
+ *success = FALSE; /* the early returns below leave it FALSE */
+#ifdef __WIN__
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+ DWORD share_mode = FILE_SHARE_READ;
+ ut_a(name);
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+ create_flag = OPEN_EXISTING;
+ } else if (srv_read_only_mode) {
+ create_flag = OPEN_EXISTING;
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (srv_read_only_mode) {
+ access = GENERIC_READ;
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+
+ ut_a(!srv_read_only_mode);
+
+ access = GENERIC_READ;
+
+ /*!< A backup program has to give mysqld the maximum
+ freedom to do what it likes with the file */
+
+ share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file access type (%lu) for file '%s'",
+ access_type, name);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+ file.m_file = CreateFile((LPCTSTR) name,
+ access,
+ share_mode,
+ NULL, // Security attributes
+ create_flag,
+ attributes,
+ NULL); // No template file
+
+ *success = (file.m_file != INVALID_HANDLE_VALUE);
+#else /* __WIN__ */
+ int create_flag;
+ const char* mode_str = NULL; /* operation name handed to
+ os_file_set_nocache() for its diagnostics */
+ ut_a(name);
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ mode_str = "OPEN";
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ create_flag = O_RDONLY;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else {
+
+ ut_a(access_type == OS_FILE_READ_WRITE
+ || access_type == OS_FILE_READ_ALLOW_DELETE);
+
+ create_flag = O_RDWR;
+ }
+
+ } else if (srv_read_only_mode) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+ file.m_file = -1;
+ return(file);
+ }
+
- file.m_file = ::open(name, create_flag, os_innodb_umask);
++ file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ *success = file.m_file == -1 ? FALSE : TRUE;
+
+ /* This function is always called for data files, we should disable
+ OS caching (O_DIRECT) here as we do in os_file_create_func(), so
+ we open the same file in the same mode, see man page of open(2). */
+ if (!srv_read_only_mode
+ && *success
+ && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
+ || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
+
+ os_file_set_nocache(file.m_file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (!srv_read_only_mode
+ && *success
+ && access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file.m_file, name)) { /* non-zero result is
+ treated as a locking failure */
+
+ *success = FALSE;
+ close(file.m_file);
+ file.m_file = -1;
+
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor.
+On Solaris builds that have DIRECTIO_ON this calls directio(); otherwise,
+where O_DIRECT is defined, it calls fcntl(fd, F_SETFL, O_DIRECT); when
+neither is available the function does nothing.
+A failure is only logged (error level on the Solaris path, warning level
+on the fcntl path) and never reported to the caller. On the fcntl path a
+failure with EINVAL is logged at most once, guarded by a function-local
+static flag; any other errno is logged on every call.
+NOTE(review): F_SETFL is given O_DIRECT alone rather than the current
+status flags with O_DIRECT added - confirm that no caller depends on
+other status flags surviving this call. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+ int fd /*!< in: file descriptor to alter */
+ MY_ATTRIBUTE((unused)),
+ const char* file_name /*!< in: used in the diagnostic
+ message */
+ MY_ATTRIBUTE((unused)),
+ const char* operation_name MY_ATTRIBUTE((unused)))
+ /*!< in: "open" or "create"; used
+ in the diagnostic message */
+{
+ /* some versions of Solaris may not have DIRECTIO_ON */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+ if (directio(fd, DIRECTIO_ON) == -1) {
+ int errno_save = errno;
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Failed to set DIRECTIO_ON on file %s: %s: %s, "
+ "continuing anyway.",
+ file_name, operation_name, strerror(errno_save));
+ }
+#elif defined(O_DIRECT)
+ if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
+ int errno_save = errno;
+ static bool warning_message_printed = false;
+ if (errno_save == EINVAL) {
+ if (!warning_message_printed) {
+ warning_message_printed = true;
+# ifdef UNIV_LINUX
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to set O_DIRECT on file "
+ "%s: %s: %s, continuing anyway. "
+ "O_DIRECT is known to result "
+ "in 'Invalid argument' on Linux on "
+ "tmpfs, see MySQL Bug#26662.",
+ file_name, operation_name,
+ strerror(errno_save));
+# else /* UNIV_LINUX */
+ goto short_warning; /* no Linux-specific hint to
+ add: print the generic warning below */
+# endif /* UNIV_LINUX */
+ }
+ } else {
+# ifndef UNIV_LINUX
+short_warning:
+# endif
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to set O_DIRECT on file %s: %s: %s, "
+ "continuing anyway.",
+ file_name, operation_name, strerror(errno_save));
+ }
+ }
+#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new.
+The OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT bits are taken
+out of create_mode first; OS_FILE_ON_ERROR_NO_EXIT selects
+os_file_handle_error_no_exit() (called with the silent setting) instead
+of os_file_handle_error() as the handler for a failed open. The open is
+repeated for as long as the handler asks for a retry.
+The remaining create mode is one of OS_FILE_OPEN, OS_FILE_OPEN_RAW,
+OS_FILE_OPEN_RETRY (all three open an existing file), OS_FILE_CREATE
+(create a new file, fail if it exists) or OS_FILE_OVERWRITE (create or
+truncate). When srv_read_only_mode is set the file is only ever opened,
+with read-only access. An unknown mode is logged and rejected.
+Outside Windows and outside read-only mode: O_SYNC is added for log files
+when the flush method is SRV_UNIX_O_DSYNC (if O_SYNC is defined); OS
+caching is switched off with os_file_set_nocache() for non-log files when
+the flush method is SRV_UNIX_O_DIRECT or SRV_UNIX_O_DIRECT_NO_FSYNC; and,
+if USE_FILE_LOCK is defined, the file is locked with os_file_lock()
+unless the mode is OS_FILE_OPEN_RAW. With OS_FILE_OPEN_RETRY a failed
+lock is retried up to 100 times at one-second intervals; when locking
+finally fails the descriptor is closed and the call fails.
+The DBUG_EXECUTE_IF keyword "ib_create_table_fail_disk_full" makes the
+call fail at once with errno ENOSPC (ERROR_DISK_FULL on Windows).
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+pfs_os_file_t
+os_file_create_func(
+/*================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ pfs_os_file_t file;
+ ibool retry;
+ ibool on_error_no_exit;
+ ibool on_error_silent;
+#ifdef __WIN__
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = FALSE;
+ SetLastError(ERROR_DISK_FULL);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ );
+#else /* __WIN__ */
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = FALSE;
+ errno = ENOSPC;
+ file.m_file = -1;
+ return(file);
+ );
+#endif /* __WIN__ */
+
+#ifdef __WIN__
+ DWORD create_flag;
+ DWORD share_mode = FILE_SHARE_READ;
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? TRUE : FALSE;
+
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? TRUE : FALSE;
+
+ create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+ create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
+ if (create_mode == OS_FILE_OPEN_RAW) {
+
+ ut_a(!srv_read_only_mode);
+
+ create_flag = OPEN_EXISTING;
+
+ /* On Windows Physical devices require admin privileges and
+ have to have the write-share mode set. See the remarks
+ section for the CreateFile() function documentation in MSDN. */
+
+ share_mode |= FILE_SHARE_WRITE;
+
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ create_flag = CREATE_ALWAYS;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+ DWORD attributes = 0;
+
+#ifdef UNIV_HOTBACKUP
+ attributes |= FILE_FLAG_NO_BUFFERING;
+#else
+ if (purpose == OS_FILE_AIO) {
+
+#ifdef WIN_ASYNC_IO
+ /* If specified, use asynchronous (overlapped) io and no
+ buffering of writes in the OS */
+
+ if (srv_use_native_aio) {
+ attributes |= FILE_FLAG_OVERLAPPED;
+ }
+#endif /* WIN_ASYNC_IO */
+
+ } else if (purpose == OS_FILE_NORMAL) {
+ /* Use default setting. */
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown purpose flag (%lu) while opening file '%s'",
+ purpose, name);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+#ifdef UNIV_NON_BUFFERED_IO
+ // TODO: Create a bug, this looks wrong. The flush log
+ // parameter is dynamic.
+ if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+
+ /* Do not use unbuffered i/o for the log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+
+ } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
+
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+#endif /* UNIV_NON_BUFFERED_IO */
+
+#endif /* UNIV_HOTBACKUP */
+ DWORD access = GENERIC_READ;
+
+ if (!srv_read_only_mode) {
+ access |= GENERIC_WRITE;
+ }
+
+ do {
+ /* Use default security attributes and no template file. */
+ file.m_file = CreateFile(
+ (LPCTSTR) name, access, share_mode, NULL,
+ create_flag, attributes, NULL);
+
+ if (file.m_file == INVALID_HANDLE_VALUE) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !srv_read_only_mode)
+ ? "create" : "open";
+
+ *success = FALSE;
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = TRUE;
+ retry = FALSE;
+ }
+
+ } while (retry);
+
+#else /* __WIN__ */
+ int create_flag;
+ const char* mode_str = NULL; /* operation name handed to
+ os_file_set_nocache() for its diagnostics */
+ /* NOTE(review): this test runs before the OS_FILE_ON_ERROR_* bits
+ are masked out below, so OS_FILE_OPEN combined with such a bit also
+ reaches WAIT_ALLOW_WRITES() - confirm that this is intended. */
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? TRUE : FALSE;
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? TRUE : FALSE;
+
+ create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+ create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
+ if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ mode_str = "OPEN";
+
+ create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
+
+ } else if (srv_read_only_mode) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ mode_str = "OVERWRITE";
+ create_flag = O_RDWR | O_CREAT | O_TRUNC;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ file.m_file = -1;
+ return(file);
+ }
+
+ ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
+ ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
+
+#ifdef O_SYNC
+ /* We let O_SYNC only affect log files; note that we map O_DSYNC to
+ O_SYNC because the datasync options seemed to corrupt files in 2001
+ in both Linux and Solaris */
+
+ if (!srv_read_only_mode
+ && type == OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+
+ create_flag |= O_SYNC;
+ }
+#endif /* O_SYNC */
+
+ do {
- file.m_file = ::open(name, create_flag, os_innodb_umask);
++ file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file.m_file == -1) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !srv_read_only_mode)
+ ? "create" : "open";
+
+ *success = FALSE;
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+ /* We disable OS caching (O_DIRECT) only on data files */
+
+ if (!srv_read_only_mode
+ && *success
+ && type != OS_LOG_FILE
+ && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
+ || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
+
+ os_file_set_nocache(file.m_file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (!srv_read_only_mode
+ && *success
+ && create_mode != OS_FILE_OPEN_RAW
+ && os_file_lock(file.m_file, name)) { /* non-zero result is
+ treated as a locking failure */
+
+ if (create_mode == OS_FILE_OPEN_RETRY) {
+
+ ut_a(!srv_read_only_mode);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Retrying to lock the first data file");
+
+ for (int i = 0; i < 100; i++) {
+ os_thread_sleep(1000000);
+
+ if (!os_file_lock(file.m_file, name)) {
+ *success = TRUE;
+ return(file);
+ }
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Unable to open the first data file");
+ }
+
+ *success = FALSE;
+ close(file.m_file);
+ file.m_file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/***********************************************************************//**
+Removes the file 'name'; a file that is not there is not an error. The
+file has to be closed before calling this.
+On Windows a failed delete is retried every 0.5 seconds; a warning is
+logged on every 10th failure once there have been more than 100, and the
+function gives up after more than 2000 failures.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_if_exists_func(
+/*==========================*/
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
+{
+#ifdef __WIN__
+	for (ulint n_failed = 1; ; n_failed++) {
+		/* In Windows, deleting an .ibd file may fail if mysqlbackup
+		is copying it */
+
+		if (DeleteFile((LPCTSTR) name)) {
+
+			return(true);
+		}
+
+		const DWORD	err = GetLastError();
+
+		if (err == ERROR_FILE_NOT_FOUND
+		    || err == ERROR_PATH_NOT_FOUND) {
+			/* nothing to delete: that is fine here */
+
+			return(true);
+		}
+
+		if (n_failed > 100 && n_failed % 10 == 0) {
+			os_file_get_last_error(true); /* print error information */
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Delete of file %s failed.", name);
+		}
+
+		os_thread_sleep(500000);	/* sleep for 0.5 second */
+
+		if (n_failed > 2000) {
+
+			return(false);
+		}
+	}
+#else
+	WAIT_ALLOW_WRITES();
+
+	if (unlink(name) == 0 || errno == ENOENT) {
+		/* deleted, or there was nothing to delete */
+
+		return(true);
+	}
+
+	os_file_handle_error_no_exit(name, "delete", FALSE);
+
+	return(false);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Removes the file 'name'. The file has to be closed before calling this.
+On Windows a missing file makes the call fail at once; any other failure
+is retried once per second, a warning is printed on every 10th failure
+once there have been more than 100, and the function gives up after more
+than 2000 failures.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_func(
+/*================*/
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
+{
+#ifdef __WIN__
+	for (ulint n_failed = 1; ; n_failed++) {
+		/* In Windows, deleting an .ibd file may fail if mysqlbackup
+		is copying it */
+
+		if (DeleteFile((LPCTSTR) name)) {
+
+			return(true);
+		}
+
+		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+			/* A missing file is classified as a 'mild' error:
+			return without retrying */
+
+			return(false);
+		}
+
+		if (n_failed > 100 && n_failed % 10 == 0) {
+			os_file_get_last_error(true); /* print error information */
+
+			fprintf(stderr,
+				"InnoDB: Warning: cannot delete file %s\n"
+				"InnoDB: Are you running mysqlbackup"
+				" to back up the file?\n", name);
+		}
+
+		os_thread_sleep(1000000);	/* sleep for a second */
+
+		if (n_failed > 2000) {
+
+			return(false);
+		}
+	}
+#else
+	WAIT_ALLOW_WRITES();
+
+	if (unlink(name) == 0) {
+
+		return(true);
+	}
+
+	os_file_handle_error_no_exit(name, "delete", FALSE);
+
+	return(false);
+#endif
+}
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_rename(), not directly this function!
+Gives the file 'oldpath' the name 'newpath'; the new name may lie in another
+directory. It is safest that the file is closed before calling this function.
+With UNIV_DEBUG defined it is asserted that 'newpath' does not exist yet and
+that 'oldpath' does. A failed rename is reported through
+os_file_handle_error_no_exit().
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename_func(
+/*================*/
+	const char*	oldpath,/*!< in: old file path as a null-terminated
+				string */
+	const char*	newpath)/*!< in: new file path */
+{
+#ifdef UNIV_DEBUG
+	os_file_type_t	file_type;
+	ibool		found;
+
+	/* The target name must still be free ... */
+	ut_ad(os_file_status(newpath, &found, &file_type));
+	ut_ad(!found);
+
+	/* ... and the source must be there. */
+	ut_ad(os_file_status(oldpath, &found, &file_type));
+	ut_ad(found);
+#endif /* UNIV_DEBUG */
+
+#ifdef __WIN__
+	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
+
+		return(TRUE);
+	}
+#else
+	WAIT_ALLOW_WRITES();
+
+	if (rename(oldpath, newpath) == 0) {
+
+		return(TRUE);
+	}
+#endif /* __WIN__ */
+
+	/* Both platforms report the failure in the same way. */
+	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+
+	return(FALSE);
+}
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_close(), not directly this function!
+Closes the file handle 'file'. A failure is passed to os_file_handle_error();
+the error number can afterwards be retrieved with os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_func(
+/*===============*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	const bool	closed = CloseHandle(file) != 0;
+#else
+	const bool	closed = close(file) != -1;
+#endif /* __WIN__ */
+
+	if (closed) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error(NULL, "close");
+
+	return(FALSE);
+}
+
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes the file handle 'file' without reporting a failure anywhere: the
+outcome is only visible in the return value.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	return(CloseHandle(file) ? TRUE : FALSE);
+#else
+	return(close(file) == -1 ? FALSE : TRUE);
+#endif /* __WIN__ */
+}
+#endif /* UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the size of an open file. On Windows the size is assembled from
+the two 32-bit halves reported by GetFileSize(); elsewhere it is the
+offset returned by lseek() to the end of the file, so the file position
+of the descriptor is moved there.
+@return file size, or (os_offset_t) -1 on failure */
+UNIV_INTERN
+os_offset_t
+os_file_get_size(
+/*=============*/
+	pfs_os_file_t	file)	/*!< in: handle to a file */
+{
+#ifdef __WIN__
+	DWORD		size_high;
+	const DWORD	size_low = GetFileSize(file.m_file, &size_high);
+
+	/* 0xFFFFFFFF is also a legal low half, so the last error decides
+	whether the call failed. */
+	if (size_low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+		return((os_offset_t) -1);
+	}
+
+	return(((os_offset_t) size_high << 32) | (os_offset_t) size_low);
+#else
+	const off_t	end_pos = lseek(file.m_file, 0, SEEK_END);
+
+	return((os_offset_t) end_pos);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Extends a newly created file to 'size' bytes.
+If HAVE_POSIX_FALLOCATE is defined and srv_use_posix_fallocate is set, the
+space is reserved with posix_fallocate(), which is repeated while it fails
+with EINTR and srv_shutdown_state is SRV_SHUTDOWN_NONE; the result of that
+call is returned directly.
+Otherwise zero-filled buffers are written with os_file_write(): on Windows
+one page ending at offset 'size', elsewhere the whole range from offset 0
+in chunks of up to 64 pages. The writes are followed by os_file_flush().
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	pfs_os_file_t	file,	/*!< in: handle to a file */
+	os_offset_t	size)	/*!< in: file size */
+{
+	ibool	ret;
+	byte*	buf;
+	byte*	buf2;
+	ulint	buf_size;
+
+#ifdef HAVE_POSIX_FALLOCATE
+	if (srv_use_posix_fallocate) {
+		int	err;
+		do {
+			err = posix_fallocate(file.m_file, 0, size);
+		} while (err == EINTR
+			 && srv_shutdown_state == SRV_SHUTDOWN_NONE);
+
+		if (err) {
+			/* The literals are concatenated by the compiler, so
+			the separating space has to be part of one of them. */
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"preallocating " INT64PF " bytes for "
+				"file %s failed with error %d",
+				size, name, err);
+		}
+		return(!err);
+	}
+#endif
+
+#ifdef _WIN32
+	/* Write 1 page of zeroes at the desired end. */
+	/* NOTE(review): presumably size >= UNIV_PAGE_SIZE always holds
+	here; otherwise the subtraction below would not give a valid
+	offset - confirm against the callers. */
+	buf_size = UNIV_PAGE_SIZE;
+	os_offset_t	current_size = size - buf_size;
+#else
+	/* Write up to 1 megabyte at a time. */
+	/* NOTE(review): for size < UNIV_PAGE_SIZE buf_size becomes 0 and
+	the loop below would not advance current_size - presumably callers
+	always pass at least one page; confirm. */
+	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
+		* UNIV_PAGE_SIZE;
+	os_offset_t	current_size = 0;
+#endif
+	/* One extra page so that an aligned window of buf_size bytes fits. */
+	buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE));
+
+	if (!buf2) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot allocate " ULINTPF " bytes to extend file\n",
+			buf_size + UNIV_PAGE_SIZE);
+		return(FALSE);
+	}
+
+	/* Align the buffer for possible raw i/o */
+	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
+
+	do {
+		ulint	n_bytes;
+
+		/* The last chunk may be shorter than the buffer. */
+		if (size - current_size < (os_offset_t) buf_size) {
+			n_bytes = (ulint) (size - current_size);
+		} else {
+			n_bytes = buf_size;
+		}
+
+		ret = os_file_write(name, file, buf, current_size, n_bytes);
+		if (!ret) {
+			break;
+		}
+
+		current_size += n_bytes;
+	} while (current_size < size);
+
+	free(buf2);
+
+	/* The flush is attempted only if every write succeeded. */
+	return(ret && os_file_flush(file));
+}
+
+/***********************************************************************//**
+Cuts the file off at the current position of the stream 'file'.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+	FILE*	file)	/*!< in: file to be truncated */
+{
+#ifdef __WIN__
+	const HANDLE	handle = (HANDLE) _get_osfhandle(fileno(file));
+
+	return(SetEndOfFile(handle));
+#else /* __WIN__ */
+	WAIT_ALLOW_WRITES();
+
+	/* ftruncate() returns 0 on success */
+	return(ftruncate(fileno(file), ftell(file)) == 0);
+#endif /* __WIN__ */
+}
+
+#ifndef __WIN__
+/***********************************************************************//**
+Wrapper to fsync(2) that repeats the call for as long as it fails with
+ENOLCK, sleeping 0.2 seconds between attempts and printing a message on
+the first and then on every 100th such failure. Every attempt is counted
+in os_n_fsyncs. Any other outcome is handed back unchanged: 0 if
+successful, otherwise -1 with errno set to indicate the error.
+@return 0 if success, -1 otherwise */
+
+static
+int
+os_file_fsync(
+/*==========*/
+	os_file_t	file)	/*!< in: handle to a file */
+{
+	for (int n_enolck = 0; ; n_enolck++) {
+		const int	rc = fsync(file);
+
+		os_n_fsyncs++;
+
+		if (rc != -1 || errno != ENOLCK) {
+			/* success, or an error that is not retried */
+
+			return(rc);
+		}
+
+		if (n_enolck % 100 == 0) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: fsync(): "
+				"No locks available; retrying\n");
+		}
+
+		os_thread_sleep(200000 /* 0.2 sec */);
+	}
+}
+#endif /* !__WIN__ */
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_flush(), not directly this function!
+Flushes the write buffers of a given file to the disk.
+Windows uses FlushFileBuffers(). With HAVE_DARWIN_THREADS defined,
+fcntl(F_FULLFSYNC) is tried first when srv_have_fullfsync is set, falling
+back to os_file_fsync() if it fails or is not available. All other builds
+call os_file_fsync().
+A failed flush is still reported as success when srv_start_raw_disk_in_use
+is set and the error is the one a raw device is expected to return
+(ERROR_INVALID_FUNCTION on Windows, EINVAL elsewhere). Every other failure
+is passed to os_file_handle_error() and then treated as fatal via
+ut_error; the return(FALSE) that follows is presumably never reached.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush_func(
+/*===============*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ os_n_fsyncs++;
+
+ ret = FlushFileBuffers(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+ actually a raw device, we choose to ignore that error if we are using
+ raw disks */
+
+ if (srv_start_raw_disk_in_use && GetLastError()
+ == ERROR_INVALID_FUNCTION) {
+ return(TRUE);
+ }
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(FALSE);
+#else
+ int ret;
+ WAIT_ALLOW_WRITES();
+
+#if defined(HAVE_DARWIN_THREADS)
+# ifndef F_FULLFSYNC
+ /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
+# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
+# elif F_FULLFSYNC != 51
+# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
+# endif
+ /* Apple has disabled fsync() for internal disk drives in OS X. That
+ caused corruption for a user when he tested a power outage. Let us in
+ OS X use a nonstandard flush method recommended by an Apple
+ engineer. */
+
+ if (!srv_have_fullfsync) {
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+ ret = os_file_fsync(file);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+ ret = os_file_fsync(file);
+ }
+ }
+#else
+ ret = os_file_fsync(file);
+#endif
+
+ if (ret == 0) {
+ return(TRUE);
+ }
+
+ /* Since Linux returns EINVAL if the 'file' is actually a raw device,
+ we choose to ignore that error if we are using raw disks */
+
+ if (srv_start_raw_disk_in_use && errno == EINVAL) {
+
+ return(TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(FALSE);
+#endif
+}
+
+#ifndef __WIN__
+/*******************************************************************//**
+Does a synchronous read operation in Posix.
+Increments os_n_file_reads and brackets the read with an
+increment/decrement of the MONITOR_OS_PENDING_READS counter (subject to
+whether that monitor is on when the call starts).
+With HAVE_PREAD the read is a single pread() call. Without it the read is
+an lseek() followed by read(); unless UNIV_HOTBACKUP is defined, that pair
+is protected by one of the os_file_seek_mutexes, selected by
+file % OS_FILE_N_SEEK_MUTEXES.
+The result of pread()/read() is returned as is, so it can be smaller than
+n; a failed lseek() yields -1.
+@return number of bytes read, -1 if error */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ssize_t
+os_file_pread(
+/*==========*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint n, /*!< in: number of bytes to read */
+ os_offset_t offset) /*!< in: file offset from where to read */
+{
+ off_t offs;
+
+ ut_ad(n);
+
+ /* If off_t is > 4 bytes in size, then we assume we can pass a
+ 64-bit address */
+ offs = (off_t) offset;
+
+ /* NOTE(review): an offset that does not survive the conversion to a
+ 4-byte off_t is only logged; the read below still goes ahead with
+ the converted value - confirm that this is intended. */
+ if (sizeof(off_t) <= 4) {
+ if (offset != (os_offset_t) offs) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "File read at offset > 4 GB");
+ }
+ }
+
+ os_n_file_reads++;
+
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+#ifdef HAVE_PREAD
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+ ssize_t n_bytes = pread(file, buf, n, offs);
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+ return(n_bytes);
+#else
+ {
+ off_t ret_offset;
+ ssize_t ret;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret_offset = lseek(file, offs, SEEK_SET);
+
+ if (ret_offset < 0) {
+ ret = -1;
+ } else {
+ ret = read(file, buf, (ssize_t) n);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+ return(ret);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Does a synchronous write operation in Posix.
+Debug-asserts (ut_ad) that n is non-zero and that srv_read_only_mode is
+not set. Every call increments os_n_file_writes. With HAVE_PWRITE the write
+is one pwrite() call; without it the function does lseek() followed by
+write(), and (unless UNIV_HOTBACKUP is defined) holds one of the
+os_file_seek_mutexes, chosen by the handle value modulo
+OS_FILE_N_SEEK_MUTEXES, around that pair.
+The function itself does not retry and does not check for short writes.
+NOTE(review): if off_t is 4 bytes or less and the offset does not survive
+the cast, an error is logged but the write still proceeds with the
+truncated offset.
+@return number of bytes written, -1 if error */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ssize_t
+os_file_pwrite(
+/*===========*/
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from where to write */
+ ulint n, /*!< in: number of bytes to write */
+ os_offset_t offset) /*!< in: file offset where to write */
+{
+ ssize_t ret;
+ off_t offs;
+
+ ut_ad(n);
+ ut_ad(!srv_read_only_mode);
+
+ /* If off_t is > 4 bytes in size, then we assume we can pass a
+ 64-bit address */
+ offs = (off_t) offset;
+
+ if (sizeof(off_t) <= 4) {
+ if (offset != (os_offset_t) offs) { /* the cast above lost information */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "File write at offset > 4 GB.");
+ }
+ }
+
+ os_n_file_writes++;
+
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); /* sampled
+ once; the same value is passed to the INC and DEC below */
+#ifdef HAVE_PWRITE
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+ ret = pwrite(file, buf, (ssize_t) n, offs);
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+ return(ret);
+#else
+ {
+ off_t ret_offset;
+# ifndef UNIV_HOTBACKUP
+ ulint i;
+# endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+# ifndef UNIV_HOTBACKUP
+ /* Protect the seek / write operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+ ret_offset = lseek(file, offs, SEEK_SET);
+
+ if (ret_offset < 0) {
+ ret = -1; /* seek failed: report an error without writing */
+
+ goto func_exit;
+ }
+
+ ret = write(file, buf, (ssize_t) n);
+
+func_exit: /* common exit: release the seek mutex and the pending counter */
+# ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+ return(ret);
+ }
+#endif /* HAVE_PWRITE */
+}
+#endif
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous positioned read operation.
+A read counts as successful only when exactly n bytes were transferred.
+On any other outcome os_file_handle_error(NULL, "read") is consulted: if it
+asks for a retry the whole read is repeated from try_again, otherwise a
+fatal message is written to stderr and ut_error is invoked.
+On Windows the seek (SetFilePointer) and ReadFile are done under one of the
+os_file_seek_mutexes unless UNIV_HOTBACKUP is defined; elsewhere the read is
+delegated to os_file_pread().
+os_bytes_read_since_printout is increased by n once per call, before the
+retry label, on both platforms.
+NOTE(review): the final return(FALSE) is presumably unreachable because
+ut_error does not return; confirm against the ut_error definition.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_func(
+/*==============*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n) /*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n); /* only n is range-checked here; offset
+ is split into two 32-bit halves below */
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+try_again:
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = (DWORD) offset & 0xFFFFFFFF; /* low 32 bits of the offset */
+ high = (DWORD) (offset >> 32); /* high 32 bits, passed by pointer to
+ SetFilePointer */
+
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(
+ file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { /* 0xFFFFFFFF
+ alone is treated as ambiguous, so the last error is checked as well */
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+ if (ret && len == n) { /* a short read falls through to error handling */
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset);
+
+ if ((ulint) ret == n) {
+ return(TRUE);
+ } else if (ret == -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in system call pread(). The operating"
+ " system error number is %lu.",(ulint) errno);
+ } else {
+ /* Partial read occurred */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tried to read " ULINTPF " bytes at offset "
+ UINT64PF ". Was only able to read %ld.",
+ n, offset, (lint) ret);
+ }
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+ retry = os_file_handle_error(NULL, "read"); /* shared by both platforms */
+
+ if (retry) {
+ goto try_again;
+ }
+
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot read from file."
+ " OS error number %lu.\n",
+#ifdef __WIN__
+ (ulong) GetLastError()
+#else
+ (ulong) errno
+#endif /* __WIN__ */
+ );
+ fflush(stderr);
+
+ ut_error;
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+A read counts as successful only when exactly n bytes were transferred.
+On any other outcome os_file_handle_error_no_exit(NULL, "read", FALSE) is
+consulted: if it asks for a retry the whole read is repeated from
+try_again, otherwise FALSE is returned to the caller.
+NOTE(review): on non-Windows builds a failed or partial read is still
+reported through ib_logf at error level before the handler is called.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling_func(
+/*================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n) /*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n); /* only n is range-checked here; offset
+ is split into two 32-bit halves below */
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+try_again:
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = (DWORD) offset & 0xFFFFFFFF; /* low 32 bits of the offset */
+ high = (DWORD) (offset >> 32); /* high 32 bits, passed by pointer to
+ SetFilePointer */
+
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(
+ file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { /* 0xFFFFFFFF
+ alone is treated as ambiguous, so the last error is checked as well */
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+ if (ret && len == n) { /* a short read falls through to error handling */
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset);
+
+ if ((ulint) ret == n) {
+ return(TRUE);
+ } else if (ret == -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in system call pread(). The operating"
+ " system error number is %lu.",(ulint) errno);
+ } else {
+ /* Partial read occurred */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tried to read " ULINTPF " bytes at offset "
+ UINT64PF ". Was only able to read %ld.",
+ n, offset, (lint) ret);
+ }
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+ retry = os_file_handle_error_no_exit(NULL, "read", FALSE); /* shared by
+ both platforms */
+
+ if (retry) {
+ goto try_again;
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+When size is 0 the function returns at once and neither the file position
+nor str is touched. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+ FILE* file, /*!< in: file to read from */
+ char* str, /*!< in: buffer where to read */
+ ulint size) /*!< in: size of buffer */
+{
+ size_t flen;
+
+ if (size == 0) {
+ return;
+ }
+
+ rewind(file);
+ flen = fread(str, 1, size - 1, file); /* flen is at most size - 1 */
+ str[flen] = '\0'; /* so this index is always inside the buffer */
+}
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly
+this function!
+Requests a synchronous write operation.
+A write counts as successful only when exactly n bytes were written.
+Windows: the seek (SetFilePointer) and WriteFile run under one of the
+os_file_seek_mutexes unless UNIV_HOTBACKUP is defined. A failure with
+ERROR_LOCK_VIOLATION is retried up to 100 times with
+os_thread_sleep(1000000) between attempts. A seek failure prints its own
+message and returns FALSE without touching os_has_said_disk_full.
+Other platforms: WAIT_ALLOW_WRITES() is invoked and then os_file_pwrite()
+is called once; there is no retry.
+On both platforms the detailed write-failure report goes to stderr only
+while os_has_said_disk_full is unset, and printing it sets that flag, so
+later failures return FALSE silently.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write_func(
+/*===============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ os_offset_t offset, /*!< in: file offset where to write */
+ ulint n) /*!< in: number of bytes to write */
+{
+ ut_ad(!srv_read_only_mode);
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ulint n_retries = 0;
+ ulint err;
+ DWORD saved_error = 0;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n); /* only n is range-checked here; offset
+ is split into two 32-bit halves below */
+
+ os_n_file_writes++;
+
+ ut_ad(buf);
+ ut_ad(n > 0);
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
+retry:
+ low = (DWORD) offset & 0xFFFFFFFF; /* low 32 bits of the offset */
+ high = (DWORD) (offset >> 32); /* high 32 bits, passed by pointer to
+ SetFilePointer */
+
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / write operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(
+ file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) { /* 0xFFFFFFFF
+ alone is treated as ambiguous, so the last error is checked as well */
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: File pointer positioning to"
+ " file %s failed at\n"
+ "InnoDB: offset %llu. Operating system"
+ " error number %lu.\n"
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n",
+ name, offset, (ulong) GetLastError());
+
+ return(FALSE);
+ }
+
+ ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+ if (ret && len == n) {
+
+ return(TRUE);
+ }
+
+ /* If some background file system backup tool is running, then, at
+ least in Windows 2000, we may get here a specific error. Let us
+ retry the operation 100 times, with 1 second waits. */
+
+ if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
+
+ os_thread_sleep(1000000);
+
+ n_retries++;
+
+ goto retry;
+ }
+
+ if (!os_has_said_disk_full) {
+ char *winmsg = NULL;
+
+ saved_error = GetLastError(); /* captured before any call below can
+ overwrite the thread last-error value */
+ err = (ulint) saved_error;
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: Write to file %s failed"
+ " at offset %llu.\n"
+ "InnoDB: %lu bytes should have been written,"
+ " only %lu were written.\n"
+ "InnoDB: Operating system error number %lu.\n"
+ "InnoDB: Check that your OS and file system"
+ " support files of this size.\n"
+ "InnoDB: Check also that the disk is not full"
+ " or a disk quota exceeded.\n",
+ name, offset,
+ (ulong) n, (ulong) len, (ulong) err);
+
+ /* Ask Windows to prepare a standard message for a
+ GetLastError() */
+
+ FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, saved_error,
+ MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ (LPSTR)&winmsg, 0, NULL);
+
+ if (winmsg) {
+ fprintf(stderr,
+ "InnoDB: FormatMessage: Error number %lu means '%s'.\n",
+ (ulong) saved_error, winmsg);
+ LocalFree(winmsg); /* buffer was allocated by FormatMessage */
+ }
+
+ if (strerror((int) err) != NULL) { /* NOTE(review): err holds a
+ GetLastError() code, while strerror() interprets errno values;
+ the text printed here may describe an unrelated error - confirm */
+ fprintf(stderr,
+ "InnoDB: Error number %lu means '%s'.\n",
+ (ulong) err, strerror((int) err));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n");
+
+ os_has_said_disk_full = TRUE;
+ }
+
+ return(FALSE);
+#else
+ ssize_t ret;
+ WAIT_ALLOW_WRITES();
+
+ ret = os_file_pwrite(file, buf, n, offset);
+
+ if ((ulint) ret == n) {
+
+ return(TRUE);
+ }
+
+ if (!os_has_said_disk_full) {
+ ut_print_timestamp(stderr);
+
+ if(ret == -1) { /* NOTE(review): in this case the message goes
+ through ib_logf and carries neither the file name nor the offset */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Failure of system call pwrite(). Operating"
+ " system error number is %lu.",
+ (ulint) errno);
+ } else {
+ fprintf(stderr,
+ " InnoDB: Error: Write to file %s failed"
+ " at offset " UINT64PF ".\n"
+ "InnoDB: %lu bytes should have been written,"
+ " only %ld were written.\n"
+ "InnoDB: Operating system error number %lu.\n"
+ "InnoDB: Check that your OS and file system"
+ " support files of this size.\n"
+ "InnoDB: Check also that the disk is not full"
+ " or a disk quota exceeded.\n",
+ name, offset, n, (lint) ret,
+ (ulint) errno);
+ }
+
+ if (strerror(errno) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d means '%s'.\n",
+ errno, strerror(errno));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n");
+
+ os_has_said_disk_full = TRUE;
+ }
+
+ return(FALSE);
+#endif
+}
+
+/*******************************************************************//**
+Check the existence and type of the given file.
+Uses _stat64() when __WIN__ is defined and stat() otherwise. A stat failure
+with ENOENT, ENOTDIR or ENAMETOOLONG is reported as a successful call with
+*exists set to FALSE; in that case *type is not written. Any other stat
+failure calls os_file_handle_error_no_exit(path, "stat", FALSE) and returns
+FALSE without writing either output.
+NOTE(review): the non-Windows branch tests S_ISLNK on the result of stat();
+stat() presumably follows symbolic links, so OS_FILE_TYPE_LINK may never be
+produced here - confirm whether lstat() was intended.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+ const char* path, /*!< in: pathname of the file */
+ ibool* exists, /*!< out: TRUE if file exists */
+ os_file_type_t* type) /*!< out: type of the file (if it exists) */
+{
+#ifdef __WIN__
+ int ret;
+ struct _stat64 statinfo;
+
+ ret = _stat64(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG)) {
+ /* file does not exist */
+ *exists = FALSE;
+ return(TRUE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(FALSE);
+ }
+
+ if (_S_IFDIR & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ *exists = TRUE;
+
+ return(TRUE);
+#else
+ int ret;
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG)) {
+ /* file does not exist */
+ *exists = FALSE;
+ return(TRUE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(FALSE);
+ }
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ *exists = TRUE;
+
+ return(TRUE);
+#endif
+}
+
+/*******************************************************************//**
+This function returns information about the specified file
+On success stat_info->type, ctime, atime, mtime and size are filled in from
+the stat result. stat_info->rw_perm is written only when check_rw_perm is
+true and the type was classified as OS_FILE_TYPE_FILE; otherwise it is left
+as the caller passed it.
+The permission probe opens the file read-write, or read-only when
+srv_read_only_mode is set, and closes it again immediately.
+Unlike os_file_status(), ENAMETOOLONG is not treated as "not found" here.
+@return DB_SUCCESS if all OK, DB_NOT_FOUND if stat fails with ENOENT or
+ENOTDIR, DB_FAIL on any other stat failure */
+UNIV_INTERN
+dberr_t
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info, /*!< out: information of a file in a
+ directory */
+ bool check_rw_perm) /*!< in: for testing whether the
+ file can be opened in RW mode */
+{
+ int ret;
+
+#ifdef __WIN__
+ struct _stat64 statinfo;
+
+ ret = _stat64(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(DB_FAIL);
+
+ } else if (_S_IFDIR & statinfo.st_mode) {
+ stat_info->type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+
+ DWORD access = GENERIC_READ;
+
+ if (!srv_read_only_mode) {
+ access |= GENERIC_WRITE;
+ }
+
+ stat_info->type = OS_FILE_TYPE_FILE;
+
+ /* Check if we can open it with the access computed above
+ (read-write unless srv_read_only_mode is set). */
+
+ if (check_rw_perm) {
+ HANDLE fh;
+
+ fh = CreateFile(
+ (LPCTSTR) path, // File to open
+ access,
+ 0, // No sharing
+ NULL, // Default security
+ OPEN_EXISTING, // Existing file only
+ FILE_ATTRIBUTE_NORMAL, // Normal file
+ NULL); // No attr. template
+
+ if (fh == INVALID_HANDLE_VALUE) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ CloseHandle(fh);
+ }
+ }
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+#else
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(DB_FAIL);
+
+ }
+
+ switch (statinfo.st_mode & S_IFMT) {
+ case S_IFDIR:
+ stat_info->type = OS_FILE_TYPE_DIR;
+ break;
+ case S_IFLNK:
+ stat_info->type = OS_FILE_TYPE_LINK;
+ break;
+ case S_IFBLK:
+ /* Handle block device as regular file. */
+ case S_IFCHR:
+ /* Handle character device as regular file. */
+ case S_IFREG:
+ stat_info->type = OS_FILE_TYPE_FILE;
+ break;
+ default:
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+
+ if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
+
+ int fh;
+ int access;
+
+ access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
+
- fh = ::open(path, access, os_innodb_umask);
++ fh = ::open(path, access | O_CLOEXEC, os_innodb_umask);
+
+ if (fh == -1) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ close(fh); /* the descriptor was only needed for the probe */
+ }
+ }
+
+#endif /* __WIN__ */
+
+ stat_info->ctime = statinfo.st_ctime;
+ stat_info->atime = statinfo.st_atime;
+ stat_info->mtime = statinfo.st_mtime;
+ stat_info->size = statinfo.st_size;
+
+ return(DB_SUCCESS);
+}
+
+/* path name separator character: a backslash when __WIN__ is defined,
+a forward slash otherwise */
+#ifdef __WIN__
+# define OS_FILE_PATH_SEPARATOR '\\'
+#else
+# define OS_FILE_PATH_SEPARATOR '/'
+#endif
+
+/****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+The tablename is always split at a literal forward slash, while old_path is
+split at OS_FILE_PATH_SEPARATOR. If tablename contains no slash it is used
+whole as the base name; if old_path contains no separator the whole of
+old_path is kept as the directory part. The result always ends in ".ibd".
+
+This function allocates memory to be returned. It is the callers
+responsibility to free the return value after it is no longer needed.
+
+@return own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+ const char* old_path, /*!< in: pathname */
+ const char* tablename) /*!< in: contains new base name */
+{
+ ulint dir_len;
+ char* last_slash;
+ char* base_name;
+ char* new_path;
+ ulint new_path_len;
+
+ /* Split the tablename into its database and table name components.
+ They are separated by a '/'. */
+ last_slash = strrchr((char*) tablename, '/');
+ base_name = last_slash ? last_slash + 1 : (char*) tablename;
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename.ibd which starts after that slash. */
+ last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
+ dir_len = last_slash ? last_slash - old_path : strlen(old_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; /* the
+ sizeof term covers the separator, the ".ibd" suffix and the final NUL */
+ new_path = static_cast<char*>(mem_alloc(new_path_len));
+ memcpy(new_path, old_path, dir_len); /* copied without a terminator; the
+ formatted tail below is written directly after it */
+
+ ut_snprintf(new_path + dir_len,
+ new_path_len - dir_len,
+ "%c%s.ibd",
+ OS_FILE_PATH_SEPARATOR,
+ base_name);
+
+ return(new_path);
+}
+
+/****************************************************************//**
+This function returns a remote path name by combining a data directory
+path provided in a DATA DIRECTORY clause with the tablename which is
+in the form 'database/tablename'. It strips the file basename (which
+is the tablename) found after the last directory in the path provided.
+The full filepath created will include the database name as a directory
+under the path provided. The filename is the tablename with the '.ibd'
+extension. All input and output strings are null-terminated.
+
+The tablename argument is appended whole, including its database part, and
+the extension passed in is used as given (the header above mentions '.ibd'
+but the parameter comment also allows cfg). If data_dir_path contains no
+OS_FILE_PATH_SEPARATOR, all of it is kept as the directory part. The result
+is passed through srv_normalize_path_for_win() before it is returned.
+
+This function allocates memory to be returned. It is the callers
+responsibility to free the return value after it is no longer needed.
+
+@return own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+ const char* data_dir_path, /*!< in: pathname */
+ const char* tablename, /*!< in: tablename */
+ const char* extention) /*!< in: file extension; ibd,cfg */
+{
+ ulint data_dir_len;
+ char* last_slash;
+ char* new_path;
+ ulint new_path_len;
+
+ ut_ad(extention && strlen(extention) == 3); /* debug check only: exactly
+ three characters are expected */
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename or tablename which starts after that slash. */
+ last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+ data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ new_path_len = data_dir_len + strlen(tablename)
+ + sizeof "/." + strlen(extention); /* the sizeof term covers the
+ separator, the dot and the final NUL */
+ new_path = static_cast<char*>(mem_alloc(new_path_len));
+ memcpy(new_path, data_dir_path, data_dir_len);
+ ut_snprintf(new_path + data_dir_len,
+ new_path_len - data_dir_len,
+ "%c%s.%s",
+ OS_FILE_PATH_SEPARATOR,
+ tablename,
+ extention);
+
+ srv_normalize_path_for_win(new_path);
+
+ return(new_path);
+}
+
+/****************************************************************//**
+This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command.
+
+NOTE(review): the early returns are not all side-effect free. If no '.' is
+found the buffer is untouched. If a '.' is found but no separator, the
+buffer has already been cut at that '.'. If only one separator exists, the
+buffer has additionally been cut at that separator. Callers that rely on
+an unchanged buffer in the unexpected-format case should confirm this is
+acceptable. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+ char* data_dir_path) /*!< in/out: full path/data_dir_path */
+{
+ char* ptr;
+ char* tablename;
+ ulint tablename_len;
+
+ /* Replace the period before the extension with a null byte. */
+ ptr = strrchr((char*) data_dir_path, '.');
+ if (!ptr) {
+ return;
+ }
+ ptr[0] = '\0';
+
+ /* The tablename starts after the last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+ if (!ptr) {
+ return;
+ }
+ ptr[0] = '\0';
+ tablename = ptr + 1;
+
+ /* The databasename starts after the next to last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+ if (!ptr) {
+ return;
+ }
+ tablename_len = ut_strlen(tablename);
+
+ ut_memmove(++ptr, tablename, tablename_len); /* overwrite the database
+ name with the table name; source and destination lie in the same buffer */
+
+ ptr[tablename_len] = '\0';
+}
+
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+ path dirname basename
+ "/usr/lib" "/usr" "lib"
+ "/usr/" "/" "usr"
+ "usr" "." "usr"
+ "/" "/" "/"
+ "." "." "."
+ ".." "." ".."
+
+NOTE(review): the implementation below only looks for the last
+OS_FILE_PATH_SEPARATOR and does not strip trailing separators, so for
+"/usr/" it appears to return "/usr" rather than the "/" listed in the table
+above (assuming mem_strdupl copies the given number of characters) -
+confirm. The root case returns the literal "/" even where
+OS_FILE_PATH_SEPARATOR is a backslash.
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+ const char* path) /*!< in: pathname */
+{
+ /* Find the offset of the last slash */
+ const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
+ if (!last_slash) {
+ /* No slash in the path, return "." */
+
+ return(mem_strdup("."));
+ }
+
+ /* Ok, there is a slash */
+
+ if (last_slash == path) {
+ /* last slash is the first char of the path */
+
+ return(mem_strdup("/"));
+ }
+
+ /* Non-trivial directory component */
+
+ return(mem_strdupl(path, last_slash - path));
+}
+
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+The directory part of path is taken with os_file_dirname(). If it does not
+exist, the function first recurses on that directory part so that its own
+parents are created, and then creates it with os_file_create_directory().
+In read-only mode (srv_read_only_mode) nothing is created: an error is
+logged and FALSE is returned.
+NOTE(review): when the directory part already exists the function returns
+TRUE without looking at the type reported by os_file_status(), so an
+existing non-directory is accepted - confirm this is intended.
+@return TRUE if call succeeded FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+ const char* path) /*!< in: path name */
+{
+ if (srv_read_only_mode) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "read only mode set. Can't create subdirectories '%s'",
+ path);
+
+ return(FALSE);
+
+ }
+
+ char* subdir = os_file_dirname(path); /* heap copy; freed on every
+ return path below */
+
+ if (strlen(subdir) == 1
+ && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
+ /* subdir is root or cwd, nothing to do */
+ mem_free(subdir);
+
+ return(TRUE);
+ }
+
+ /* Test if subdir exists */
+ os_file_type_t type;
+ ibool subdir_exists;
+ ibool success = os_file_status(subdir, &subdir_exists, &type);
+
+ if (success && !subdir_exists) {
+
+ /* subdir does not exist, create it */
+ success = os_file_create_subdirs_if_needed(subdir); /* parents first */
+
+ if (!success) {
+ mem_free(subdir);
+
+ return(FALSE);
+ }
+
+ success = os_file_create_directory(subdir, FALSE);
+ }
+
+ mem_free(subdir);
+
+ return(success);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Returns a pointer to the nth slot in the aio array.
+The index is checked with ut_a against array->n_slots before the slot
+address is computed.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_get_nth_slot(
+/*======================*/
+ os_aio_array_t* array, /*!< in: aio array */
+ ulint index) /*!< in: index of the slot */
+{
+ ut_a(index < array->n_slots);
+
+ return(&array->slots[index]);
+}
+
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+Creates an io_context for native linux AIO.
+The context is zeroed and io_setup() is called. A result of -EAGAIN is
+retried up to OS_AIO_IO_SETUP_RETRY_ATTEMPTS times, sleeping
+OS_AIO_IO_SETUP_RETRY_SLEEP between attempts; -ENOSYS and any other
+failure are reported to stderr without a retry. Every failure path ends
+with a hint about innodb_use_native_aio.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_create_io_ctx(
+/*=======================*/
+ ulint max_events, /*!< in: number of events. */
+ io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
+{
+ int ret;
+ ulint retries = 0;
+
+retry:
+ memset(io_ctx, 0x0, sizeof(*io_ctx)); /* redone on every attempt */
+
+ /* Initialize the io_ctx. Tell it how many pending
+ IO requests this context will handle. */
+
+ ret = io_setup(max_events, io_ctx);
+ if (ret == 0) {
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "InnoDB: Linux native AIO:"
+ " initialized io_ctx for segment\n");
+#endif
+ /* Success. Return now. */
+ return(TRUE);
+ }
+
+ /* If we hit EAGAIN we'll make a few attempts before failing. */
+
+ switch (ret) { /* the failure value is compared against negated errno
+ constants */
+ case -EAGAIN:
+ if (retries == 0) {
+ /* First time around. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: io_setup() failed"
+ " with EAGAIN. Will make %d attempts"
+ " before giving up.\n",
+ OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+ }
+
+ if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
+ ++retries;
+ fprintf(stderr,
+ "InnoDB: Warning: io_setup() attempt"
+ " %lu failed.\n",
+ retries);
+ os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
+ goto retry;
+ }
+
+ /* Have tried enough. Better call it a day. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: io_setup() failed"
+ " with EAGAIN after %d attempts.\n",
+ OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+ break;
+
+ case -ENOSYS:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Linux Native AIO interface"
+ " is not supported on this platform. Please"
+ " check your OS documentation and install"
+ " appropriate binary of InnoDB.\n");
+
+ break;
+
+ default:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Linux Native AIO setup"
+ " returned following error[%d]\n", -ret);
+ break;
+ }
+
+ fprintf(stderr,
+ "InnoDB: You can disable Linux Native AIO by"
+ " setting innodb_use_native_aio = 0 in my.cnf\n");
+ return(FALSE);
+}
+
+/******************************************************************//**
+Checks if the system supports native linux aio. On some kernel
+versions where native aio is supported it won't work on tmpfs. In such
+cases we can't use native aio as it is not possible to mix simulated
+and native aio.
+The probe creates an io context with room for one event, then submits one
+request and waits for it: a page-sized write to a temporary file from
+innobase_mysql_tmpfile() in normal mode, or a 512-byte read of ib_logfile0
+under srv_log_group_home_dir when srv_read_only_mode is set.
+NOTE(review): no io_destroy() call on io_ctx is visible in this function,
+so the probe context looks leaked on every return path after it has been
+created - confirm.
+@return: TRUE if supported, FALSE otherwise. */
+static
+ibool
+os_aio_native_aio_supported(void)
+/*=============================*/
+{
+ int fd;
+ io_context_t io_ctx;
+ char name[1000]; /* filled in only on the read-only path below */
+
+ if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
+ /* The platform does not support native aio. */
+ return(FALSE);
+ } else if (!srv_read_only_mode) {
+ /* Now check if tmpdir supports native aio ops. */
+ fd = innobase_mysql_tmpfile(NULL);
+
+ if (fd < 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to create temp file to check "
+ "native AIO support.");
+
+ return(FALSE);
+ }
+ } else {
+
+ srv_normalize_path_for_win(srv_log_group_home_dir);
+
+ ulint dirnamelen = strlen(srv_log_group_home_dir);
+ ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); /* leaves
+ room in name for the separator and the file name appended below */
+ memcpy(name, srv_log_group_home_dir, dirnamelen);
+
+ /* Add a path separator if needed. */
+ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ name[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ strcpy(name + dirnamelen, "ib_logfile0");
+
+ fd = ::open(name, O_RDONLY | O_CLOEXEC);
+
+ if (fd == -1) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to open \"%s\" to check "
+ "native AIO read support.", name);
+
+ return(FALSE);
+ }
+ }
+
+ struct io_event io_event;
+
+ memset(&io_event, 0x0, sizeof(io_event));
+
+ byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2)); /* twice
+ the page size so that an aligned page fits inside */
+ byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+
+ struct iocb iocb;
+
+ /* Suppress valgrind warning. */
+ memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
+ memset(&iocb, 0x0, sizeof(iocb));
+
+ struct iocb* p_iocb = &iocb;
+
+ if (!srv_read_only_mode) {
+ io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
+ } else {
+ ut_a(UNIV_PAGE_SIZE >= 512);
+ io_prep_pread(p_iocb, fd, ptr, 512, 0);
+ }
+
+ int err = io_submit(io_ctx, 1, &p_iocb);
+
+ if (err >= 1) {
+ /* Now collect the submitted IO request. */
+ err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
+ }
+
+ ut_free(buf);
+ close(fd);
+
+ switch (err) { /* 1 means one event was collected; anything else is
+ reported as a failure */
+ case 1:
+ return(TRUE);
+
+ case -EINVAL:
+ case -ENOSYS:
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Linux Native AIO not supported. You can either "
+ "move %s to a file system that supports native "
+ "AIO or you can set innodb_use_native_aio to "
+ "FALSE to avoid this message.",
+ srv_read_only_mode ? name : "tmpdir");
+
+ /* fall through. */
+ default: /* reached directly, and also after the case above, so
+ -EINVAL and -ENOSYS produce both messages */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Linux Native AIO check on %s returned error[%d]",
+ srv_read_only_mode ? name : "tmpdir", -err);
+ }
+
+ return(FALSE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/******************************************************************//**
+Creates an aio wait array. Note that we return NULL in case of failure.
+We don't care about freeing memory here because we assume that a
+failure will result in server refusing to start up.
+@return own: aio array, NULL on failure */
+static
+os_aio_array_t*
+os_aio_array_create(
+/*================*/
+    ulint n,          /*!< in: maximum number of pending aio
+                      operations allowed; n must be
+                      divisible by n_segments */
+    ulint n_segments) /*!< in: number of segments in the aio array */
+{
+    os_aio_array_t* array;
+#ifdef WIN_ASYNC_IO
+    OVERLAPPED* over;
+#elif defined(LINUX_NATIVE_AIO)
+    struct io_event* io_event = NULL;
+#endif /* WIN_ASYNC_IO */
+    ut_a(n > 0);
+    ut_a(n_segments > 0);
+
+    array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
+    memset(array, 0x0, sizeof(*array));
+
+    array->mutex = os_mutex_create();
+    array->not_full = os_event_create();
+    array->is_empty = os_event_create();
+
+    /* A freshly created array has no reserved slots. */
+    os_event_set(array->is_empty);
+
+    array->n_slots = n;
+    array->n_segments = n_segments;
+
+    array->slots = static_cast<os_aio_slot_t*>(
+        ut_malloc(n * sizeof(*array->slots)));
+
+    /* Zero all n slots. The size must be the byte length of the slot
+    array itself; sizeof(n * sizeof(...)) would only be the size of a
+    size_t and leave nearly all of the array uninitialized. */
+    memset(array->slots, 0x0, n * sizeof(*array->slots));
+#ifdef __WIN__
+    array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
+#endif /* __WIN__ */
+
+#if defined(LINUX_NATIVE_AIO)
+    array->aio_ctx = NULL;
+    array->aio_events = NULL;
+
+    /* If we are not using native aio interface then skip this
+    part of initialization. */
+    if (!srv_use_native_aio) {
+        goto skip_native_aio;
+    }
+
+    /* Initialize the io_context array. One io_context
+    per segment in the array. */
+
+    array->aio_ctx = static_cast<io_context**>(
+        ut_malloc(n_segments * sizeof(*array->aio_ctx)));
+
+    for (ulint i = 0; i < n_segments; ++i) {
+        if (!os_aio_linux_create_io_ctx(n/n_segments,
+                                        &array->aio_ctx[i])) {
+            /* If something bad happened during aio setup
+            we disable linux native aio.
+            The disadvantage will be a small memory leak
+            at shutdown but that's ok compared to a crash
+            or a not working server.
+            This frequently happens when running the test suite
+            with many threads on a system with low fs.aio-max-nr!
+            */
+
+            fprintf(stderr,
+                    " InnoDB: Warning: Linux Native AIO disabled "
+                    "because os_aio_linux_create_io_ctx() "
+                    "failed. To get rid of this warning you can "
+                    "try increasing system "
+                    "fs.aio-max-nr to 1048576 or larger or "
+                    "setting innodb_use_native_aio = 0 in my.cnf\n");
+            srv_use_native_aio = FALSE;
+            goto skip_native_aio;
+        }
+    }
+
+    /* Initialize the event array. One event per slot. */
+    io_event = static_cast<struct io_event*>(
+        ut_malloc(n * sizeof(*io_event)));
+
+    memset(io_event, 0x0, sizeof(*io_event) * n);
+    array->aio_events = io_event;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+    /* Per-slot initialization: record each slot's own index and mark
+    it free; add the platform-specific completion state. */
+    for (ulint i = 0; i < n; i++) {
+        os_aio_slot_t* slot;
+
+        slot = os_aio_array_get_nth_slot(array, i);
+
+        slot->pos = i;
+        slot->reserved = FALSE;
+#ifdef WIN_ASYNC_IO
+        slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
+
+        over = &slot->control;
+
+        over->hEvent = slot->handle;
+
+        array->handles[i] = over->hEvent;
+
+#elif defined(LINUX_NATIVE_AIO)
+        memset(&slot->control, 0x0, sizeof(slot->control));
+        slot->n_bytes = 0;
+        slot->ret = 0;
+#endif /* WIN_ASYNC_IO */
+    }
+
+    return(array);
+}
+
+/************************************************************************//**
+Frees an aio wait array. */
+static
+void
+os_aio_array_free(
+/*==============*/
+    os_aio_array_t*& array) /*!< in, own: array to free */
+{
+#ifdef WIN_ASYNC_IO
+    /* Close the event handle owned by every slot. */
+    for (ulint pos = 0; pos < array->n_slots; pos++) {
+        CloseHandle(os_aio_array_get_nth_slot(array, pos)->handle);
+    }
+#endif /* WIN_ASYNC_IO */
+
+#ifdef __WIN__
+    ut_free(array->handles);
+#endif /* __WIN__ */
+
+    /* Synchronization objects of the array. */
+    os_mutex_free(array->mutex);
+    os_event_free(array->not_full);
+    os_event_free(array->is_empty);
+
+#if defined(LINUX_NATIVE_AIO)
+    /* The native aio buffers are released only while native aio is
+    still enabled. */
+    if (srv_use_native_aio) {
+        ut_free(array->aio_events);
+        ut_free(array->aio_ctx);
+    }
+#endif /* LINUX_NATIVE_AIO */
+
+    /* Finally the slots and the array itself; the caller's pointer
+    is cleared through the reference. */
+    ut_free(array->slots);
+    ut_free(array);
+
+    array = 0;
+}
+
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that */
+UNIV_INTERN
+ibool
+os_aio_init(
+/*========*/
+ ulint n_per_seg, /*!< in: maximum number of pending aio
+ operations allowed per segment */
+ ulint n_read_segs, /*!< in: number of reader threads */
+ ulint n_write_segs, /*!< in: number of writer threads */
+ ulint n_slots_sync) /*!< in: number of slots in the sync aio
+ array */
+{
+ /* Returns TRUE when every array was created, FALSE as soon as one
+ os_aio_array_create() call returns NULL. */
+
+ os_io_init_simple();
+
+#if defined(LINUX_NATIVE_AIO)
+ /* Check if native aio is supported on this system and tmpfs */
+ if (srv_use_native_aio && !os_aio_native_aio_supported()) {
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
+
+ /* From here on the simulated aio code paths are used. */
+ srv_use_native_aio = FALSE;
+ }
+#endif /* LINUX_NATIVE_AIO */
+
+ srv_reset_io_thread_op_info();
+
+ os_aio_read_array = os_aio_array_create(
+ n_read_segs * n_per_seg, n_read_segs);
+
+ if (os_aio_read_array == NULL) {
+ return(FALSE);
+ }
+
+ /* In read-only mode the read segments start at global segment 0;
+ otherwise global segments 0 and 1 are taken by the ibuf and log
+ threads (see the assignments below) and reads start at 2. */
+ ulint start = (srv_read_only_mode) ? 0 : 2;
+ ulint n_segs = n_read_segs + start;
+
+ /* 0 is the ibuf (insert buffer) segment and 1 is the log segment. */
+ for (ulint i = start; i < n_segs; ++i) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "read thread";
+ }
+
+ /* Running total of global segments; published in
+ os_aio_n_segments below. */
+ ulint n_segments = n_read_segs;
+
+ if (!srv_read_only_mode) {
+
+ os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+
+ if (os_aio_log_array == NULL) {
+ return(FALSE);
+ }
+
+ ++n_segments;
+
+ srv_io_thread_function[1] = "log thread";
+
+ os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+ if (os_aio_ibuf_array == NULL) {
+ return(FALSE);
+ }
+
+ ++n_segments;
+
+ srv_io_thread_function[0] = "insert buffer thread";
+
+ os_aio_write_array = os_aio_array_create(
+ n_write_segs * n_per_seg, n_write_segs);
+
+ if (os_aio_write_array == NULL) {
+ return(FALSE);
+ }
+
+ n_segments += n_write_segs;
+
+ /* The write segments follow directly after the read segments. */
+ for (ulint i = start + n_read_segs; i < n_segments; ++i) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "write thread";
+ }
+
+ ut_ad(n_segments >= 4);
+ } else {
+ ut_ad(n_segments > 0);
+ }
+
+ /* The sync array is a single segment and is not counted in
+ n_segments. */
+ os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+
+ if (os_aio_sync_array == NULL) {
+ return(FALSE);
+ }
+
+ os_aio_n_segments = n_segments;
+
+ os_aio_validate();
+
+ os_last_printout = ut_time();
+
+ /* With native aio the per-segment wait events below are not
+ allocated. */
+ if (srv_use_native_aio) {
+ return(TRUE);
+ }
+
+ /* Simulated aio: one wait event per global segment. */
+ os_aio_segment_wait_events = static_cast<os_event_t*>(
+ ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
+
+ for (ulint i = 0; i < n_segments; ++i) {
+ os_aio_segment_wait_events[i] = os_event_create();
+ }
+
+ return(TRUE);
+}
+
+/***********************************************************************
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void)
+/*=============*/
+{
+    /* Arrays that may be absent, in the order in which they are
+    released. os_aio_array_free() clears each pointer through its
+    reference parameter. */
+    os_aio_array_t** const optional_arrays[] = {
+        &os_aio_ibuf_array,
+        &os_aio_log_array,
+        &os_aio_write_array,
+        &os_aio_sync_array
+    };
+    const ulint n_optional
+        = sizeof(optional_arrays) / sizeof(optional_arrays[0]);
+
+    for (ulint k = 0; k < n_optional; k++) {
+        if (*optional_arrays[k] != 0) {
+            os_aio_array_free(*optional_arrays[k]);
+        }
+    }
+
+    /* The read array is released unconditionally. */
+    os_aio_array_free(os_aio_read_array);
+
+    /* The per-segment wait events are only released under
+    simulated aio. */
+    if (!srv_use_native_aio) {
+        for (ulint seg = 0; seg < os_aio_n_segments; seg++) {
+            os_event_free(os_aio_segment_wait_events[seg]);
+        }
+    }
+
+    ut_free(os_aio_segment_wait_events);
+
+    os_aio_segment_wait_events = 0;
+    os_aio_n_segments = 0;
+}
+
+#ifdef WIN_ASYNC_IO
+/************************************************************************//**
+Wakes up all async i/o threads in the array in Windows async i/o at
+shutdown. */
+static
+void
+os_aio_array_wake_win_aio_at_shutdown(
+/*==================================*/
+    os_aio_array_t* array) /*!< in: aio array */
+{
+    /* Signal the event handle of every slot in the array. */
+    for (ulint k = 0; k < array->n_slots; k++) {
+        SetEvent(array->slots[k].handle);
+    }
+}
+#endif
+
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void)
+/*=====================================*/
+{
+#ifdef WIN_ASYNC_IO
+    /* Windows native aio: signal the slot handles of each array.
+    The read array is signalled unconditionally, the other arrays
+    only when they exist. */
+    os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
+
+    os_aio_array_t* const optional_arrays[] = {
+        os_aio_write_array,
+        os_aio_ibuf_array,
+        os_aio_log_array
+    };
+
+    for (ulint k = 0;
+         k < sizeof(optional_arrays) / sizeof(optional_arrays[0]);
+         k++) {
+
+        if (optional_arrays[k] != 0) {
+            os_aio_array_wake_win_aio_at_shutdown(optional_arrays[k]);
+        }
+    }
+#elif defined(LINUX_NATIVE_AIO)
+    /* When using native AIO interface the io helper threads
+    wait on io_getevents with a timeout value of 500ms. At
+    each wake up these threads check the server status.
+    No need to do anything to wake them up. */
+#endif /* WIN_ASYNC_IO */
+
+    /* Simulated aio only: set the wait event of every segment. */
+    if (!srv_use_native_aio) {
+        for (ulint seg = 0; seg < os_aio_n_segments; seg++) {
+            os_event_set(os_aio_segment_wait_events[seg]);
+        }
+    }
+}
+
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void)
+/*=====================================*/
+{
+    ut_ad(!srv_read_only_mode);
+
+    /* Block on the write array's is_empty event. */
+    os_aio_array_t* const writes = os_aio_write_array;
+
+    os_event_wait(writes->is_empty);
+}
+
+/**********************************************************************//**
+Calculates segment number for a slot.
+@return segment number (which is the number used by, for example,
+i/o-handler threads) */
+static
+ulint
+os_aio_get_segment_no_from_slot(
+/*============================*/
+    os_aio_array_t* array, /*!< in: aio wait array */
+    os_aio_slot_t*  slot)  /*!< in: slot in this array */
+{
+    /* The ibuf and log arrays each map to one fixed segment. */
+    if (array == os_aio_ibuf_array) {
+        ut_ad(!srv_read_only_mode);
+
+        return(IO_IBUF_SEGMENT);
+    }
+
+    if (array == os_aio_log_array) {
+        ut_ad(!srv_read_only_mode);
+
+        return(IO_LOG_SEGMENT);
+    }
+
+    if (array == os_aio_read_array) {
+        /* Read segments start at 0 in read-only mode and at 2
+        otherwise. */
+        const ulint slots_per_seg = os_aio_read_array->n_slots
+            / os_aio_read_array->n_segments;
+        const ulint first_read_seg = srv_read_only_mode ? 0 : 2;
+
+        return(first_read_seg + slot->pos / slots_per_seg);
+    }
+
+    /* Anything else must be the write array, whose segments are
+    numbered after the two fixed segments and all read segments. */
+    ut_ad(!srv_read_only_mode);
+    ut_a(array == os_aio_write_array);
+
+    const ulint slots_per_seg = os_aio_write_array->n_slots
+        / os_aio_write_array->n_segments;
+
+    return(os_aio_read_array->n_segments + 2
+           + slot->pos / slots_per_seg);
+}
+
+/**********************************************************************//**
+Calculates local segment number and aio array from global segment number.
+@return local segment number within the aio array */
+static
+ulint
+os_aio_get_array_and_local_segment(
+/*===============================*/
+    os_aio_array_t** array,         /*!< out: aio wait array */
+    ulint            global_segment)/*!< in: global segment number */
+{
+    ut_a(global_segment < os_aio_n_segments);
+
+    /* Read-only mode: every global segment belongs to the read
+    array and local numbering equals global numbering. */
+    if (srv_read_only_mode) {
+        *array = os_aio_read_array;
+
+        return(global_segment);
+    }
+
+    /* The ibuf and log arrays consist of a single local segment. */
+    if (global_segment == IO_IBUF_SEGMENT) {
+        *array = os_aio_ibuf_array;
+
+        return(0);
+    }
+
+    if (global_segment == IO_LOG_SEGMENT) {
+        *array = os_aio_log_array;
+
+        return(0);
+    }
+
+    /* Read segments follow the two fixed segments; write segments
+    follow the read segments. */
+    const ulint first_write_seg = os_aio_read_array->n_segments + 2;
+
+    if (global_segment < first_write_seg) {
+        *array = os_aio_read_array;
+
+        return(global_segment - 2);
+    }
+
+    *array = os_aio_write_array;
+
+    return(global_segment - first_write_seg);
+}
+
+/*******************************************************************//**
+Requests for a slot in the aio array. If no slot is available, waits until
+not_full-event becomes signaled.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_reserve_slot(
+/*======================*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ os_aio_array_t* array, /*!< in: aio array */
+ fil_node_t* message1,/*!< in: message to be passed along with
+ the aio operation */
+ void* message2,/*!< in: message to be passed along with
+ the aio operation */
+ pfs_os_file_t file, /*!< in: file handle */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset */
+ ulint len) /*!< in: length of the block to read or write */
+{
+ os_aio_slot_t* slot = NULL;
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED* control;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ struct iocb* iocb;
+ off_t aio_offset;
+
+#endif /* WIN_ASYNC_IO */
+ ulint i;
+ ulint counter;
+ ulint slots_per_seg;
+ ulint local_seg;
+
+#ifdef WIN_ASYNC_IO
+ /* The length must fit in 32 bits. */
+ ut_a((len & 0xFFFFFFFFUL) == len);
+#endif /* WIN_ASYNC_IO */
+
+ /* No need of a mutex. Only reading constant fields */
+ slots_per_seg = array->n_slots / array->n_segments;
+
+ /* We attempt to keep adjacent blocks in the same local
+ segment. This can help in merging IO requests when we are
+ doing simulated AIO */
+ local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
+ % array->n_segments;
+
+loop:
+ os_mutex_enter(array->mutex);
+
+ if (array->n_reserved == array->n_slots) {
+ /* The array is full: release the mutex, wait for the
+ not_full event and then retry from the top. */
+ os_mutex_exit(array->mutex);
+
+ if (!srv_use_native_aio) {
+ /* If the handler threads are suspended, wake them
+ so that we get more slots */
+
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ os_event_wait(array->not_full);
+
+ goto loop;
+ }
+
+ /* We start our search for an available slot from our preferred
+ local segment and do a full scan of the array. We are
+ guaranteed to find a slot in full scan. */
+ for (i = local_seg * slots_per_seg, counter = 0;
+ counter < array->n_slots;
+ i++, counter++) {
+
+ /* Wrap around to the start of the array. */
+ i %= array->n_slots;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved == FALSE) {
+ goto found;
+ }
+ }
+
+ /* We MUST always be able to get hold of a free slot: the mutex is
+ held and n_reserved < n_slots was checked above. */
+ ut_error;
+
+found:
+ ut_a(slot->reserved == FALSE);
+ array->n_reserved++;
+
+ /* Keep the is_empty and not_full events consistent with the new
+ reservation count. */
+ if (array->n_reserved == 1) {
+ os_event_reset(array->is_empty);
+ }
+
+ if (array->n_reserved == array->n_slots) {
+ os_event_reset(array->not_full);
+ }
+
+ /* Record the request in the slot. */
+ slot->reserved = TRUE;
+ slot->reservation_time = ut_time();
+ slot->message1 = message1;
+ slot->message2 = message2;
+ slot->file = file;
+ slot->name = name;
+ slot->len = len;
+ slot->type = type;
+ slot->buf = static_cast<byte*>(buf);
+ slot->offset = offset;
+ slot->io_already_done = FALSE;
+
+#ifdef WIN_ASYNC_IO
+ /* Split the 64-bit file offset into the two 32-bit OVERLAPPED
+ fields and reset the slot's event handle. */
+ control = &slot->control;
+ control->Offset = (DWORD) offset & 0xFFFFFFFF;
+ control->OffsetHigh = (DWORD) (offset >> 32);
+ ResetEvent(slot->handle);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ /* If we are not using native AIO skip this part. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Check if we are dealing with 64 bit arch.
+ If not then make sure that offset fits in 32 bits. */
+ aio_offset = (off_t) offset;
+
+ ut_a(sizeof(aio_offset) >= sizeof(offset)
+ || ((os_offset_t) aio_offset) == offset);
+
+ /* Prepare the iocb stored in the slot; the request is only
+ prepared here, not submitted. */
+ iocb = &slot->control;
+
+ if (type == OS_FILE_READ) {
+ io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
+ } else {
+ ut_a(type == OS_FILE_WRITE);
+ io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
+ }
+
+ /* Store a back pointer to the slot in the iocb. */
+ iocb->data = (void*) slot;
+ slot->n_bytes = 0;
+ slot->ret = 0;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+ os_mutex_exit(array->mutex);
+
+ return(slot);
+}
+
+/*******************************************************************//**
+Frees a slot in the aio array. */
+static
+void
+os_aio_array_free_slot(
+/*===================*/
+    os_aio_array_t* array, /*!< in: aio array */
+    os_aio_slot_t*  slot)  /*!< in: pointer to slot */
+{
+    os_mutex_enter(array->mutex);
+
+    ut_ad(slot->reserved);
+
+    /* Hand the slot back and update the reservation count. */
+    slot->reserved = FALSE;
+    --array->n_reserved;
+
+    /* The array has just stopped being full. */
+    if (array->n_slots - 1 == array->n_reserved) {
+        os_event_set(array->not_full);
+    }
+
+    /* The array has just become empty. */
+    if (0 == array->n_reserved) {
+        os_event_set(array->is_empty);
+    }
+
+#ifdef WIN_ASYNC_IO
+
+    ResetEvent(slot->handle);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+    if (!srv_use_native_aio) {
+        /* These fields should not be used if we are not
+        using native AIO. */
+        ut_ad(slot->n_bytes == 0);
+        ut_ad(slot->ret == 0);
+    } else {
+        /* Clear the native aio state of the slot for reuse. */
+        memset(&slot->control, 0x0, sizeof(slot->control));
+        slot->n_bytes = 0;
+        slot->ret = 0;
+        /*fprintf(stderr, "Freed up Linux native slot.\n");*/
+    }
+
+#endif
+    os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Wakes up a simulated aio i/o-handler thread if it has something to do. */
+static
+void
+os_aio_simulated_wake_handler_thread(
+/*=================================*/
+    ulint global_segment) /*!< in: the number of the segment in the aio
+                          arrays */
+{
+    os_aio_array_t* array;
+
+    ut_ad(!srv_use_native_aio);
+
+    const ulint local_segment
+        = os_aio_get_array_and_local_segment(&array, global_segment);
+
+    /* Slots [first_slot, first_slot + slots_per_seg) belong to this
+    segment. */
+    const ulint slots_per_seg = array->n_slots / array->n_segments;
+    const ulint first_slot = local_segment * slots_per_seg;
+
+    bool has_request = false;
+
+    os_mutex_enter(array->mutex);
+
+    for (ulint k = 0; k < slots_per_seg; k++) {
+        const os_aio_slot_t* candidate
+            = os_aio_array_get_nth_slot(array, first_slot + k);
+
+        if (candidate->reserved) {
+            /* Found an i/o request */
+            has_request = true;
+            break;
+        }
+    }
+
+    os_mutex_exit(array->mutex);
+
+    /* The event is set after the mutex has been released. */
+    if (has_request) {
+        os_event_set(os_aio_segment_wait_events[global_segment]);
+    }
+}
+
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void)
+/*=======================================*/
+{
+    /* Nothing to do unless simulated aio is in use. */
+    if (!srv_use_native_aio) {
+
+        os_aio_recommend_sleep_for_read_threads = FALSE;
+
+        ulint seg = 0;
+
+        while (seg < os_aio_n_segments) {
+            os_aio_simulated_wake_handler_thread(seg);
+            seg++;
+        }
+    }
+}
+
+#ifdef _WIN32
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep()
+{
+
+/* The idea of putting background IO threads to sleep is only for
+Windows when using simulated AIO. Windows XP seems to schedule
+background threads too eagerly to allow for coalescing during
+readahead requests. */
+
+    /* Nothing to do unless simulated aio is in use. */
+    if (!srv_use_native_aio) {
+
+        os_aio_recommend_sleep_for_read_threads = TRUE;
+
+        /* Reset the wait event of every global segment that maps
+        to the read array. */
+        for (ulint seg = 0; seg < os_aio_n_segments; seg++) {
+            os_aio_array_t* owner;
+
+            os_aio_get_array_and_local_segment(&owner, seg);
+
+            if (owner == os_aio_read_array) {
+                os_event_reset(os_aio_segment_wait_events[seg]);
+            }
+        }
+    }
+}
+#endif /* _WIN32 */
+
+#if defined(LINUX_NATIVE_AIO)
+/*******************************************************************//**
+Dispatch an AIO request to the kernel.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_dispatch(
+/*==================*/
+    os_aio_array_t* array, /*!< in: io request array. */
+    os_aio_slot_t*  slot)  /*!< in: an already reserved slot. */
+{
+    ut_ad(slot != NULL);
+    ut_ad(array);
+
+    ut_a(slot->reserved);
+
+    /* The iocb lives inside the slot; the io_context is chosen by
+    the segment that the slot position falls into (one context per
+    segment). */
+    struct iocb* request = &slot->control;
+    const ulint ctx_no
+        = (slot->pos * array->n_segments) / array->n_slots;
+
+    const int n_queued = io_submit(array->aio_ctx[ctx_no], 1, &request);
+
+#if defined(UNIV_AIO_DEBUG)
+    fprintf(stderr,
+            "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
+            (slot->type == OS_FILE_WRITE) ? 'w' : 'r', n_queued, slot,
+            array->aio_ctx[ctx_no], (ulong) ctx_no);
+#endif
+
+    /* io_submit returns number of successfully
+    queued requests or -errno. */
+    if (UNIV_UNLIKELY(n_queued != 1)) {
+        errno = -n_queued;
+
+        return(FALSE);
+    }
+
+    return(TRUE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not directly this function!
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio_func(
+/*========*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ pfs_os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset where to read or write */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2)/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+{
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+#ifdef WIN_ASYNC_IO
+ ibool retval;
+ BOOL ret = TRUE;
+ DWORD len = (DWORD) n;
+ struct fil_node_t* dummy_mess1;
+ void* dummy_mess2;
+ ulint dummy_type;
+#endif /* WIN_ASYNC_IO */
+ ulint wake_later;
+ /* Debug-only sanity checks: non-null buffer, and a length and
+ offset that are multiples of OS_FILE_LOG_BLOCK_SIZE. */
+ ut_ad(buf);
+ ut_ad(n > 0);
+ ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(os_aio_validate_skip());
+#ifdef WIN_ASYNC_IO
+ ut_ad((n & 0xFFFFFFFFUL) == n);
+#endif
+
+ /* Split the wake-later flag off from the mode proper. */
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+ mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
+
+ /* Debug injection point: force the synchronous path. */
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
+
+ if (mode == OS_AIO_SYNC
+#ifdef WIN_ASYNC_IO
+ && !srv_use_native_aio
+#endif /* WIN_ASYNC_IO */
+ ) {
+ ibool ret;
+
+ /* This is actually an ordinary synchronous read or write:
+ no need to use an i/o-handler thread. NOTE that if we use
+ Windows async i/o, Windows does not allow us to use
+ ordinary synchronous os_file_read etc. on the same file,
+ therefore we have built a special mechanism for synchronous
+ wait in the Windows case.
+ Also note that the Performance Schema instrumentation has
+ been performed by current os_aio_func()'s wrapper function
+ pfs_os_aio_func(). So we would no longer need to call
+ Performance Schema instrumented os_file_read() and
+ os_file_write(). Instead, we should use os_file_read_func()
+ and os_file_write_func() */
+
+ if (type == OS_FILE_READ) {
+ ret = os_file_read_func(file.m_file, buf, offset, n);
+ } else {
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(type == OS_FILE_WRITE);
+
+ ret = os_file_write_func(name, file.m_file, buf, offset, n);
+
+ /* Debug injection point: simulate a failed write with
+ errno 28. */
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
+
+ /* Only a failed write is passed to the error handler
+ here; a failed read is just returned to the caller. */
+ if (!ret) {
+ os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE);
+ }
+ }
+
+ return ret;
+ }
+
+try_again:
+ /* Select the aio array for this request from mode and type. In
+ read-only mode the ibuf and log modes use the read array. */
+ switch (mode) {
+ case OS_AIO_NORMAL:
+ if (type == OS_FILE_READ) {
+ array = os_aio_read_array;
+ } else {
+ ut_ad(!srv_read_only_mode);
+ array = os_aio_write_array;
+ }
+ break;
+ case OS_AIO_IBUF:
+ ut_ad(type == OS_FILE_READ);
+ /* Reduce probability of deadlock bugs in connection with ibuf:
+ do not let the ibuf i/o handler sleep */
+
+ wake_later = FALSE;
+
+ if (srv_read_only_mode) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_ibuf_array;
+ }
+ break;
+ case OS_AIO_LOG:
+ if (srv_read_only_mode) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_log_array;
+ }
+ break;
+ case OS_AIO_SYNC:
+ array = os_aio_sync_array;
+#if defined(LINUX_NATIVE_AIO)
+ /* In Linux native AIO we don't use sync IO array. */
+ ut_a(!srv_use_native_aio);
+#endif /* LINUX_NATIVE_AIO */
+ break;
+ default:
+ ut_error;
+ array = NULL; /* Eliminate compiler warning */
+ }
+
+ /* Reserve a slot and record the request in it. */
+ slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
+ name, buf, offset, n);
+ if (type == OS_FILE_READ) {
+ if (srv_use_native_aio) {
+ /* The read counters are updated on the native aio
+ path only. */
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+#ifdef WIN_ASYNC_IO
+ ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
+ &(slot->control));
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot)) {
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ } else {
+ /* Simulated aio: wake the handler thread of the
+ slot's segment unless the caller asked to wake
+ later. */
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else if (type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ if (srv_use_native_aio) {
+ os_n_file_writes++;
+#ifdef WIN_ASYNC_IO
+ ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
+ &(slot->control));
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot)) {
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else {
+ ut_error;
+ }
+
+#ifdef WIN_ASYNC_IO
+ if (srv_use_native_aio) {
+ /* Success is either an immediate full-length completion or
+ a pending overlapped operation. */
+ if ((ret && len == n)
+ || (!ret && GetLastError() == ERROR_IO_PENDING)) {
+ /* aio was queued successfully! */
+
+ if (mode == OS_AIO_SYNC) {
+ /* We want a synchronous i/o operation on a
+ file where we also use async i/o: in Windows
+ we must use the same wait mechanism as for
+ async i/o */
+
+ retval = os_aio_windows_handle(
+ ULINT_UNDEFINED, slot->pos,
+ &dummy_mess1, &dummy_mess2,
+ &dummy_type);
+
+ return(retval);
+ }
+
+ return(TRUE);
+ }
+
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ /* aio was queued successfully! */
+ return(TRUE);
+
+#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
+err_exit:
+#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
+ /* Submission failed: release the slot, then let the error handler
+ decide whether the whole request is retried from try_again. */
+ os_aio_array_free_slot(array, slot);
+
+ if (os_file_handle_error(
+ name,type == OS_FILE_READ ? "aio read" : "aio write")) {
+
+ goto try_again;
+ }
+
+ return(FALSE);
+}
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+    ulint segment, /*!< in: the number of the segment in the aio
+                   arrays to wait for; segment 0 is the ibuf
+                   i/o thread, segment 1 the log i/o thread,
+                   then follow the non-ibuf read threads, and as
+                   the last are the non-ibuf write threads; if
+                   this is ULINT_UNDEFINED, then it means that
+                   sync aio is used, and this parameter is
+                   ignored */
+    ulint pos,     /*!< this parameter is used only in sync aio:
+                   wait for the aio slot at this position */
+    fil_node_t**message1, /*!< out: the messages passed with the aio
+                   request; note that also in the case where
+                   the aio operation failed, these output
+                   parameters are valid and can be used to
+                   restart the operation, for example */
+    void** message2,
+    ulint* type)   /*!< out: OS_FILE_WRITE or ..._READ */
+{
+    ulint orig_seg = segment;
+    os_aio_array_t* array;
+    os_aio_slot_t* slot;
+    ulint n;
+    ulint i;
+    ibool ret_val;
+    BOOL ret;
+    DWORD len;
+    BOOL retry = FALSE;
+
+    /* Resolve the array and the local segment to wait on. */
+    if (segment == ULINT_UNDEFINED) {
+        segment = 0;
+        array = os_aio_sync_array;
+    } else {
+        segment = os_aio_get_array_and_local_segment(&array, segment);
+    }
+
+    /* NOTE! We only access constant fields in os_aio_array. Therefore
+    we do not have to acquire the protecting mutex yet */
+
+    ut_ad(os_aio_validate_skip());
+    ut_ad(segment < array->n_segments);
+
+    /* Number of slots per segment. */
+    n = array->n_slots / array->n_segments;
+
+    if (array == os_aio_sync_array) {
+
+        /* Sync aio: wait for the one slot the caller named. */
+        WaitForSingleObject(
+            os_aio_array_get_nth_slot(array, pos)->handle,
+            INFINITE);
+
+        i = pos;
+
+    } else {
+        if (orig_seg != ULINT_UNDEFINED) {
+            srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+        }
+
+        /* Wait for any of the n handles of this segment. */
+        i = WaitForMultipleObjects(
+            (DWORD) n, array->handles + segment * n,
+            FALSE, INFINITE);
+    }
+
+    os_mutex_enter(array->mutex);
+
+    /* At shutdown with nothing reserved there is no request to
+    report; *type is left untouched on this path. */
+    if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
+        && array->n_reserved == 0) {
+        *message1 = NULL;
+        *message2 = NULL;
+        os_mutex_exit(array->mutex);
+        return(TRUE);
+    }
+
+    /* The index must lie inside the segment: valid values are
+    WAIT_OBJECT_0 .. WAIT_OBJECT_0 + n - 1. An upper bound of
+    "<= WAIT_OBJECT_0 + n" would let i == n through and address a
+    slot outside this segment. */
+    ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n);
+
+    slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+    ut_a(slot->reserved);
+
+    if (orig_seg != ULINT_UNDEFINED) {
+        srv_set_io_thread_op_info(
+            orig_seg, "get windows aio return value");
+    }
+    ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE);
+
+    /* The messages and the type are reported on success and failure
+    alike. */
+    *message1 = slot->message1;
+    *message2 = slot->message2;
+
+    *type = slot->type;
+
+    if (ret && len == slot->len) {
+
+        ret_val = TRUE;
+    } else if (os_file_handle_error(slot->name, "Windows aio")) {
+
+        /* The error handler asked for a retry; ret_val is set by
+        the retry block below. */
+        retry = TRUE;
+    } else {
+
+        ret_val = FALSE;
+    }
+
+    os_mutex_exit(array->mutex);
+
+    if (retry) {
+        /* retry failed read/write operation synchronously.
+        No need to hold array->mutex. */
+
+#ifdef UNIV_PFS_IO
+        /* This read/write does not go through os_file_read
+        and os_file_write APIs, need to register with
+        performance schema explicitly here. */
+        struct PSI_file_locker* locker = NULL;
+        PSI_file_locker_state state;
+        register_pfs_file_io_begin(&state, locker, slot->file, slot->len,
+                                   (slot->type == OS_FILE_WRITE)
+                                   ? PSI_FILE_WRITE
+                                   : PSI_FILE_READ,
+                                   __FILE__, __LINE__);
+#endif
+
+        ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
+
+        switch (slot->type) {
+        case OS_FILE_WRITE:
+            ret = WriteFile(slot->file.m_file, slot->buf,
+                            (DWORD) slot->len, &len,
+                            &(slot->control));
+            break;
+        case OS_FILE_READ:
+            ret = ReadFile(slot->file.m_file, slot->buf,
+                           (DWORD) slot->len, &len,
+                           &(slot->control));
+            break;
+        default:
+            ut_error;
+        }
+
+#ifdef UNIV_PFS_IO
+        register_pfs_file_io_end(locker, len);
+#endif
+
+        if (!ret && GetLastError() == ERROR_IO_PENDING) {
+            /* aio was queued successfully!
+            We want a synchronous i/o operation on a
+            file where we also use async i/o: in Windows
+            we must use the same wait mechanism as for
+            async i/o */
+            ret = GetOverlappedResult(slot->file.m_file,
+                                      &(slot->control),
+                                      &len, TRUE);
+        }
+
+        ret_val = ret && len == slot->len;
+    }
+
+    /* This function frees the slot itself on both outcomes. */
+    os_aio_array_free_slot(array, slot);
+
+    return(ret_val);
+}
+#endif
+
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+This function is only used in Linux native asynchronous i/o. This is
+called from within the io-thread. If there are no completed IO requests
+in the slot array, the thread calls this function to collect more
+requests from the kernel.
+The io-thread waits on io_getevents(), which is a blocking call, with
+a timeout value. Unless the system is very heavily loaded, keeping the
+io-thread very busy, the io-thread will spend most of its time waiting
+in this function.
+The io-thread also exits in this function. It checks server status at
+each wakeup and that is why we use timed wait in io_getevents().
+The function returns in one of two ways: after at least one completion
+has been copied into its slot (n_bytes, ret and io_already_done are set
+under array->mutex), or when a wait produced no events and
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS. A return value of 0,
+-EAGAIN or -EINTR from io_getevents() restarts the wait; any other
+non-positive value is printed and trapped with ut_error. */
+static
+void
+os_aio_linux_collect(
+/*=================*/
+ os_aio_array_t* array, /*!< in/out: slot array. */
+ ulint segment, /*!< in: local segment no. */
+ ulint seg_size) /*!< in: segment size. */
+{
+ int i;
+ int ret;
+ ulint start_pos;
+ ulint end_pos;
+ struct timespec timeout;
+ struct io_event* events;
+ struct io_context* io_ctx;
+
+ /* sanity checks. */
+ ut_ad(array != NULL);
+ ut_ad(seg_size > 0);
+ ut_ad(segment < array->n_segments);
+
+ /* Which part of event array we are going to work on. */
+ events = &array->aio_events[segment * seg_size];
+
+ /* Which io_context we are going to use. */
+ io_ctx = array->aio_ctx[segment];
+
+ /* Starting point of the segment we will be working on. */
+ start_pos = segment * seg_size;
+
+ /* End point. */
+ end_pos = start_pos + seg_size;
+
+retry:
+
+ /* Initialize the events. The timeout value is arbitrary.
+ We probably need to experiment with it a little.
+ Both the event buffer and the timeout are set up again on
+ every pass through the retry label. */
+ memset(events, 0, sizeof(*events) * seg_size);
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
+
+ ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
+
+ if (ret > 0) {
+ for (i = 0; i < ret; i++) {
+ os_aio_slot_t* slot;
+ struct iocb* control;
+
+ control = (struct iocb*) events[i].obj;
+ ut_a(control != NULL);
+
+ slot = (os_aio_slot_t*) control->data;
+
+ /* Some sanity checks. */
+ ut_a(slot != NULL);
+ ut_a(slot->reserved);
+
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "io_getevents[%c]: slot[%p] ctx[%p]"
+ " seg[%lu]\n",
+ (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
+ slot, io_ctx, segment);
+#endif
+
+ /* We are not scribbling previous segment. */
+ ut_a(slot->pos >= start_pos);
+
+ /* We have not overstepped to next segment. */
+ ut_a(slot->pos < end_pos);
+
+ /* Mark this request as completed. The error handling
+ will be done in the calling function. The result fields
+ are written while holding array->mutex. */
+ os_mutex_enter(array->mutex);
+ slot->n_bytes = events[i].res;
+ slot->ret = events[i].res2;
+ slot->io_already_done = TRUE;
+ os_mutex_exit(array->mutex);
+ }
+ return;
+ }
+
+ if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ return; /* reached only when this wait collected no events */
+ }
+
+ /* This error handling is for any error in collecting the
+ IO requests. The errors, if any, for any particular IO
+ request are simply passed on to the calling routine. */
+
+ switch (ret) {
+ case -EAGAIN:
+ /* Not enough resources! Try again.
+ (Deliberately falls through to the retry below.) */
+ case -EINTR:
+ /* Interrupted! I have tested the behaviour in case of an
+ interrupt. If we have some completed IOs available then
+ the return code will be the number of IOs. We get EINTR only
+ if there are no completed IOs and we have been interrupted.
+ (Deliberately falls through to the retry below.) */
+ case 0:
+ /* No pending request! Go back and check again. */
+ goto retry;
+ }
+
+ /* All other errors should cause a trap for now. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
+ ret);
+ ut_error;
+}
+
+/**********************************************************************//**
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the IO was successful; TRUE is also returned, with
+*message1 and *message2 set to NULL and *type left unwritten, when no
+slot of the segment is reserved and srv_shutdown_state is
+SRV_SHUTDOWN_EXIT_THREADS */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+ ulint global_seg, /*!< in: segment number in the aio array
+ to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 is log i/o thread,
+ then follow the non-ibuf read threads,
+ and the last are the non-ibuf write
+ threads. */
+ fil_node_t**message1, /*!< out: first message passed with the
+ aio request */
+ void** message2, /*!< out: second message passed with the
+ aio request; note that in case the
+ aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation. */
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ ulint segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+ ibool ret = FALSE;
+
+ /* Should never be doing Sync IO here. */
+ ut_a(global_seg != ULINT_UNDEFINED);
+
+ /* Find the array and the local segment.
+ n is the number of slots in each segment. */
+ segment = os_aio_get_array_and_local_segment(&array, global_seg);
+ n = array->n_slots / array->n_segments;
+
+ /* Loop until we have found a completed request. */
+ for (;;) {
+ ibool any_reserved = FALSE;
+ os_mutex_enter(array->mutex);
+ for (i = 0; i < n; ++i) {
+ slot = os_aio_array_get_nth_slot(
+ array, i + segment * n);
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
+ /* Something for us to work on.
+ We leave the loop with array->mutex
+ still held. */
+ goto found;
+ } else {
+ any_reserved = TRUE;
+ }
+ }
+
+ os_mutex_exit(array->mutex);
+
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (UNIV_UNLIKELY
+ (!any_reserved
+ && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
+ /* Wait for some request. Note that we return
+ from wait iff we have found a request.
+ NOTE(review): os_aio_linux_collect() also returns when it
+ observes SRV_SHUTDOWN_EXIT_THREADS after an empty wait; the
+ loop then simply rescans the segment. */
+
+ srv_set_io_thread_op_info(global_seg,
+ "waiting for completed aio requests");
+ os_aio_linux_collect(array, segment, n);
+ }
+
+found:
+ /* Note that it may be that there are more than one completed
+ IO requests. We process them one at a time. We may have a case
+ here to improve the performance slightly by dealing with all
+ requests in one sweep.
+ array->mutex is held from here until the os_mutex_exit() below. */
+ srv_set_io_thread_op_info(global_seg,
+ "processing completed aio requests");
+
+ /* Ensure that we are scribbling only our segment. */
+ ut_a(i < n);
+
+ ut_ad(slot != NULL);
+ ut_ad(slot->reserved);
+ ut_ad(slot->io_already_done);
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+
+ *type = slot->type;
+
+ if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
+
+ ret = TRUE;
+ } else {
+ errno = -slot->ret;
+
+ /* os_file_handle_error does tell us if we should retry
+ this IO. As it stands now, we don't do this retry when
+ reaping requests from a different context than
+ the dispatcher. This non-retry logic is the same for
+ windows and linux native AIO.
+ We should probably look into this to transparently
+ re-submit the IO. */
+ os_file_handle_error(slot->name, "Linux aio");
+
+ ret = FALSE;
+ }
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, slot);
+
+ return(ret);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+The function first scans the thread's segment for a slot whose i/o is
+already done and returns that slot. Otherwise it picks a pending request
+(the oldest one if any request is at least 2 seconds old, else the one
+at the lowest offset), chains up to OS_AIO_MERGE_N_CONSECUTIVE requests
+that follow each other in the same file and have the same type, performs
+them as one synchronous read or write, marks all of them done and
+returns the first request of the chain. If the segment has no reserved
+slot it waits on the segment's event and starts over. The slot that is
+returned is freed before returning.
+@return TRUE if the aio operation succeeded; TRUE is also returned, with
+*message1 and *message2 set to NULL, when no slot of the segment is
+reserved and srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint global_segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ os_aio_array_t* array;
+ ulint segment;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ os_offset_t lowest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+ byte* combined_buf2;
+ ibool ret;
+ ibool any_reserved;
+ ulint n;
+ os_aio_slot_t* aio_slot;
+
+ /* Fix compiler warning */
+ *consecutive_ios = NULL;
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+restart:
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (a)");
+ ut_ad(os_aio_validate_skip());
+ ut_ad(segment < array->n_segments);
+
+ n = array->n_slots / array->n_segments;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ if (array == os_aio_read_array
+ && os_aio_recommend_sleep_for_read_threads) {
+
+ /* Give other threads chance to add several i/os to the array
+ at once. The mutex is not held on this path. */
+
+ goto recommended_sleep;
+ }
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (b)");
+
+ /* Check if there is a slot for which the i/o has already been
+ done */
+ any_reserved = FALSE;
+
+ os_mutex_enter(array->mutex);
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+ "InnoDB: i/o for slot %lu"
+ " already done, returning\n",
+ (ulong) i);
+ }
+
+ aio_slot = slot;
+ ret = TRUE; /* NOTE(review): the result of the i/o that completed this slot is not stored per slot; TRUE is reported unconditionally */
+ goto slot_io_done;
+ } else {
+ any_reserved = TRUE;
+ }
+ }
+
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_mutex_exit(array->mutex);
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
+ n_consecutive = 0;
+
+ /* If there are requests that are at least 2 seconds old, then pick
+ the oldest one to prevent starvation. If several requests have the
+ same age, then pick the one at the lowest offset. */
+
+ biggest_age = 0;
+ lowest_offset = IB_UINT64_MAX;
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (slot->reserved) {
+
+ age = (ulint) difftime(
+ ut_time(), slot->reservation_time);
+
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+ && slot->offset < lowest_offset)) {
+
+ /* Found an i/o request */
+ consecutive_ios[0] = slot;
+
+ n_consecutive = 1;
+
+ biggest_age = age;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (n_consecutive == 0) {
+ /* There were no old requests. Look for an i/o request at the
+ lowest offset in the array (we ignore the high 32 bits of the
+ offset in these heuristics)
+ NOTE(review): the comparison below uses slot->offset as is;
+ the remark about ignoring the high 32 bits looks stale -
+ confirm. */
+
+ lowest_offset = IB_UINT64_MAX;
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(
+ array, i + segment * n);
+
+ if (slot->reserved && slot->offset < lowest_offset) {
+
+ /* Found an i/o request */
+ consecutive_ios[0] = slot;
+
+ n_consecutive = 1;
+
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (n_consecutive == 0) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+ /* if n_consecutive != 0, then we have assigned
+ something valid to consecutive_ios[0] */
+ ut_ad(n_consecutive != 0);
+ ut_ad(consecutive_ios[0] != NULL);
+
+ aio_slot = consecutive_ios[0];
+
+ /* Check if there are several consecutive blocks to read or write.
+ Each match becomes the new tail of the chain (aio_slot) and the scan
+ restarts from the first slot of the segment, until no request
+ continues the chain or OS_AIO_MERGE_N_CONSECUTIVE are collected. */
+
+consecutive_loop:
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+ if (slot->reserved
+ && slot != aio_slot
+ && slot->offset == aio_slot->offset + aio_slot->len
+ && slot->type == aio_slot->type
+ && slot->file.m_file == aio_slot->file.m_file) {
+
+ /* Found a consecutive i/o request */
+
+ consecutive_ios[n_consecutive] = slot;
+ n_consecutive++;
+
+ aio_slot = slot;
+
+ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+
+ goto consecutive_loop;
+ } else {
+ break;
+ }
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
+
+ /* We have now collected n_consecutive i/o requests in the array;
+ allocate a single buffer which can hold all data, and perform the
+ i/o. The extra UNIV_PAGE_SIZE bytes requested from ut_malloc() leave
+ room for ut_align() to move the start of the buffer up to a
+ UNIV_PAGE_SIZE boundary. */
+
+ total_len = 0;
+ aio_slot = consecutive_ios[0];
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
+ }
+
+ if (n_consecutive == 1) {
+ /* We can use the buffer of the i/o request */
+ combined_buf = aio_slot->buf;
+ combined_buf2 = NULL;
+ } else {
+ combined_buf2 = static_cast<byte*>(
+ ut_malloc(total_len + UNIV_PAGE_SIZE));
+
+ ut_a(combined_buf2);
+
+ combined_buf = static_cast<byte*>(
+ ut_align(combined_buf2, UNIV_PAGE_SIZE));
+ }
+
+ /* We release the array mutex for the time of the i/o: NOTE that
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
+ os_mutex_exit(array->mutex);
+
+ if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+ /* Copy the buffers to the combined buffer */
+ offs = 0;
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
+ consecutive_ios[i]->len);
+
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "doing file i/o");
+
+ /* Do the i/o with ordinary, synchronous i/o functions: */
+ if (aio_slot->type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ ret = os_file_write(
+ aio_slot->name, aio_slot->file, combined_buf,
+ aio_slot->offset, total_len);
+
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ os_has_said_disk_full = FALSE;
+ ret = 0;
+ errno = 28;);
+
+ if (!ret) {
+ os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE);
+ }
+
+ } else {
+ ret = os_file_read(
+ aio_slot->file, combined_buf,
+ aio_slot->offset, total_len);
+ }
+
+ srv_set_io_thread_op_info(global_segment, "file i/o done");
+
+ if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
+ /* Copy the combined buffer to individual buffers */
+ offs = 0;
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
+ consecutive_ios[i]->len);
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ if (combined_buf2) {
+ ut_free(combined_buf2);
+ }
+
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+ consecutive_ios[i]->io_already_done = TRUE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+ several slots, the messages will be returned with subsequent calls
+ of this function.
+ array->mutex is held on both paths that reach slot_io_done: the jump
+ from the "already done" scan above and the fall-through from here. */
+
+slot_io_done:
+
+ ut_a(aio_slot->reserved);
+
+ *message1 = aio_slot->message1;
+ *message2 = aio_slot->message2;
+
+ *type = aio_slot->type;
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, aio_slot);
+
+ return(ret);
+
+wait_for_io:
+ srv_set_io_thread_op_info(global_segment, "resetting wait event");
+
+ /* We wait here until there again can be i/os in the segment
+ of this thread. The event is reset while array->mutex is still
+ held; the mutex is released before the wait itself. */
+
+ os_event_reset(os_aio_segment_wait_events[global_segment]);
+
+ os_mutex_exit(array->mutex);
+
+recommended_sleep:
+ srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
+
+ os_event_wait(os_aio_segment_wait_events[global_segment]);
+
+ goto restart;
+}
+
+/**********************************************************************//**
+Checks the invariants of one aio array while holding its mutex: the
+array has at least one slot and one segment, every reserved slot has a
+nonzero length, and array->n_reserved equals the number of reserved
+slots found. A violated invariant fails a ut_a() assertion.
+@return true if ok */
+static
+bool
+os_aio_array_validate(
+/*==================*/
+ os_aio_array_t* array) /*!< in: aio wait array */
+{
+ ulint reserved_seen = 0;
+
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+ ut_a(array->n_segments > 0);
+
+ for (ulint pos = 0; pos < array->n_slots; ++pos) {
+ os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, pos);
+
+ if (!slot->reserved) {
+ /* Free slots carry nothing to check. */
+ continue;
+ }
+
+ ++reserved_seen;
+ ut_a(slot->len > 0);
+ }
+
+ /* The cached counter must agree with what the scan found. */
+ ut_a(array->n_reserved == reserved_seen);
+
+ os_mutex_exit(array->mutex);
+
+ return(true);
+}
+
+/**********************************************************************//**
+Validates the consistency of the aio system. The read array is always
+validated; the write, ibuf, log and sync arrays are validated, in that
+order, only when they exist.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void)
+/*=================*/
+{
+ os_aio_array_validate(os_aio_read_array);
+
+ /* Arrays that may be absent, in the order they are checked. */
+ os_aio_array_t* const optional_arrays[] = {
+ os_aio_write_array,
+ os_aio_ibuf_array,
+ os_aio_log_array,
+ os_aio_sync_array
+ };
+ const ulint n_optional = sizeof(optional_arrays)
+ / sizeof(optional_arrays[0]);
+
+ for (ulint k = 0; k < n_optional; ++k) {
+ if (optional_arrays[k] != 0) {
+ os_aio_array_validate(optional_arrays[k]);
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Prints pending IO requests per segment of an aio array.
+We probably don't need per segment statistics but they can help us
+during development phase to see if the IO requests are being
+distributed as expected.
+Nothing is printed for an array that has a single segment; otherwise the
+counts are written as " [c0, c1, ...] ", one entry per segment. */
+static
+void
+os_aio_print_segment_info(
+/*======================*/
+ FILE* file, /*!< in: file where to print */
+ ulint* n_seg, /*!< in: pending IO array; read at
+ indexes 0 .. array->n_segments - 1 */
+ os_aio_array_t* array) /*!< in: array to process */
+{
+ ulint i;
+
+ ut_ad(array);
+ ut_ad(n_seg);
+ ut_ad(array->n_segments > 0);
+
+ if (array->n_segments == 1) {
+ return;
+ }
+
+ fprintf(file, " [");
+ for (i = 0; i < array->n_segments; i++) {
+ if (i != 0) {
+ fprintf(file, ", ");
+ }
+
+ /* "%lu" consumes an unsigned long, so convert the ulint
+ explicitly, as the other "%lu" printouts in this file do. */
+ fprintf(file, "%lu", (ulong) n_seg[i]);
+ }
+ fprintf(file, "] ");
+}
+
+/**********************************************************************//**
+Prints the number of reserved slots of an aio array, followed by the
+per-segment breakdown produced by os_aio_print_segment_info(). The whole
+scan and the printing are done while holding array->mutex. */
+UNIV_INTERN
+void
+os_aio_print_array(
+/*==============*/
+ FILE* file, /*!< in: file where to print */
+ os_aio_array_t* array) /*!< in: aio array to print */
+{
+ ulint per_segment[SRV_MAX_N_IO_THREADS];
+ ulint total = 0;
+
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+ ut_a(array->n_segments > 0);
+
+ memset(per_segment, 0x0, sizeof(per_segment));
+
+ for (ulint pos = 0; pos < array->n_slots; ++pos) {
+ os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, pos);
+
+ if (!slot->reserved) {
+ continue;
+ }
+
+ /* Index of the segment that the slot position falls into. */
+ const ulint seg_no = (pos * array->n_segments)
+ / array->n_slots;
+
+ ++total;
+ ++per_segment[seg_no];
+
+ ut_a(slot->len > 0);
+ }
+
+ /* The cached counter must agree with what the scan found. */
+ ut_a(array->n_reserved == total);
+
+ fprintf(file, " %lu", (ulong) total);
+
+ os_aio_print_segment_info(file, per_segment, array);
+
+ os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Prints info of the aio arrays.
+The output consists of one state line per i/o thread, the number of
+pending requests of the read array and of every other aio array that
+exists, the pending flush counts and cumulative read/write/fsync
+counters, the pending read and write counts when either is nonzero, and
+the per-second rates since the previous printout. Before returning, the
+*_old counter snapshots, os_bytes_read_since_printout and
+os_last_printout are reset so that the next call measures a new
+interval. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+ FILE* file) /*!< in: file where to print */
+{
+ time_t current_time;
+ double time_elapsed;
+ double avg_bytes_read;
+
+ for (ulint i = 0; i < srv_n_file_io_threads; ++i) { /* one state line per i/o thread */
+ fprintf(file, "I/O thread %lu state: %s (%s)",
+ (ulong) i,
+ srv_io_thread_op_info[i],
+ srv_io_thread_function[i]);
+
+#ifndef _WIN32
+ if (!srv_use_native_aio
+ && os_aio_segment_wait_events[i]->is_set) { /* the thread's wait event is currently set */
+ fprintf(file, " ev set");
+ }
+#endif /* _WIN32 */
+
+ fprintf(file, "\n");
+ }
+
+ fputs("Pending normal aio reads:", file);
+
+ os_aio_print_array(file, os_aio_read_array); /* the read array is printed unconditionally */
+
+ if (os_aio_write_array != 0) { /* the remaining arrays are printed only if they exist */
+ fputs(", aio writes:", file);
+ os_aio_print_array(file, os_aio_write_array);
+ }
+
+ if (os_aio_ibuf_array != 0) {
+ fputs(",\n ibuf aio reads:", file);
+ os_aio_print_array(file, os_aio_ibuf_array);
+ }
+
+ if (os_aio_log_array != 0) {
+ fputs(", log i/o's:", file);
+ os_aio_print_array(file, os_aio_log_array);
+ }
+
+ if (os_aio_sync_array != 0) {
+ fputs(", sync i/o's:", file);
+ os_aio_print_array(file, os_aio_sync_array);
+ }
+
+ putc('\n', file);
+ current_time = ut_time();
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout); /* the added 0.001 keeps the divisor below nonzero when no time has passed */
+
+ fprintf(file,
+ "Pending flushes (fsync) log: " ULINTPF
+ "; buffer pool: " ULINTPF "\n"
+ ULINTPF " OS file reads, "
+ ULINTPF " OS file writes, "
+ ULINTPF " OS fsyncs\n",
+ fil_n_pending_log_flushes,
+ fil_n_pending_tablespace_flushes,
+ os_n_file_reads,
+ os_n_file_writes,
+ os_n_fsyncs);
+
+ const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
+ const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
+
+ if (n_reads != 0 || n_writes != 0) {
+ fprintf(file,
+ ULINTPF " pending reads, " ULINTPF " pending writes\n",
+ n_reads, n_writes);
+ }
+
+ if (os_n_file_reads == os_n_file_reads_old) { /* no reads in this interval: skip the division */
+ avg_bytes_read = 0.0;
+ } else {
+ avg_bytes_read = (double) os_bytes_read_since_printout
+ / (os_n_file_reads - os_n_file_reads_old);
+ }
+
+ fprintf(file,
+ "%.2f reads/s, %lu avg bytes/read,"
+ " %.2f writes/s, %.2f fsyncs/s\n",
+ (os_n_file_reads - os_n_file_reads_old)
+ / time_elapsed,
+ (ulong) avg_bytes_read,
+ (os_n_file_writes - os_n_file_writes_old)
+ / time_elapsed,
+ (os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
+
+ os_n_file_reads_old = os_n_file_reads; /* from here on: start a new measurement interval */
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = current_time;
+}
+
+/**********************************************************************//**
+Starts a new measurement interval for the per-second averages: clears
+the count of bytes read since the last printout, takes a snapshot of the
+cumulative read, write and fsync counters, and records the current time
+as the time of the last printout. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void)
+/*======================*/
+{
+ /* Nothing has been read yet in the interval that starts now. */
+ os_bytes_read_since_printout = 0;
+
+ /* Snapshots of the cumulative counters at the interval start. */
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_file_reads_old = os_n_file_reads;
+
+ os_last_printout = time(NULL);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations.
+The read and sync arrays are always inspected. Unless srv_read_only_mode
+is set, the write and ibuf arrays are inspected as well and are required
+to exist. The log array is inspected when it exists. Each array's
+n_reserved counter is read while holding that array's mutex.
+@return TRUE if all free */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void)
+/*=======================*/
+{
+ os_aio_array_t* array;
+ ulint n_res = 0;
+
+ array = os_aio_read_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ if (!srv_read_only_mode) {
+ /* The array is dereferenced right below, so the assertion
+ has to require a non-NULL pointer. Asserting "== 0" here
+ could only pass for a pointer that would then crash the
+ dereference. */
+ ut_a(os_aio_write_array != 0);
+
+ array = os_aio_write_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ ut_a(os_aio_ibuf_array != 0);
+
+ array = os_aio_ibuf_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+ }
+
+ /* This check is not tied to srv_read_only_mode, so do not assert
+ on the log array: count its reservations only when the array
+ exists. */
+ if (os_aio_log_array != 0) {
+ array = os_aio_log_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+ }
+
+ array = os_aio_sync_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ if (n_res == 0) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --cc storage/xtradb/handler/ha_innodb.cc
index d3e3109951b,65c5ce69713..440e14e1989
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@@ -1,10 -1,8 +1,10 @@@
/*****************************************************************************
-Copyright (c) 2000, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2012, Facebook Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2018, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@@ -1131,3238 -972,216 +1131,3242 @@@ static SHOW_VAR innodb_status_variables
{NullS, NullS, SHOW_LONG}
};
-/* General functions */
-
-/******************************************************************//**
-Returns true if the thread is the replication thread on the slave
-server. Used in srv_conc_enter_innodb() to determine if the thread
-should be allowed to enter InnoDB - the replication thread is treated
-differently than other threads. Also used in
-srv_conc_force_exit_innodb().
-@return true if thd is the replication thread */
-extern "C" UNIV_INTERN
-ibool
-thd_is_replication_slave_thread(
-/*============================*/
- const void* thd) /*!< in: thread handle (THD*) */
-{
- return((ibool) thd_slave_thread((THD*) thd));
-}
+/************************************************************************//**
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+ const char* table_name); /*!< in: table to lookup */
-/******************************************************************//**
-Save some CPU by testing the value of srv_thread_concurrency in inline
-functions. */
-static inline
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
void
-innodb_srv_conc_enter_innodb(
-/*=========================*/
- trx_t* trx) /*!< in: transaction handle */
-{
- if (UNIV_LIKELY(!srv_thread_concurrency)) {
+free_share(
+/*=======*/
+ INNOBASE_SHARE* share); /*!< in/own: share to free */
- return;
- }
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+ handlerton* hton, /*!< in/out: Innodb handlerton */
+ THD* thd); /*!< in: MySQL thread handle for
+ which to close the connection */
- srv_conc_enter_innodb(trx);
-}
+static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
+static void innobase_checkpoint_request(handlerton *hton, void *cookie);
-/******************************************************************//**
-Save some CPU by testing the value of srv_thread_concurrency in inline
-functions. */
-static inline
+/*****************************************************************//**
+Cancel any pending lock request associated with the current THD. */
+static
void
-innodb_srv_conc_exit_innodb(
-/*========================*/
- trx_t* trx) /*!< in: transaction handle */
-{
- if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) {
-
- return;
- }
+innobase_kill_connection(
+/*======================*/
+ handlerton* hton, /*!< in: innobase handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread being killed */
+ thd_kill_levels);
- srv_conc_exit_innodb(trx);
-}
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton* hton, /*!< in/out: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ bool commit_trx); /*!< in: true - commit transaction
+ false - the current SQL statement
+ ended */
-/******************************************************************//**
-Force a thread to leave InnoDB even if it has spare tickets. */
-static inline
-void
-innodb_srv_conc_force_exit_innodb(
-/*==============================*/
- trx_t* trx) /*!< in: transaction handle */
-{
-#ifdef UNIV_SYNC_DEBUG
- ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
-#endif /* UNIV_SYNC_DEBUG */
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton* hton, /*!< in/out: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back */
+ bool rollback_trx); /*!< in: TRUE - rollback entire
+ transaction FALSE - rollback the current
+ statement only */
- if (trx->declared_to_be_inside_innodb) {
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be rolled back to savepoint */
+ void* savepoint); /*!< in: savepoint data */
- srv_conc_force_exit_innodb(trx);
- }
-}
+/*****************************************************************//**
+Check whether innodb state allows to safely release MDL locks after
+rollback to savepoint.
+@return true if it is safe, false if its not safe. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd); /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be rolled back to savepoint */
-/******************************************************************//**
-Returns true if the transaction this thread is processing has edited
-non-transactional tables. Used by the deadlock detector when deciding
-which transaction to rollback in case of a deadlock - we try to avoid
-rolling back transactions that have edited non-transactional tables.
-@return true if non-transactional tables have been edited */
-extern "C" UNIV_INTERN
-ibool
-thd_has_edited_nontrans_tables(
-/*===========================*/
- void* thd) /*!< in: thread handle (THD*) */
-{
- return((ibool) thd_non_transactional_update((THD*) thd));
-}
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user's XA transaction for which
+ we need to take a savepoint */
+ void* savepoint); /*!< in: savepoint data */
-/******************************************************************//**
-Returns true if the thread is executing a SELECT statement.
-@return true if thd is executing SELECT */
-extern "C" UNIV_INTERN
-ibool
-thd_is_select(
-/*==========*/
- const void* thd) /*!< in: thread handle (THD*) */
-{
- return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT);
-}
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in/out: handlerton for Innodb */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction's
+ savepoint should be released */
+ void* savepoint); /*!< in: savepoint data */
-/******************************************************************//**
-Returns true if the thread supports XA,
-global value of innodb_supports_xa if thd is NULL.
-@return true if thd has XA support */
-extern "C" UNIV_INTERN
-ibool
-thd_supports_xa(
-/*============*/
- void* thd) /*!< in: thread handle (THD*), or NULL to query
- the global innodb_supports_xa */
-{
- return(THDVAR((THD*) thd, support_xa));
-}
+/************************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+ handlerton* hton, /*!< in/out: handlerton for Innodb */
+ TABLE_SHARE* table,
+ MEM_ROOT* mem_root);
-/******************************************************************//**
-Check the status of fake changes mode (innodb_fake_changes)
-@return true if fake change mode is enabled. */
-extern "C" UNIV_INTERN
-ibool
-thd_fake_changes(
-/*=============*/
- void* thd) /*!< in: thread handle, or NULL to query
- the global innodb_supports_xa */
-{
- return(THDVAR((THD*) thd, fake_changes));
-}
+/** @brief Initialize the default value of innodb_commit_concurrency.
-/******************************************************************//**
-Returns the lock wait timeout for the current connection.
-@return the lock wait timeout, in seconds */
-extern "C" UNIV_INTERN
-ulong
-thd_lock_wait_timeout(
-/*==================*/
- void* thd) /*!< in: thread handle (THD*), or NULL to query
- the global innodb_lock_wait_timeout */
-{
- /* According to <mysql/plugin.h>, passing thd == NULL
- returns the global value of the session variable. */
- return(THDVAR((THD*) thd, lock_wait_timeout));
-}
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
-/******************************************************************//**
-Set the time waited for the lock for the current query. */
-extern "C" UNIV_INTERN
-void
-thd_set_lock_wait_time(
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default();
+/*=======================================*/
+
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max();
+/*==================================*/
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+ const char* format_name); /*!< in: pointer to file format
+ name */
+/************************************************************//**
+Validate the file format check config parameters, as a side effect it
+sets the srv_max_file_format_at_startup variable.
+@return the format_id if valid config value, otherwise, return -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*==================================*/
+ const char* format_max); /*!< in: parameter value */
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool all); /*!< in: true - prepare transaction
+ false - the current SQL statement
+ ended */
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list, /*!< in/out: prepared transactions */
+ uint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_commit_by_xid(
/*===================*/
- void* thd, /*!< in: thread handle (THD*) */
- ulint value) /*!< in: time waited for the lock */
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid); /*!< in: X/Open XA transaction
+ identification */
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid); /*!< in: X/Open XA transaction
+ identification */
+/*******************************************************************//**
+Create a consistent view for a cursor based on current transaction
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+ handlerton* hton, /*!< in: innobase hton */
+ THD* thd); /*!< in: user thread handle */
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL global read view of a transaction is
+restored to a transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ THD* thd, /*!< in: user thread handle */
+ void* curview); /*!< in: Consistent cursor view to
+ be set */
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore
+global read view to a transaction read view. Transaction is created if the
+corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ THD* thd, /*!< in: user thread handle */
+ void* curview); /*!< in: Consistent read view to be
+ closed */
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ char* path); /*!< in: database path; inside InnoDB
+ the name of the last directory in
+ the path is used as the database name:
+ for example, in 'mysql/data/test' the
+ database name is 'test' */
+/** Shut down the InnoDB storage engine.
+@return 0 */
+static
+int
+innobase_end(handlerton*, ha_panic_function);
+
+#if NOT_USED
+/*****************************************************************//**
+Stores the current binlog coordinates in the trx system header. */
+static
+int
+innobase_store_binlog_info(
+/*=======================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd); /*!< in: MySQL thread handle */
+#endif
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton* hton, /* in: Innodb handlerton */
+ THD* thd); /* in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+#ifdef NOT_USED
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+clones snapshot for a consistent read from another session, if it has one.
+@return 0 */
+static
+int
+innobase_start_trx_and_clone_read_view(
+/*====================================*/
+ handlerton* hton, /* in: Innodb handlerton */
+ THD* thd, /* in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ THD* from_thd); /* in: MySQL thread handle of the
+ user session from which the consistent
+ read should be cloned */
+#endif
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton); /*!< in: InnoDB handlerton */
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
+static
+int
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of
+ the caller */
+ stat_print_fn* stat_print);
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
+static
+bool
+innobase_show_status(
+/*=================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of
+ the caller */
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx); /*!< in: transaction handle */
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+User can enable monitor counters/groups by specifying
+"loose-innodb_monitor_enable = monitor_name1;monitor_name2..."
+in server configuration file or at the command line. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+ char* str); /*!< in: monitor counter enable list */
+
+/*********************************************************************
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. An example:
+test/mytable. On Windows normalization puts both the database name and the
+table name always to lower case if "set_lower_case" is set to TRUE. */
+void
+normalize_table_name_low(
+/*=====================*/
+ char* norm_name, /* out: normalized name as a
+ null-terminated string */
+ const char* name, /* in: table name string */
+ ibool set_lower_case); /* in: TRUE if we want to set
+ name to lower case */
+
+#ifdef NOT_USED
+/*************************************************************//**
+Removes old archived transaction log files.
+@return true on error */
+static bool innobase_purge_archive_logs(
+ handlerton *hton, /*!< in: InnoDB handlerton */
+ time_t before_date, /*!< in: all files modified
+ before timestamp should be removed */
+ const char* to_filename) /*!< in: this and earlier files
+ should be removed */
+{
+ ulint err= DB_ERROR;
+ if (before_date > 0) {
+ err= purge_archived_logs(before_date, 0);
+ } else if (to_filename) {
+ if (is_prefix(to_filename, IB_ARCHIVED_LOGS_PREFIX)) {
+ unsigned long long log_file_lsn = strtoll(to_filename
+ + IB_ARCHIVED_LOGS_PREFIX_LEN,
+ NULL, 10);
+ if (log_file_lsn > 0 && log_file_lsn < ULLONG_MAX) {
+ err= purge_archived_logs(0, log_file_lsn);
+ }
+ }
+ }
+ return (err != DB_SUCCESS);
+}
+#endif
+
+
+/*************************************************************//**
+Check for a valid value of innobase_commit_concurrency.
+@return 0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ long long intbuf;
+ ulong commit_concurrency;
+
+ DBUG_ENTER("innobase_commit_concurrency_validate");
+
+ if (value->val_int(value, &intbuf)) {
+ /* The value is NULL. That is invalid. */
+ DBUG_RETURN(1);
+ }
+
+ *reinterpret_cast<ulong*>(save) = commit_concurrency
+ = static_cast<ulong>(intbuf);
+
+ /* Allow the value to be updated, as long as it remains zero
+ or nonzero. */
+ DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
+}
+
+/*******************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ TABLE_SHARE* table,
+ MEM_ROOT* mem_root)
+{
+ return(new (mem_root) ha_innobase(hton, table));
+}
+
+/* General functions */
+
+/*************************************************************//**
+Check that a page_size is correct for InnoDB. If correct, set the
+associated page_size_shift which is the power of 2 for this page size.
+@return an associated page_size_shift if valid, 0 if invalid. */
+inline
+int
+innodb_page_size_validate(
+/*======================*/
+ ulong page_size) /*!< in: Page Size to evaluate */
+{
+ ulong n;
+
+ DBUG_ENTER("innodb_page_size_validate");
+
+ for (n = UNIV_PAGE_SIZE_SHIFT_MIN;
+ n <= UNIV_PAGE_SIZE_SHIFT_MAX;
+ n++) {
+ if (page_size == (ulong) (1 << n)) {
+ DBUG_RETURN(n);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return((ibool) thd_slave_thread(thd));
+}
+
+/******************************************************************//**
+Gets information on the durability property requested by thread.
+Used when writing a prepare or commit record to the log buffer.
+@return the durability property */
+UNIV_INTERN
+enum durability_properties
+thd_requested_durability(
+/*=====================*/
+ const THD* thd) /*!< in: thread handle */
+{
+ return(thd_get_durability_property(thd));
+}
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return(thd != 0 && thd_tx_is_read_only(thd));
+}
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+ THD* thd) /*!< in: thread handle, can be NULL */
+{
+ return(thd != NULL
+ && !thd_test_options(
+ thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+ && thd_is_select(thd));
+}
+
+/******************************************************************//**
+Save some CPU by testing the value of srv_thread_concurrency in inline
+functions. */
+static inline
+void
+innobase_srv_conc_enter_innodb(
+/*===========================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return;
+#endif /* WITH_WSREP */
+ if (srv_thread_concurrency) {
+ if (trx->n_tickets_to_enter_innodb > 0) {
+
+ /* If trx has 'free tickets' to enter the engine left,
+ then use one such ticket */
+
+ --trx->n_tickets_to_enter_innodb;
+
+ } else if (trx->mysql_thd != NULL
+ && thd_is_replication_slave_thread(trx->mysql_thd)) {
+
+ UT_WAIT_FOR(
+ srv_conc_get_active_threads()
+ < srv_thread_concurrency,
+ srv_replication_delay * 1000);
+
+ } else {
+ srv_conc_enter_innodb(trx);
+ }
+ }
+}
+
+/******************************************************************//**
+Note that the thread wants to leave InnoDB only if it doesn't have
+any spare tickets. */
+static inline
+void
+innobase_srv_conc_exit_innodb(
+/*==========================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef WITH_WSREP
+ if (wsrep_on(trx->mysql_thd) &&
+ wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return;
+#endif /* WITH_WSREP */
+
+ /* This is to avoid making an unnecessary function call. */
+ if (trx->declared_to_be_inside_innodb
+ && trx->n_tickets_to_enter_innodb == 0) {
+
+ srv_conc_force_exit_innodb(trx);
+ }
+}
+
+/******************************************************************//**
+Force a thread to leave InnoDB even if it has spare tickets. */
+static inline
+void
+innobase_srv_conc_force_exit_innodb(
+/*================================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* This is to avoid making an unnecessary function call. */
+ if (trx->declared_to_be_inside_innodb) {
+ srv_conc_force_exit_innodb(trx);
+ }
+}
+
+/******************************************************************//**
+Returns the NUL terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+UNIV_INTERN
+const char*
+server_get_hostname()
+/*=================*/
+{
+ return(glob_hostname);
+}
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return((ibool) thd_non_transactional_update(thd));
+}
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+UNIV_INTERN
+ibool
+thd_is_select(
+/*==========*/
+ const THD* thd) /*!< in: thread handle */
+{
+ return(thd_sql_command(thd) == SQLCOM_SELECT);
+}
+
+/******************************************************************//**
+Returns true if the thread supports XA,
+global value of innodb_supports_xa if thd is NULL.
+@return true if thd has XA support */
+UNIV_INTERN
+ibool
+thd_supports_xa(
+/*============*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_supports_xa */
+{
+ return(THDVAR(thd, support_xa));
+}
+
+/** Get the value of innodb_tmpdir.
+@param[in] thd thread handle, or NULL to query
+ the global innodb_tmpdir.
+@retval NULL if innodb_tmpdir="" */
+UNIV_INTERN
+const char*
+thd_innodb_tmpdir(
+ THD* thd)
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(false));
+#endif /* UNIV_SYNC_DEBUG */
+
+ const char* tmp_dir = THDVAR(thd, tmpdir);
+ if (tmp_dir != NULL && *tmp_dir == '\0') {
+ tmp_dir = NULL;
+ }
+
+ return(tmp_dir);
+}
+/******************************************************************//**
+Check the status of fake changes mode (innodb_fake_changes)
+@return true if fake change mode is enabled. */
+UNIV_INTERN
+ibool
+thd_fake_changes(
+/*=============*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_fake_changes */
+{
+ return(THDVAR((THD*) thd, fake_changes));
+}
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+{
+ /* According to <mysql/plugin.h>, passing thd == NULL
+ returns the global value of the session variable. */
+ return(THDVAR(thd, lock_wait_timeout));
+}
+
+/******************************************************************//**
+Set the time waited for the lock for the current query. */
+UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+ THD* thd, /*!< in/out: thread handle */
+ ulint value) /*!< in: time waited for the lock */
+{
+ if (thd) {
+ thd_storage_lock_wait(thd, value);
+ }
+}
+
+/******************************************************************//**
+Returns the flush_log_at_trx_commit variable value for the given thread. */
+UNIV_INTERN
+ulong
+thd_flush_log_at_trx_commit(
+/*================================*/
+ void* thd)
+{
+ return(THDVAR((THD*) thd, flush_log_at_trx_commit));
+}
+
+/********************************************************************//**
+Obtain the InnoDB transaction of a MySQL thread.
+@return reference to transaction pointer */
+MY_ATTRIBUTE((warn_unused_result, nonnull))
+static inline
+trx_t*&
+thd_to_trx(
+/*=======*/
+ THD* thd) /*!< in: MySQL thread */
+{
+ return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
+}
+
+#ifdef WITH_WSREP
+ulonglong
+thd_to_trx_id(
+/*=======*/
+ THD* thd) /*!< in: MySQL thread */
+{
+ return(thd_to_trx(thd)->id);
+}
+#endif /* WITH_WSREP */
+
+my_bool
+ha_innobase::is_fake_change_enabled(THD* thd)
+{
+ trx_t* trx = thd_to_trx(thd);
+ return(trx && UNIV_UNLIKELY(trx->fake_changes));
+}
+
+/********************************************************************//**
+In XtraDB it is impossible for a transaction to own a search latch outside of
+InnoDB code, so there is nothing to release on demand. We keep this function to
+simplify maintenance.
+@return 0 */
+static
+int
+innobase_release_temporary_latches(
+/*===============================*/
+ handlerton* hton MY_ATTRIBUTE((unused)), /*!< in: handlerton */
+ THD* thd MY_ATTRIBUTE((unused))) /*!< in: MySQL thread */
+{
+#ifdef UNIV_DEBUG
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!innodb_inited || thd == NULL) {
+
+ return(0);
+ }
+
+ trx_t* trx = thd_to_trx(thd);
+
+ if (trx != NULL) {
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!btr_search_own_any());
+#endif
+ trx_search_latch_release_if_reserved(trx);
+ }
+#endif
+
+ return(0);
+}
+
+/********************************************************************//**
+Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
+time calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+static inline
+void
+innobase_active_small(void)
+/*=======================*/
+{
+ innobase_active_counter++;
+
+ if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+ srv_active_wake_master_thread();
+ }
+}
+
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return MySQL error code */
+static
+int
+convert_error_code_to_mysql(
+/*========================*/
+ dberr_t error, /*!< in: InnoDB error code */
+ ulint flags, /*!< in: InnoDB table flags, or 0 */
+ THD* thd) /*!< in: user thread handle or NULL */
+{
+ switch (error) {
+ case DB_SUCCESS:
+ return(0);
+
+ case DB_INTERRUPTED:
+ return(HA_ERR_ABORTED_BY_USER);
+
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ ut_ad(thd);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_ROW_IS_REFERENCED,
+ "InnoDB: Cannot delete/update "
+ "rows with cascading foreign key "
+ "constraints that exceed max "
+ "depth of %d. Please "
+ "drop extra constraints and try "
+ "again", DICT_FK_MAX_RECURSIVE_LOAD);
+
+ /* fall through */
+
+ case DB_ERROR:
+ default:
+ return(-1); /* unspecified error */
+
+ case DB_DUPLICATE_KEY:
+ /* Be cautious with returning this error, since
+ mysql could re-enter the storage layer to get
+ duplicated key info, the operation requires a
+ valid table handle and/or transaction information,
+ which might not always be available in the error
+ handling stage. */
+ return(HA_ERR_FOUND_DUPP_KEY);
+
+ case DB_READ_ONLY:
+ return(HA_ERR_TABLE_READONLY);
+
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+
+ case DB_MISSING_HISTORY:
+ return(HA_ERR_TABLE_DEF_CHANGED);
+
+ case DB_RECORD_NOT_FOUND:
+ return(HA_ERR_NO_ACTIVE_RECORD);
+
+ case DB_SEARCH_ABORTED_BY_USER:
+ return(HA_ERR_ABORTED_BY_USER);
+
+ case DB_DEADLOCK:
+ /* Since we rolled back the whole transaction, we must
+ tell it also to MySQL so that MySQL knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_DEADLOCK);
+
+ case DB_LOCK_WAIT_TIMEOUT:
+ /* Starting from 5.0.13, we let MySQL just roll back the
+ latest SQL statement in a lock wait timeout. Previously, we
+ rolled back the whole transaction. */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(
+ thd, (bool) row_rollback_on_timeout);
+ }
+
+ return(HA_ERR_LOCK_WAIT_TIMEOUT);
+
+ case DB_NO_REFERENCED_ROW:
+ return(HA_ERR_NO_REFERENCED_ROW);
+
+ case DB_ROW_IS_REFERENCED:
+ return(HA_ERR_ROW_IS_REFERENCED);
+
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_CHILD_NO_INDEX:
+ case DB_PARENT_NO_INDEX:
+ return(HA_ERR_CANNOT_ADD_FOREIGN);
+
+ case DB_CANNOT_DROP_CONSTRAINT:
+
+ return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
+ misleading, a new MySQL error
+ code should be introduced */
+
+ case DB_CORRUPTION:
+ return(HA_ERR_CRASHED);
+
+ case DB_OUT_OF_FILE_SPACE:
+ return(HA_ERR_RECORD_FILE_FULL);
+
+ case DB_TEMP_FILE_WRITE_FAILURE:
+ my_error(ER_GET_ERRMSG, MYF(0),
+ DB_TEMP_FILE_WRITE_FAILURE,
+ ut_strerr(DB_TEMP_FILE_WRITE_FAILURE),
+ "InnoDB");
+ return(HA_ERR_INTERNAL_ERROR);
+
+ case DB_TABLE_IN_FK_CHECK:
+ return(HA_ERR_TABLE_IN_FK_CHECK);
+
+ case DB_TABLE_IS_BEING_USED:
+ return(HA_ERR_WRONG_COMMAND);
+
+ case DB_TABLESPACE_DELETED:
+ case DB_TABLE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TABLESPACE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TOO_BIG_RECORD: {
+ /* If prefix is true then a 768-byte prefix is stored
+ locally for BLOB fields. Refer to dict_table_get_format() */
+ bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A);
+ my_printf_error(ER_TOO_BIG_ROWSIZE,
+ "Row size too large (> %lu). Changing some columns "
+ "to TEXT or BLOB %smay help. In current row "
+ "format, BLOB prefix of %d bytes is stored inline.",
+ MYF(0),
+ page_get_free_space_of_empty(flags &
+ DICT_TF_COMPACT) / 2,
+ prefix ? "or using ROW_FORMAT=DYNAMIC "
+ "or ROW_FORMAT=COMPRESSED ": "",
+ prefix ? DICT_MAX_FIXED_COL_LEN : 0);
+ return(HA_ERR_TO_BIG_ROW);
+ }
+
+
+ case DB_TOO_BIG_FOR_REDO:
+ my_printf_error(ER_TOO_BIG_ROWSIZE, "%s" , MYF(0),
+ "The size of BLOB/TEXT data inserted"
+ " in one transaction is greater than"
+ " 10% of redo log size. Increase the"
+ " redo log size using innodb_log_file_size.");
+ return(HA_ERR_TO_BIG_ROW);
+
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ return(HA_ERR_INDEX_COL_TOO_LONG);
+
+ case DB_NO_SAVEPOINT:
+ return(HA_ERR_NO_SAVEPOINT);
+
+ case DB_LOCK_TABLE_FULL:
+ /* Since we rolled back the whole transaction, we must
+ tell it also to MySQL so that MySQL knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_TABLE_FULL);
+
+ case DB_FTS_INVALID_DOCID:
+ return(HA_FTS_INVALID_DOCID);
+ case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
+ case DB_UNSUPPORTED:
+ return(HA_ERR_UNSUPPORTED);
+ case DB_INDEX_CORRUPT:
+ return(HA_ERR_INDEX_CORRUPT);
+ case DB_UNDO_RECORD_TOO_BIG:
+ return(HA_ERR_UNDO_REC_TOO_BIG);
+ case DB_OUT_OF_MEMORY:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TABLESPACE_EXISTS:
+ return(HA_ERR_TABLESPACE_EXISTS);
+ case DB_IDENTIFIER_TOO_LONG:
+ return(HA_ERR_INTERNAL_ERROR);
+ case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
+ return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE);
+ }
+}
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: MySQL THD object */
+ uint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ char buffer[1024];
+
+ fputs(thd_get_error_context_description((THD*) thd,
+ buffer, sizeof buffer,
+ max_query_len), f);
+ putc('\n', f);
+}
+
+/******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+ int error_code) /*!< in: MySQL error code */
+{
+ return(my_get_err_msg(error_code));
+}
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */
+{
+ CHARSET_INFO* cs;
+ ut_ad(cset <= MAX_CHAR_COLL_NUM);
+ ut_ad(mbminlen);
+ ut_ad(mbmaxlen);
+
+ cs = all_charsets[cset];
+ if (cs) {
+ *mbminlen = cs->mbminlen;
+ *mbmaxlen = cs->mbmaxlen;
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ THD* thd = current_thd;
+
+ if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+
+ /* Fix bug#46256: allow tables to be dropped if the
+ collation is not found, but issue a warning. */
+ if ((global_system_variables.log_warnings)
+ && (cset != 0)){
+
+ sql_print_warning(
+ "Unknown collation #%lu.", cset);
+ }
+ } else {
+
+ ut_a(cset == 0);
+ }
+
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, strlen(from), &my_charset_filename, to, (uint) len, &errors);
+}
+
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+@return true when the length of the identifier is too long */
+UNIV_INTERN
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id) /* in: FK identifier to check excluding the
+ database portion. */
+{
+ int well_formed_error = 0;
+ CHARSET_INFO *cs = system_charset_info;
+ DBUG_ENTER("innobase_check_identifier_length");
+
+ size_t len = cs->cset->well_formed_len(
+ cs, id, id + strlen(id),
+ NAME_CHAR_LEN, &well_formed_error);
+
+ if (well_formed_error || len == NAME_CHAR_LEN) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0), id);
+ DBUG_RETURN(true);
+ }
+ DBUG_RETURN(false);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, strlen(from), system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. A NULL
+pointer sorts before any non-NULL string, and two NULLs compare equal.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+	const char*	a,	/*!< in: first string to compare */
+	const char*	b)	/*!< in: second string to compare */
+{
+	/* Handle the NULL cases up front so that my_strcasecmp() only
+	ever sees two valid strings. */
+	if (a == NULL) {
+		return(b == NULL ? 0 : -1);
+	}
+
+	if (b == NULL) {
+		return(1);
+	}
+
+	return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Case-insensitive match of a NUL-terminated UTF-8 string against a
+NUL-terminated UTF-8 pattern that may contain wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+	const char*	a,	/*!< in: string to compare */
+	const char*	b)	/*!< in: wildcard string to compare */
+{
+	const int	cmp = wild_case_compare(system_charset_info, a, b);
+
+	return(cmp);
+}
+
+/******************************************************************//**
+Strip the directory part from a full path name.
+@return the file name, or the literal string "null" when base_name()
+yields no file name */
+UNIV_INTERN
+const char*
+innobase_basename(
+/*==============*/
+	const char*	path_name)	/*!< in: full path name */
+{
+	const char*	file_part = base_name(path_name);
+
+	if (!file_part) {
+		return("null");
+	}
+
+	return(file_part);
+}
+
+/******************************************************************//**
+Lower-cases, in place, every character of a NUL-terminated UTF-8
+string, using system_charset_info. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+	char*	a)	/*!< in/out: string to put in lower case */
+{
+	/* The conversion is done in place by my_casedn_str(). */
+	my_casedn_str(system_charset_info, a);
+}
+
+/**********************************************************************//**
+Looks up the character set of a connection via thd_charset().
+@return connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+	THD*	mysql_thd)	/*!< in: MySQL thread handle */
+{
+	struct charset_info_st*	connection_cs = thd_charset(mysql_thd);
+
+	return(connection_cs);
+}
+
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string, or NULL if thd_query_string() provides
+no statement; in that case *length is set to 0 */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+	THD*	thd,	/*!< in: MySQL thread handle */
+	size_t*	length)	/*!< out: length of the SQL statement */
+{
+	if (const LEX_STRING *stmt = thd_query_string(thd)) {
+		*length = stmt->length;
+		return stmt->str;
+	}
+
+	/* Never leave the out-parameter indeterminate: a caller that
+	reads *length without checking the returned pointer must see 0. */
+	*length = 0;
+	return NULL;
+}
+
+/**********************************************************************//**
+Reads the current value of the table_def_size global parameter
+(tdc_size). The read is deliberately unsynchronized ("dirty"): there is
+no synchronization object for it, and a torn read does little harm.
+@return value of table_def_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void)
+/*===============================*/
+{
+	const ulint	cache_size = tdc_size;
+
+	return(cache_size);
+}
+
+/**********************************************************************//**
+Reads the current value of the lower_case_table_names global parameter
+from mysqld.cc. The read is deliberately unsynchronized ("dirty"): there
+is no synchronization object for it, and a torn read does little harm.
+@return value of lower_case_table_names */
+UNIV_INTERN
+ulint
+innobase_get_lower_case_table_names(void)
+/*=====================================*/
+{
+	const ulint	setting = lower_case_table_names;
+
+	return(setting);
+}
+
+/** Create a temporary file in the location specified by the parameter
+path. If the path is null, then it will be created in tmpdir.
+The descriptor obtained from mysql_tmpfile()/mysql_tmpfile_path() is
+duplicated, the original is released with my_close(), and the duplicate
+is what the caller receives.
+@param[in] path location for creating temporary file
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(
+ const char* path)
+{
+#ifdef WITH_INNODB_DISALLOW_WRITES
+ /* Block here until srv_allow_writes_event is signalled. */
+ os_event_wait(srv_allow_writes_event);
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+ int fd2 = -1;
+ File fd;
+
+ /* Debug-only fault injection: pretend the file could not be created. */
+ DBUG_EXECUTE_IF(
+ "innobase_tmpfile_creation_failure",
+ return(-1);
+ );
+
+ if (path == NULL) {
+ fd = mysql_tmpfile("ib");
+ } else {
+ fd = mysql_tmpfile_path(path, "ib");
+ }
+
+ if (fd >= 0) {
+ /* Copy the file descriptor, so that the additional resources
+ allocated by create_temp_file() can be freed by invoking
+ my_close().
+
+ Because the file descriptor returned by this function
+ will be passed to fdopen(), it will be closed by invoking
+ fclose(), which in turn will invoke close() instead of
+ my_close(). */
+
+#ifdef _WIN32
+ /* Note that on Windows, the integer returned by mysql_tmpfile
+ has no relation to C runtime file descriptor. Here, we need
+ to call my_get_osfhandle to get the HANDLE and then convert it
+ to C runtime filedescriptor. */
+ {
+ HANDLE hFile = my_get_osfhandle(fd);
+ HANDLE hDup;
+ BOOL bOK = DuplicateHandle(
+ GetCurrentProcess(),
+ hFile, GetCurrentProcess(),
+ &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+ if (bOK) {
+ fd2 = _open_osfhandle((intptr_t) hDup, 0);
+ } else {
+ my_osmaperr(GetLastError());
+ fd2 = -1;
+ }
+ }
++#else
++#ifdef F_DUPFD_CLOEXEC
++ fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
+#else
+ fd2 = dup(fd);
++#endif
+#endif
+ /* Duplication failed: record errno in my_errno and report
+ EE_OUT_OF_FILERESOURCES. The original descriptor is still
+ closed below, and the negative fd2 is returned.
+ NOTE(review): the debug message prints fd2, not errno. */
+ if (fd2 < 0) {
+ DBUG_PRINT("error",("Got error %d on dup",fd2));
+ my_errno=errno;
+ my_error(EE_OUT_OF_FILERESOURCES,
+ MYF(ME_BELL+ME_WAITTANG),
+ "ib*", my_errno);
+ }
+ my_close(fd, MYF(MY_WME));
+ }
+ return(fd2);
+}
+
+/*********************************************************************//**
+Thin wrapper around MySQL's copy_and_convert(): converts a string from
+one character set to another. The lengths are narrowed to uint32 before
+they are handed over.
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+	void*		to,		/*!< out: converted string */
+	ulint		to_length,	/*!< in: number of bytes reserved
+					for the converted string */
+	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
+	const void*	from,		/*!< in: string to convert */
+	ulint		from_length,	/*!< in: number of bytes to convert */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
+					from */
+	uint*		errors)		/*!< out: number of errors encountered
+					during the conversion */
+{
+	char*		dst = static_cast<char*>(to);
+	const char*	src = static_cast<const char*>(from);
+	const uint32	dst_len = (uint32) to_length;
+	const uint32	src_len = (uint32) from_length;
+
+	return(copy_and_convert(dst, dst_len, to_cs,
+				src, src_len, from_cs, errors));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The data is first converted to
+"system_charset_info" in a fixed-size scratch buffer and then formatted
+by ut_str_sql_format().
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		charset_coll,	/*!< in: charset collation, used
+					as an index into all_charsets[] */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size)	/*!< in: output buffer size
+					in bytes */
+{
+	/* XXX we use a hard limit instead of allocating
+	buf_size bytes from the heap */
+	char		buf_tmp[8192];
+	uint		num_errors;
+	CHARSET_INFO*	data_cs = all_charsets[charset_coll];
+
+	const ulint	buf_tmp_used = innobase_convert_string(
+		buf_tmp, sizeof(buf_tmp), system_charset_info,
+		data, data_len, data_cs, &num_errors);
+
+	return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+}
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
+
+The passed-in max_value is only asserted to be non-zero; it is then
+replaced by the maximum ulonglong value, which is also the value returned
+whenever the computation would overflow.
+@return the next value; never 0 */
+UNIV_INTERN
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+{
+ ulonglong next_value;
+ ulonglong block = need * step; /* size of the range to reserve */
+
+ /* Should never be 0. */
+ ut_a(need > 0);
+ ut_a(block > 0);
+ ut_a(max_value > 0);
+
+ /*
+ Allow auto_increment to go over max_value up to max ulonglong.
+ This allows us to detect that all values are exhausted.
+ If we don't do this, we will return max_value several times
+ and get duplicate key errors instead of auto increment value
+ out of range.
+ */
+ max_value= (~(ulonglong) 0);
+
+ /* According to MySQL documentation, if the offset is greater than
+ the step then the offset is ignored.
+ NOTE(review): the comparison below is against block (need * step),
+ not against step itself. */
+ if (offset > block) {
+ offset = 0;
+ }
+
+ /* Check for overflow. Current can be > max_value if the value is
+ in reality a negative value. The Visual Studio compilers convert
+ large double values automatically into the maximum value of the
+ unsigned long long datatype. */
+
+ if (block >= max_value
+ || offset > max_value
+ || current >= max_value
+ || max_value - offset <= offset) {
+
+ next_value = max_value;
+ } else {
+ ut_a(max_value > current);
+
+ ulonglong free = max_value - current;
+
+ /* Not enough room left above current for offset + block:
+ saturate. Otherwise next_value = 0 marks "compute it below". */
+ if (free < offset || free - offset <= block) {
+ next_value = max_value;
+ } else {
+ next_value = 0;
+ }
+ }
+
+ if (next_value == 0) {
+ ulonglong next;
+
+ /* next is the number of whole steps between offset and
+ current; when current is below offset, start from zero and
+ reserve one step less. */
+ if (current >= offset) {
+ next = (current - offset) / step;
+ } else {
+ next = 0;
+ block -= step;
+ }
+
+ ut_a(max_value > next);
+ next_value = next * step;
+ /* Check for multiplication overflow. */
+ ut_a(next_value >= next);
+ ut_a(max_value > next_value);
+
+ /* Check for overflow */
+ if (max_value - next_value >= block) {
+
+ next_value += block;
+
+ if (max_value - next_value >= offset) {
+ next_value += offset;
+ } else {
+ next_value = max_value;
+ }
+ } else {
+ next_value = max_value;
+ }
+ }
+
+ ut_a(next_value != 0);
+ ut_a(next_value <= max_value);
+
+ return(next_value);
+}
+
+/*********************************************************************//**
+Initializes some fields in an InnoDB transaction object from the
+session options of the owning thread: foreign key checking, unique
+secondary index checking, fake_changes and statistics collection. */
+static
+void
+innobase_trx_init(
+/*==============*/
+	THD*	thd,	/*!< in: user thread handle */
+	trx_t*	trx)	/*!< in/out: InnoDB transaction handle */
+{
+	DBUG_ENTER("innobase_trx_init");
+	DBUG_ASSERT(thd == trx->mysql_thd);
+
+	const bool	fk_checks_off = thd_test_options(
+		thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+	trx->check_foreigns = !fk_checks_off;
+
+	const bool	unique_checks_relaxed = thd_test_options(
+		thd, OPTION_RELAXED_UNIQUE_CHECKS);
+	trx->check_unique_secondary = !unique_checks_relaxed;
+
+	/* A transaction caches the fake_changes state when it starts and
+	uses it for its complete lifetime.
+	Some APIs do not need an active transaction object and only use
+	the transaction object as a cache/data carrier. For those, the
+	fake_changes state is refreshed here as long as the transaction
+	has not been started. */
+	if (trx->state == TRX_STATE_NOT_STARTED) {
+		trx->fake_changes = thd_fake_changes(thd);
+	}
+
+#ifdef EXTENDED_SLOWLOG
+	const bool	innodb_slow_stats =
+		(thd_log_slow_verbosity(thd) & (1ULL << SLOG_V_INNODB)) != 0;
+
+	trx->take_stats = innodb_slow_stats ? TRUE : FALSE;
+#else
+	trx->take_stats = FALSE;
+#endif
+
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object for DML,
+binds it to the given thread and initializes it with innobase_trx_init().
+@return InnoDB transaction handle */
+UNIV_INTERN
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+	THD*	thd)	/*!< in: user thread handle */
+{
+	DBUG_ENTER("innobase_trx_allocate");
+	DBUG_ASSERT(thd != NULL);
+	DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+	trx_t*	new_trx = trx_allocate_for_mysql();
+
+	/* The owner must be set before innobase_trx_init(), which
+	asserts thd == trx->mysql_thd. */
+	new_trx->mysql_thd = thd;
+	innobase_trx_init(thd, new_trx);
+
+	DBUG_RETURN(new_trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object. If the
+MySQL thread struct has none yet, one is allocated and stored in the
+thread's ha_data. An existing handle is checked against TRX_MAGIC_N.
+In both cases the handle is (re)initialized with innobase_trx_init().
+@return InnoDB transaction handle */
+static inline
+trx_t*
+check_trx_exists(
+/*=============*/
+	THD*	thd)	/*!< in: user thread handle */
+{
+	trx_t*&	trx = thd_to_trx(thd);
+
+	if (trx != NULL) {
+		if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
+			/* The handle does not carry the expected magic
+			number: analyze the corruption and abort. */
+			mem_analyze_corruption(trx);
+			ut_error;
+		}
+	} else {
+		trx = innobase_trx_allocate(thd);
+		thd_set_ha_data(thd, innodb_hton_ptr, trx);
+	}
+
+	innobase_trx_init(thd, trx);
+
+	return(trx);
+}
+
+/*************************************************************************
+Gets the InnoDB transaction attached to the current thread.
+Returns NULL when there is no current thread; otherwise returns whatever
+thd_to_trx() holds for it. */
+trx_t*
+innobase_get_trx()
+{
+	THD*	thd = current_thd;
+
+	if (!likely(thd != 0)) {
+		return(NULL);
+	}
+
+	trx_t*&	trx = thd_to_trx(thd);
+
+	return(trx);
+}
+
+/* Reports the value of thd_opt_slow_log() when the server is built with
+EXTENDED_SLOWLOG; always FALSE otherwise. */
+ibool
+innobase_get_slow_log()
+{
+	ibool	slow_log = FALSE;
+
+#ifdef EXTENDED_SLOWLOG
+	slow_log = (ibool) thd_opt_slow_log();
+#endif
+
+	return(slow_log);
+}
+
+/*********************************************************************//**
+Checks whether a transaction has been registered with MySQL.
+@return true if transaction is registered with MySQL 2PC coordinator */
+static inline
+bool
+trx_is_registered_for_2pc(
+/*=========================*/
+	const trx_t*	trx)	/* in: transaction */
+{
+	const bool	registered = (trx->is_registered == 1);
+
+	return(registered);
+}
+
+/*********************************************************************//**
+Records in the transaction that innobase_commit_ordered() was run.
+The transaction must already be registered for 2PC. */
+static inline
+void
+trx_set_active_commit_ordered(
+/*==============================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	/* Same condition that trx_is_registered_for_2pc() tests. */
+	ut_a(trx->is_registered == 1);
+
+	trx->active_commit_ordered = 1;
+}
+
+/*********************************************************************//**
+Marks a transaction as registered with the MySQL 2PC coordinator.
+In debug builds it is also checked that active_commit_ordered is not
+set. */
+static inline
+void
+trx_register_for_2pc(
+/*==================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	trx->is_registered = 1;
+
+	/* Debug-only sanity check. */
+	ut_ad(trx->active_commit_ordered == 0);
+}
+
+/*********************************************************************//**
+Marks a transaction as deregistered: clears both the is_registered and
+the active_commit_ordered flags. */
+static inline
+void
+trx_deregister_from_2pc(
+/*====================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	/* The two flags are reset together. */
+	trx->is_registered = 0;
+	trx->active_commit_ordered = 0;
+}
+
+/*********************************************************************//**
+Check whether a transaction has active_commit_ordered set.
+@return true if the active_commit_ordered flag is 1 */
+static inline
+bool
+trx_is_active_commit_ordered(
+/*=========================*/
+	const trx_t*	trx)	/* in: transaction */
+{
+	const bool	commit_ordered = (trx->active_commit_ordered == 1);
+
+	return(commit_ordered);
+}
+
+/*********************************************************************//**
+Check if transaction is started.
+@return true if the transaction is in any state other than
+TRX_STATE_NOT_STARTED */
+static
+bool
+trx_is_started(
+/*===========*/
+	trx_t*	trx)	/* in: transaction */
+{
+	const bool	not_started = (trx->state == TRX_STATE_NOT_STARTED);
+
+	return(!not_started);
+}
+
+/****************************************************************//**
+Update log_checksum_algorithm_ptr with a pointer to the function corresponding
+to a given checksum algorithm. The STRICT_ variant of each algorithm selects
+the same function as its non-strict counterpart. Any other value fails the
+ut_a(0) assertion. */
+static
+void
+innodb_log_checksum_func_update(
+/*============================*/
+ ulint algorithm) /*!< in: algorithm */
+{
+ switch (algorithm) {
+ /* "innodb" checksum, strict or not */
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ log_checksum_algorithm_ptr=log_block_calc_checksum_innodb;
+ break;
+ /* "crc32" checksum, strict or not */
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ log_checksum_algorithm_ptr=log_block_calc_checksum_crc32;
+ break;
+ /* "none" checksum, strict or not */
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ log_checksum_algorithm_ptr=log_block_calc_checksum_none;
+ break;
+ default:
+ /* Unknown algorithm value: treated as a fatal error. */
+ ut_a(0);
+ }
+}
+
+/****************************************************************//**
+On update hook for the innodb_log_checksum_algorithm variable.
+Switches the log checksum function and stores the new setting in
+srv_log_checksum_algorithm, both while holding log_sys->mutex. */
+static
+void
+innodb_log_checksum_algorithm_update(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const ulong			raw_value =
+		*static_cast<const ulong*>(save);
+	const srv_checksum_algorithm_t	algorithm =
+		static_cast<srv_checksum_algorithm_t>(raw_value);
+
+	/* Make sure we are the only log user */
+	mutex_enter(&log_sys->mutex);
+
+	innodb_log_checksum_func_update(algorithm);
+	srv_log_checksum_algorithm = algorithm;
+
+	mutex_exit(&log_sys->mutex);
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB so we keep their copies into the
+InnoDB table object. The flags copied are the persistent-statistics
+on/off options, the auto-recalc setting and stats_sample_pages. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const HA_CREATE_INFO*	create_info)	/*!< in: create info */
+{
+	/* Defaults apply to temporary tables, which do not use
+	persistent stats. */
+	ibool	ps_on = FALSE;
+	ibool	ps_off = TRUE;
+
+	if (!dict_table_is_temporary(innodb_table)) {
+		ps_on = create_info->table_options
+			& HA_OPTION_STATS_PERSISTENT;
+		ps_off = create_info->table_options
+			& HA_OPTION_NO_STATS_PERSISTENT;
+	}
+
+	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+	const bool	recalc_on =
+		create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON;
+	const bool	recalc_off =
+		create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF;
+
+	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
+
+	innodb_table->stats_sample_pages = create_info->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB so we keep their copies into the
+InnoDB table object. The flags copied are the persistent-statistics
+on/off options, the auto-recalc setting and stats_sample_pages. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
+	const TABLE_SHARE*	table_share)	/*!< in: table share */
+{
+	/* Defaults apply to temporary tables, which do not use
+	persistent stats. */
+	ibool	ps_on = FALSE;
+	ibool	ps_off = TRUE;
+
+	if (!dict_table_is_temporary(innodb_table)) {
+		ps_on = table_share->db_create_options
+			& HA_OPTION_STATS_PERSISTENT;
+		ps_off = table_share->db_create_options
+			& HA_OPTION_NO_STATS_PERSISTENT;
+	}
+
+	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+	const bool	recalc_on =
+		table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON;
+	const bool	recalc_off =
+		table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF;
+
+	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
+
+	innodb_table->stats_sample_pages = table_share->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Construct ha_innobase handler: passes the handlerton and table share to
+the base handler, sets the fixed set of table capability flags and
+zeroes the scan and write-row bookkeeping. */
+UNIV_INTERN
+ha_innobase::ha_innobase(
+/*=====================*/
+	handlerton*	hton,
+	TABLE_SHARE*	table_arg)
+	:handler(hton, table_arg),
+	int_table_flags(HA_REC_NOT_IN_SEQ
+			| HA_NULL_IN_KEY
+			| HA_CAN_VIRTUAL_COLUMNS
+			| HA_CAN_INDEX_BLOBS
+			| HA_CAN_SQL_HANDLER
+			| HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
+			| HA_PRIMARY_KEY_IN_READ_INDEX
+			| HA_BINLOG_ROW_CAPABLE
+			| HA_CAN_GEOMETRY
+			| HA_PARTIAL_COLUMN_READ
+			| HA_TABLE_SCAN_ON_INDEX
+			| HA_CAN_FULLTEXT
+			| HA_CAN_FULLTEXT_EXT
+			| HA_CAN_EXPORT),
+	start_of_scan(0),
+	num_write_row(0)
+{
+}
+
+/*********************************************************************//**
+Destruct ha_innobase handler. The destructor body is empty. */
+UNIV_INTERN
+ha_innobase::~ha_innobase()
+/*======================*/
+{}
+
+/*********************************************************************//**
+Points this handle at the given thread: obtains (allocating if needed)
+the thread's InnoDB transaction, re-binds the prebuilt struct to it when
+it differs from the one currently bound, and records the thread in
+user_thd. */
+UNIV_INTERN inline
+void
+ha_innobase::update_thd(
+/*====================*/
+	THD*	thd)	/*!< in: thd to use the handle */
+{
+	DBUG_ENTER("ha_innobase::update_thd");
+	DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
+					       user_thd, thd));
+
+	/* The table should have been opened in ha_innobase::open(). */
+	DBUG_ASSERT(prebuilt->table->n_ref_count > 0);
+
+	trx_t*	thd_trx = check_trx_exists(thd);
+
+	if (thd_trx != prebuilt->trx) {
+		row_update_prebuilt_trx(prebuilt, thd_trx);
+	}
+
+	user_thd = thd;
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Convenience overload of update_thd(THD*): uses the thread returned by
+ha_thd(), which in debug builds is checked to be the current thread. */
+UNIV_INTERN
+void
+ha_innobase::update_thd()
+/*=====================*/
+{
+	THD*	handler_thd = ha_thd();
+
+	ut_ad(EQ_CURRENT_THD(handler_thd));
+
+	update_thd(handler_thd);
+}
+
+/*********************************************************************//**
+Registers an InnoDB transaction with the MySQL 2PC coordinator, so that
+the MySQL XA code knows to call the InnoDB prepare and commit, or rollback
+for the transaction. This MUST be called for every transaction for which
+the user may call commit or rollback. Calling this several times to register
+the same transaction is allowed, too. This function also registers the
+current SQL statement. */
+static inline
+void
+innobase_register_trx(
+/*==================*/
+	handlerton*	hton,	/* in: Innobase handlerton */
+	THD*		thd,	/* in: MySQL thd (connection) object */
+	trx_t*		trx)	/* in: transaction to register */
+{
+	/* Statement-level registration happens on every call. */
+	trans_register_ha(thd, FALSE, hton);
+
+	/* Transaction-level registration is needed only the first time,
+	and only when the session is not in plain autocommit mode. */
+	if (!trx_is_registered_for_2pc(trx)) {
+		if (thd_test_options(
+			    thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+			trans_register_ha(thd, TRUE, hton);
+		}
+	}
+
+	trx_register_for_2pc(trx);
+}
+
+/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+ ------------------------------------------------------------
+
+1) The use of the query cache for TBL is disabled when there is an
+uncommitted change to TBL.
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and does only allow such transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
+modification because of an ON DELETE CASCADE, we invalidate the MySQL query
+cache of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID to the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
+invalidation to the transaction commit.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd. Also the full_name which is used as key to search for the table
+object. The full_name is a string containing the normalized path to the
+table in the canonical format.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/******************************************************************//**
+The MySQL query cache uses this to check from InnoDB if the query cache at
+the moment is allowed to operate on an InnoDB table. The SQL query must
+be a non-locking SELECT.
+
+The query cache is allowed to operate on certain query only if this function
+returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of a SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB trx_sys->mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+	THD*		thd,		/*!< in: thd of the user who is
+					trying to store a result to the
+					query cache or retrieve it */
+	char*		full_name,	/*!< in: normalized path to the
+					table */
+	uint		full_name_len,	/*!< in: length of the normalized
+					path to the table */
+	ulonglong	*unused)	/*!< unused for this engine */
+{
+	char	norm_name[1000];
+
+	ut_a(full_name_len < 999);
+
+	trx_t*	trx = check_trx_exists(thd);
+
+	if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+		/* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+		plain SELECT if AUTOCOMMIT is not on. */
+
+		return((my_bool)FALSE);
+	}
+
+	if (UNIV_UNLIKELY(trx->has_search_latch)) {
+		sql_print_error("The calling thread is holding the adaptive "
+				"search, latch though calling "
+				"innobase_query_caching_of_table_permitted.");
+		trx_print(stderr, trx, 1024);
+	}
+
+	trx_search_latch_release_if_reserved(trx);
+
+	innobase_srv_conc_force_exit_innodb(trx);
+
+	const bool	in_multi_stmt_trx = thd_test_options(
+		thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
+
+	if (!in_multi_stmt_trx && trx->n_mysql_tables_in_use == 0) {
+		/* We are going to retrieve the query result from the query
+		cache. This cannot be a store operation to the query cache
+		because then MySQL would have locks on tables already.
+
+		TODO: if the user has used LOCK TABLES to lock the table,
+		then we open a transaction in the call of row_.. below.
+		That trx can stay open until UNLOCK TABLES. The same problem
+		exists even if we do not use the query cache. MySQL should be
+		modified so that it ALWAYS calls some cleanup function when
+		the processing of a query ends!
+
+		We can imagine we instantaneously serialize this consistent
+		read trx to the current trx id counter. If trx2 would have
+		changed the tables of a query result stored in the cache, and
+		trx2 would have already committed, making the result obsolete,
+		then trx2 would have already invalidated the cache. Thus we
+		can trust the result in the cache is ok for this query. */
+
+		return((my_bool)TRUE);
+	}
+
+	/* Normalize the table name to InnoDB format */
+	normalize_table_name(norm_name, full_name);
+
+	innobase_register_trx(innodb_hton_ptr, thd, trx);
+
+	/* The final verdict comes from the row layer. */
+	const bool	permitted =
+		row_search_check_if_query_cache_permitted(trx, norm_name);
+
+	return((my_bool) (permitted ? TRUE : FALSE));
+}
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. Does nothing when the
+server is built without HAVE_QUERY_CACHE. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction which
+					modifies the table */
+	const char*	full_name,	/*!< in: concatenation of
+					database name, null char NUL,
+					table name, null char NUL;
+					NOTE that in Windows this is
+					always in LOWER CASE! */
+	ulint		full_name_len)	/*!< in: full name length where
+					also the null chars count */
+{
+	/* Note that the sync0sync.h rank of the query cache mutex is just
+	above the InnoDB trx_sys_t->lock. The caller of this function must
+	not have latches of a lower rank. */
+
+#ifdef HAVE_QUERY_CACHE
+	char		qcache_key_name[2 * (NAME_LEN + 1)];
+
+	/* Construct the key("db-name\0table$name\0") for the query cache using
+	the path name("db@002dname\0table@0024name\0") of the table in its
+	canonical form. The database part is decoded first; the table part
+	follows right after its terminating NUL. */
+	const size_t	dbname_len = filename_to_tablename(
+		full_name, qcache_key_name, sizeof(qcache_key_name));
+
+	const char*	table_part = full_name + strlen(full_name) + 1;
+	char*		table_out = qcache_key_name + dbname_len + 1;
+
+	const size_t	tabname_len = filename_to_tablename(
+		table_part, table_out,
+		sizeof(qcache_key_name) - dbname_len - 1);
+
+	/* Argument TRUE below means we are using transactions */
+	mysql_query_cache_invalidate4(trx->mysql_thd,
+				      qcache_key_name,
+				      (dbname_len + tabname_len + 2),
+				      TRUE);
+#endif
+}
+
+/*****************************************************************//**
+Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
+and quote it if needed. The output is truncated to fit buflen and is
+not NUL-terminated by this function.
+@return pointer to the end of buf */
+static
+char*
+innobase_convert_identifier(
+/*========================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool file_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an UTF-8 string */
+{
+ char nz2[MAX_TABLE_NAME_LEN + 1];
+ const char* s = id;
+ int q;
+
+ if (file_id) {
+
+ char nz[MAX_TABLE_NAME_LEN + 1];
+
+ /* Decode the table name. The MySQL function expects
+ a NUL-terminated string. The input and output strings
+ buffers must not be shared. */
+ ut_a(idlen <= MAX_TABLE_NAME_LEN);
+ memcpy(nz, id, idlen);
+ nz[idlen] = 0;
+
+ /* From here on, s/idlen describe the decoded name in nz2,
+ which is copied out by the no_quote path without any further
+ quoting done here. */
+ s = nz2;
+ idlen = explain_filename(thd, nz, nz2, sizeof nz2,
+ EXPLAIN_PARTITIONS_AS_COMMENT);
+ goto no_quote;
+ }
+
+ /* See if the identifier needs to be quoted. */
+ if (UNIV_UNLIKELY(!thd)) {
+ /* Without a connection, always quote with double quotes. */
+ q = '"';
+ } else {
+ q = get_quote_char_for_identifier(thd, s, (int) idlen);
+ }
+
+ if (q == EOF) {
+no_quote:
+ /* Plain copy, truncated to the size of the output buffer. */
+ if (UNIV_UNLIKELY(idlen > buflen)) {
+ idlen = buflen;
+ }
+ memcpy(buf, s, idlen);
+ return(buf + idlen);
+ }
+
+ /* Quote the identifier. */
+ if (buflen < 2) {
+ /* No room for even the two quote characters. */
+ return(buf);
+ }
+
+ *buf++ = q;
+ buflen--;
+
+ /* Copy the identifier, doubling embedded quote characters. Each
+ check keeps one byte in reserve for the closing quote. */
+ for (; idlen; idlen--) {
+ int c = *s++;
+ if (UNIV_UNLIKELY(c == q)) {
+ if (UNIV_UNLIKELY(buflen < 3)) {
+ break;
+ }
+
+ *buf++ = c;
+ *buf++ = c;
+ buflen -= 2;
+ } else {
+ if (UNIV_UNLIKELY(buflen < 2)) {
+ break;
+ }
+
+ *buf++ = c;
+ buflen--;
+ }
+ }
+
+ *buf++ = q;
+ return(buf);
+}
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed. A table name of the form "db/table" is emitted
+as the two converted parts separated by '.'; an index name that starts
+with TEMP_INDEX_PREFIX gets the suffix "--temporary--" when it fits.
+The output is not NUL-terminated by this function.
+@return pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool table_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an index name */
+{
+ char* s = buf;
+ const char* bufend = buf + buflen;
+
+ if (table_id) {
+ const char* slash = (const char*) memchr(id, '/', idlen);
+ if (!slash) {
+
+ /* No database part: convert id as a single name. */
+ goto no_db_name;
+ }
+
+ /* Print the database name and table name separately. */
+ s = innobase_convert_identifier(s, bufend - s, id, slash - id,
+ thd, TRUE);
+ /* The separator and the table part are emitted only if the
+ database part left room in the buffer. */
+ if (UNIV_LIKELY(s < bufend)) {
+ *s++ = '.';
+ s = innobase_convert_identifier(s, bufend - s,
+ slash + 1, idlen
+ - (slash - id) - 1,
+ thd, TRUE);
+ }
+ } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
+ /* Temporary index name (smart ALTER TABLE) */
+ const char temp_index_suffix[]= "--temporary--";
+
+ /* Skip the prefix byte, then append the suffix (without its
+ NUL) only if it fits entirely. */
+ s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1,
+ thd, FALSE);
+ if (s - buf + (sizeof temp_index_suffix - 1) < buflen) {
+ memcpy(s, temp_index_suffix,
+ sizeof temp_index_suffix - 1);
+ s += sizeof temp_index_suffix - 1;
+ }
+ } else {
+no_db_name:
+ s = innobase_convert_identifier(buf, buflen, id, idlen,
+ thd, table_id);
+ }
+
+ return(s);
+}
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(): converts a table or
+index name to the MySQL system_charset_info (UTF-8), quotes it if needed
+and NUL-terminates the result in buf. The conversion is done without a
+connection thread (thd == NULL).
+
+The caller is expected to pass a buffer that can hold the converted name
+plus the terminating NUL; this is asserted in debug builds. In other
+builds a result that fills the whole buffer is truncated by one byte so
+that the terminator is never written past the end of buf, and a
+zero-length buffer is left untouched. */
+UNIV_INTERN
+void
+innobase_format_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	name,	/*!< in: index or table name to format */
+	ibool		is_index_name) /*!< in: index name */
+{
+	const char*	bufend;
+	ulint		len;
+
+	ut_ad(buflen > 0);
+
+	if (UNIV_UNLIKELY(buflen == 0)) {
+		/* There is no room even for the terminating NUL. */
+		return;
+	}
+
+	bufend = innobase_convert_name(buf, buflen, name, strlen(name),
+				       NULL, !is_index_name);
+
+	len = (ulint) (bufend - buf);
+
+	ut_ad(len < buflen);
+
+	/* innobase_convert_name() may return buf + buflen when the name
+	does not fit; keep the terminator inside the buffer. */
+	if (UNIV_UNLIKELY(len >= buflen)) {
+		len = buflen - 1;
+	}
+
+	buf[len] = '\0';
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted,
+i.e. whether thd_kill_level() reports a nonzero level for the MySQL
+thread that owns the transaction.
+@return TRUE if interrupted; FALSE also when trx or its thread is NULL */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	/* Without a transaction or an owning thread there is nothing
+	that could have been interrupted. */
+	if (!trx || !trx->mysql_thd) {
+		return(FALSE);
+	}
+
+	return(thd_kill_level((THD*) trx->mysql_thd) ? TRUE : FALSE);
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode, as
+given by the strict_mode session variable of the owning MySQL thread.
+@return TRUE if strict; FALSE also when trx or its thread is NULL */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	/* The session variable can only be read through a thread. */
+	if (!trx || !trx->mysql_thd) {
+		return(FALSE);
+	}
+
+	return(THDVAR(trx->mysql_thd, strict_mode) ? TRUE : FALSE);
+}
+
+/**************************************************************//**
+Resets some fields of a prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing.
+When an index condition was pushed down, that state is cleared as well
+and the row template is marked as absent. */
+inline
+void
+ha_innobase::reset_template(void)
+/*=============================*/
+{
+	ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+	ut_ad(prebuilt->magic_n2 == prebuilt->magic_n);
+
+	/* Force table to be freed in close_thread_table(). This must
+	read in_fts_query before the flag is cleared below. */
+	DBUG_EXECUTE_IF("free_table_in_fts_query",
+		if (prebuilt->in_fts_query) {
+			table->m_needs_reopen = true;
+		}
+	);
+
+	prebuilt->in_fts_query = 0;
+	prebuilt->read_just_key = 0;
+	prebuilt->keep_other_fields_on_keyread = 0;
+
+	if (!prebuilt->idx_cond) {
+		/* No index condition pushdown state to reset. */
+		return;
+	}
+
+	/* Reset index condition pushdown state. */
+	prebuilt->idx_cond = NULL;
+	prebuilt->idx_cond_n_cols = 0;
+
+	/* Invalidate prebuilt->mysql_template
+	in ha_innobase::write_row(). */
+	prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_idx() etc. The cursor may stay open even across a
+transaction commit; in that case call this before every operation
+(fetch next etc.). This function performs the necessary initialization
+even after a transaction commit. */
+UNIV_INTERN
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+ /* If current thd does not yet have a trx struct, create one.
+ If the current handle does not yet have a prebuilt struct, create
+ one. Update the trx pointers in the prebuilt struct. Normally
+ this operation is done in external_lock. */
+
+ update_thd(ha_thd());
+
+ /* Initialize the prebuilt struct much like it would be initialized
+ in external_lock. */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ innobase_srv_conc_force_exit_innodb(prebuilt->trx);
+
+ /* If the transaction is not started yet, start it. */
+
+ trx_start_if_not_started_xa(prebuilt->trx);
+
+ /* Assign a read view if the transaction does not have one yet. */
+
+ trx_assign_read_view(prebuilt->trx);
+
+ innobase_register_trx(ht, user_thd, prebuilt->trx);
+
+ /* The necessary initialization was done in this function; there is
+ no need to repeat it in row_search_for_mysql. */
+
+ prebuilt->sql_stat_start = FALSE;
+
+ /* HANDLER always does its reads as consistent (non-locking) reads,
+ even if the trx isolation level was specified as SERIALIZABLE. */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+
+ /* Always fetch all columns in the index record. */
+
+ prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+ /* NOTE(review): the original author was unsure whether all columns
+ of the whole row must always be fetched here; left as an open question. */
+
+ prebuilt->used_in_HANDLER = TRUE;
+ reset_template();
+}
+
+/****************************************************************//**
+File name extensions of the files that belong to an InnoDB single-table
+tablespace. The list is terminated by NullS. */
+static const char* ha_innobase_exts[] = {
+ ".ibd",
+ ".isl",
+ NullS
+};
+
+/*********************************************************************//**
+Opens an InnoDB database. Fills in the handlerton callbacks, validates
+the innodb_* startup options and copies them to the corresponding srv_*
+settings, starts InnoDB with innobase_start_or_create_for_mysql(), and
+finally initializes the handler-level hash table, mutexes, condition
+variable and monitor counters.
+@return FALSE (0) on success, TRUE on failure */
+static
+int
+innobase_init(
+/*==========*/
+	void	*p)	/*!< in: InnoDB handlerton */
+{
+	static char	current_dir[3];		/*!< Set if using current lib */
+	int		err;
+	bool		ret;
+	char		*default_path;
+	uint		format_id;
+	ulong		num_pll_degree;
+
+	DBUG_ENTER("innobase_init");
+	handlerton *innobase_hton= (handlerton*) p;
+	innodb_hton_ptr = innobase_hton;
+
+	innobase_hton->state = SHOW_OPTION_YES;
+	innobase_hton->db_type= DB_TYPE_INNODB;
+	innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+	innobase_hton->close_connection = innobase_close_connection;
+	innobase_hton->savepoint_set = innobase_savepoint;
+	innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+	innobase_hton->savepoint_rollback_can_release_mdl =
+				innobase_rollback_to_savepoint_can_release_mdl;
+	innobase_hton->savepoint_release = innobase_release_savepoint;
+	innobase_hton->commit_ordered=innobase_commit_ordered;
+	innobase_hton->commit = innobase_commit;
+	innobase_hton->rollback = innobase_rollback;
+	innobase_hton->prepare = innobase_xa_prepare;
+	innobase_hton->recover = innobase_xa_recover;
+	innobase_hton->commit_by_xid = innobase_commit_by_xid;
+	innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+	innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
+	innobase_hton->checkpoint_state= innobase_checkpoint_state;
+	innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
+	innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
+	innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
+	innobase_hton->create = innobase_create_handler;
+	innobase_hton->drop_database = innobase_drop_database;
+	innobase_hton->panic = innobase_end;
+
+	innobase_hton->start_consistent_snapshot =
+		innobase_start_trx_and_assign_read_view;
+
+	/*innobase_hton->store_binlog_info =
+		innobase_store_binlog_info;*/
+
+	innobase_hton->flush_logs = innobase_flush_logs;
+	innobase_hton->show_status = innobase_show_status;
+	innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS |
+		HTON_SUPPORTS_FOREIGN_KEYS;
+
+	innobase_hton->release_temporary_latches =
+		innobase_release_temporary_latches;
+
+	innobase_hton->kill_query = innobase_kill_connection;
+
+	if (srv_file_per_table)
+		innobase_hton->tablefile_extensions = ha_innobase_exts;
+
+#ifdef WITH_WSREP
+	innobase_hton->wsrep_abort_transaction=wsrep_abort_transaction;
+	innobase_hton->wsrep_set_checkpoint=innobase_wsrep_set_checkpoint;
+	innobase_hton->wsrep_get_checkpoint=innobase_wsrep_get_checkpoint;
+	innobase_hton->wsrep_fake_trx_id=wsrep_fake_trx_id;
+#endif /* WITH_WSREP */
+
+	ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
+
+#ifndef DBUG_OFF
+	/* Debug-only self test: the file name "-@" must decode to the
+	mysql50 table name prefix followed by the file name itself. */
+	static const char	test_filename[] = "-@";
+	char			test_tablename[sizeof test_filename
+				+ sizeof(srv_mysql50_table_name_prefix) - 1];
+	if ((sizeof(test_tablename)) - 1
+			!= filename_to_tablename(test_filename,
+						 test_tablename,
+						 sizeof(test_tablename), true)
+	    || strncmp(test_tablename,
+		       srv_mysql50_table_name_prefix,
+		       sizeof(srv_mysql50_table_name_prefix) - 1)
+	    || strcmp(test_tablename
+		      + sizeof(srv_mysql50_table_name_prefix) - 1,
+		      test_filename)) {
+
+		sql_print_error("tablename encoding has been changed");
+
+		goto error;
+	}
+#endif /* !DBUG_OFF */
+
+	/* srv_log_block_size stays 0 unless innodb_log_block_size is a
+	power of 2 between 2^9 and 2^UNIV_PAGE_SIZE_SHIFT_MAX. */
+	srv_log_block_size = 0;
+	if (innobase_log_block_size != (1 << 9)) { /*!=512*/
+		uint	n_shift;
+
+		fprintf(stderr,
+			"InnoDB: Warning: innodb_log_block_size has been "
+			"changed from default value 512. (###EXPERIMENTAL### "
+			"operation)\n");
+		for (n_shift = 9; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX;
+		     n_shift++) {
+			if (innobase_log_block_size == ((ulong)1 << n_shift)) {
+				srv_log_block_size = (1 << n_shift);
+				fprintf(stderr,
+					"InnoDB: The log block size is set to "
+					ULINTPF ".\n",srv_log_block_size);
+				break;
+			}
+		}
+	} else {
+		srv_log_block_size = 512;
+	}
+
+	if (!srv_log_block_size) {
+		fprintf(stderr,
+			"InnoDB: Error: %lu is not a valid value for "
+			"innodb_log_block_size.\n"
+			"InnoDB: Error: A valid value for "
+			"innodb_log_block_size is\n"
+			"InnoDB: Error: a power of 2 from 512 to 16384.\n",
+			innobase_log_block_size);
+		goto error;
+	}
+
+	/* Assert the lower bound only after the value has been validated,
+	so that debug builds report an invalid setting through the error
+	message above instead of failing this assertion first. */
+	ut_ad (srv_log_block_size >= OS_MIN_LOG_BLOCK_SIZE);
+
+	/* Check that values don't overflow on 32-bit systems. */
+	if (sizeof(ulint) == 4) {
+		if (innobase_buffer_pool_size > UINT_MAX32) {
+			sql_print_error(
+				"innobase_buffer_pool_size can't be over 4GB"
+				" on 32-bit systems");
+
+			goto error;
+		}
+	}
+
+	os_innodb_umask = (ulint) my_umask;
+
+	/* First calculate the default path for innodb_data_home_dir etc.,
+	in case the user has not given any value.
+
+	Note that when using the embedded server, the datadirectory is not
+	necessarily the current directory of this program. */
+
+	if (mysqld_embedded) {
+		default_path = mysql_real_data_home;
+		fil_path_to_mysql_datadir = mysql_real_data_home;
+	} else {
+		/* It's better to use current lib, to keep paths short */
+		current_dir[0] = FN_CURLIB;
+		current_dir[1] = FN_LIBCHAR;
+		current_dir[2] = 0;
+		default_path = current_dir;
+	}
+
+	ut_a(default_path);
+
+	/* Set InnoDB initialization parameters according to the values
+	read from MySQL .cnf file */
+
+	/*--------------- Data files -------------------------*/
+
+	/* The default dir for data files is the datadir of MySQL */
+
+	srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
+			 default_path);
+
+	/* Set default InnoDB data file size to 12 MB and let it be
+	auto-extending. Thus users can use InnoDB in >= 4.0 without having
+	to specify any startup options. */
+
+	if (!innobase_data_file_path) {
+		innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
+	}
+
+	/* Since InnoDB edits the argument in the next call, we make another
+	copy of it: */
+
+	internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
+						     MYF(MY_FAE));
+
+	ret = (bool) srv_parse_data_file_paths_and_sizes(
+		internal_innobase_data_file_path);
+	if (ret == FALSE) {
+		sql_print_error(
+			"InnoDB: syntax error in innodb_data_file_path"
+			" or size specified is less than 1 megabyte");
+		/* Common exit for every failure after the data file path
+		copy was made: release the parsed paths and the copy. */
+mem_free_and_error:
+		srv_free_paths_and_sizes();
+		my_free(internal_innobase_data_file_path);
+		goto error;
+	}
+
+	/* -------------- All log files ---------------------------*/
+
+	/* The default dir for log files is the datadir of MySQL */
+
+	if (!srv_log_group_home_dir) {
+		srv_log_group_home_dir = default_path;
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	if (!innobase_log_arch_dir) {
+		innobase_log_arch_dir = srv_log_group_home_dir;
+	}
+	srv_arch_dir = innobase_log_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	srv_normalize_path_for_win(srv_log_group_home_dir);
+
+	if (strchr(srv_log_group_home_dir, ';')) {
+		sql_print_error("syntax error in innodb_log_group_home_dir");
+		goto mem_free_and_error;
+	}
+
+	if (innobase_mirrored_log_groups == 1) {
+		sql_print_warning(
+			"innodb_mirrored_log_groups is an unimplemented "
+			"feature and the variable will be completely "
+			"removed in a future version.");
+	}
+
+	if (innobase_mirrored_log_groups > 1) {
+		sql_print_error(
+			"innodb_mirrored_log_groups is an unimplemented feature and "
+			"the variable will be completely removed in a future version. "
+			"Using values other than 1 is not supported.");
+		goto mem_free_and_error;
+	}
+
+	if (innobase_mirrored_log_groups == 0) {
+		/* To throw a deprecation warning message when the option is
+		passed, the default was changed to '0' (as a workaround). Since
+		the only value accepted for this option is '1', reset it to 1 */
+		innobase_mirrored_log_groups = 1;
+	}
+
+	/* Validate the file format by animal name */
+	if (innobase_file_format_name != NULL) {
+
+		format_id = innobase_file_format_name_lookup(
+			innobase_file_format_name);
+
+		if (format_id > UNIV_FORMAT_MAX) {
+
+			sql_print_error("InnoDB: wrong innodb_file_format.");
+
+			goto mem_free_and_error;
+		}
+	} else {
+		/* Set it to the default file format id. Though this
+		should never happen. */
+		format_id = 0;
+	}
+
+	srv_file_format = format_id;
+
+	/* Given the type of innobase_file_format_name we have little
+	choice but to cast away the constness from the returned name.
+	innobase_file_format_name is used in the MySQL set variable
+	interface and so can't be const. */
+
+	innobase_file_format_name =
+		(char*) trx_sys_file_format_id_to_name(format_id);
+
+	/* Check innobase_file_format_check variable */
+	if (!innobase_file_format_check) {
+
+		/* Set the value to disable checking. */
+		srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
+
+	} else {
+
+		/* Set the value to the lowest supported format. */
+		srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
+	}
+
+	/* Did the user specify a format name that we support?
+	As a side effect it will update the variable
+	srv_max_file_format_at_startup */
+	if (innobase_file_format_validate_and_set(
+			innobase_file_format_max) < 0) {
+
+		sql_print_error("InnoDB: invalid "
+				"innodb_file_format_max value: "
+				"should be any value up to %s or its "
+				"equivalent numeric id",
+				trx_sys_file_format_id_to_name(
+					UNIV_FORMAT_MAX));
+
+		goto mem_free_and_error;
+	}
+
+	if (innobase_change_buffering) {
+		ulint	use;
+
+		/* Map the option string to its index in
+		innobase_change_buffering_values (case-insensitively). */
+		for (use = 0;
+		     use < UT_ARR_SIZE(innobase_change_buffering_values);
+		     use++) {
+			if (!innobase_strcasecmp(
+				    innobase_change_buffering,
+				    innobase_change_buffering_values[use])) {
+				ibuf_use = (ibuf_use_t) use;
+				goto innobase_change_buffering_inited_ok;
+			}
+		}
+
+		sql_print_error("InnoDB: invalid value "
+				"innodb_change_buffering=%s",
+				innobase_change_buffering);
+		goto mem_free_and_error;
+	}
+
+innobase_change_buffering_inited_ok:
+	ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
+	innobase_change_buffering = (char*)
+		innobase_change_buffering_values[ibuf_use];
+
+	/* Check that interdependent parameters have sane values. */
+	if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+		sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+				  " cannot be set higher than"
+				  " innodb_max_dirty_pages_pct.\n"
+				  "InnoDB: Setting"
+				  " innodb_max_dirty_pages_pct_lwm to %lf\n",
+				  srv_max_buf_pool_modified_pct);
+
+		srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+	}
+
+	if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+		if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+			/* Avoid overflow. */
+			srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+		} else {
+			/* The user has not set the value. We should
+			set it based on innodb_io_capacity. */
+			srv_max_io_capacity = static_cast<ulong>(
+				ut_max(2 * srv_io_capacity, 2000));
+		}
+
+	} else if (srv_max_io_capacity < srv_io_capacity) {
+		sql_print_warning("InnoDB: innodb_io_capacity"
+				  " cannot be set higher than"
+				  " innodb_io_capacity_max.\n"
+				  "InnoDB: Setting"
+				  " innodb_io_capacity to %lu\n",
+				  srv_max_io_capacity);
+
+		srv_io_capacity = srv_max_io_capacity;
+	}
+
+	if (!is_filename_allowed(srv_buf_dump_filename,
+				 strlen(srv_buf_dump_filename), FALSE)) {
+		sql_print_error("InnoDB: innodb_buffer_pool_filename"
+			" cannot have colon (:) in the file name.");
+		goto mem_free_and_error;
+	}
+
+	/* --------------------------------------------------*/
+
+	srv_file_flush_method_str = innobase_file_flush_method;
+
+	srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
+
+#ifdef UNIV_LOG_ARCHIVE
+	srv_log_archive_on = (ulint) innobase_log_archive;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	/* Check that the value of system variable innodb_page_size was
+	set correctly. Its value was put into srv_page_size. If valid,
+	return the associated srv_page_size_shift.*/
+	srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+	if (!srv_page_size_shift) {
+		sql_print_error("InnoDB: Invalid page size=%lu.\n",
+				srv_page_size);
+		goto mem_free_and_error;
+	}
+	if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: innodb-page-size has been changed"
+			" from the default value %d to %lu.\n",
+			UNIV_PAGE_SIZE_DEF, srv_page_size);
+	}
+
+	srv_log_buffer_size = (ulint) innobase_log_buffer_size;
+
+	/* 0 means that the number of buffer pool instances was not
+	given; pick a default. */
+	if (innobase_buffer_pool_instances == 0) {
+		innobase_buffer_pool_instances = 8;
+
+#if defined(__WIN__) && !defined(_WIN64)
+		if (innobase_buffer_pool_size > 1331 * 1024 * 1024) {
+			innobase_buffer_pool_instances
+				= ut_min(MAX_BUFFER_POOLS,
+					(long) (innobase_buffer_pool_size
+					/ (128 * 1024 * 1024)));
+		}
+#endif /* defined(__WIN__) && !defined(_WIN64) */
+	}
+	srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
+	srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
+
+	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+	if (innobase_additional_mem_pool_size
+	    != 8*1024*1024L /* the default */ ) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Using "
+			"innodb_additional_mem_pool_size is DEPRECATED. "
+			"This option may be removed in future releases, "
+			"together with the option innodb_use_sys_malloc "
+			"and with the InnoDB's internal memory "
+			"allocator.\n");
+	}
+
+	if (!srv_use_sys_malloc ) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Setting "
+			"innodb_use_sys_malloc to FALSE is DEPRECATED. "
+			"This option may be removed in future releases, "
+			"together with the InnoDB's internal memory "
+			"allocator.\n");
+	}
+
+	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+	srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
+
+	if (!innobase_use_checksums) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Setting "
+			"innodb_checksums to OFF is DEPRECATED. "
+			"This option may be removed in future releases. "
+			"You should set innodb_checksum_algorithm=NONE "
+			"instead.\n");
+		srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
+	}
+
+	innodb_log_checksum_func_update(srv_log_checksum_algorithm);
+
+#ifdef HAVE_LARGE_PAGES
+	if ((os_use_large_pages = (ibool) my_use_large_pages)) {
+		os_large_page_size = (ulint) opt_large_page_size;
+	}
+#endif
+
+	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
+
+	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+	if (innobase_locks_unsafe_for_binlog) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Using "
+			"innodb_locks_unsafe_for_binlog is DEPRECATED. "
+			"This option may be removed in future releases. "
+			"Please use READ COMMITTED transaction isolation "
+			"level instead, see " REFMAN "set-transaction.html.\n");
+	}
+
+	/* Values below 10 are replaced by a default of 300, or by
+	tc_size when file-per-table is on and tc_size is larger. */
+	if (innobase_open_files < 10) {
+		innobase_open_files = 300;
+		if (srv_file_per_table && tc_size > 300) {
+			innobase_open_files = tc_size;
+		}
+	}
+
+	if (innobase_open_files > (long) open_files_limit) {
+		fprintf(stderr,
+			"innodb_open_files should not be greater"
+			" than the open_files_limit.\n");
+		if (innobase_open_files > (long) tc_size) {
+			innobase_open_files = tc_size;
+		}
+	}
+
+	srv_max_n_open_files = (ulint) innobase_open_files;
+	srv_innodb_status = (ibool) innobase_create_status_file;
+
+	srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+	/* Round up fts_sort_pll_degree to nearest power of 2 number */
+	for (num_pll_degree = 1;
+	     num_pll_degree < fts_sort_pll_degree;
+	     num_pll_degree <<= 1) {
+
+		/* No op */
+	}
+
+	fts_sort_pll_degree = num_pll_degree;
+
+	/* Store the default charset-collation number of this MySQL
+	installation */
+
+	data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+
+	ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
+	     my_charset_latin1.number);
+	ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
+
+	/* Store the latin1_swedish_ci character ordering table to InnoDB. For
+	non-latin1_swedish_ci charsets we use the MySQL comparison functions,
+	and consequently we do not need to know the ordering internally in
+	InnoDB. */
+
+	srv_latin1_ordering = my_charset_latin1.sort_order;
+
+	innobase_commit_concurrency_init_default();
+
+#ifdef HAVE_POSIX_FALLOCATE
+	srv_use_posix_fallocate = (ibool) innobase_use_fallocate;
+#endif
+	/* Do not enable backoff algorithm for small buffer pool. */
+	if (!innodb_empty_free_list_algorithm_allowed(
+		    static_cast<srv_empty_free_list_t>(
+			    srv_empty_free_list_algorithm))) {
+		sql_print_information(
+				"InnoDB: innodb_empty_free_list_algorithm "
+				"has been changed to legacy "
+				"because of small buffer pool size. "
+				"In order to use backoff, "
+				"increase buffer pool at least up to 20MB.\n");
+		srv_empty_free_list_algorithm
+			= SRV_EMPTY_FREE_LIST_LEGACY;
+	}
+
+	srv_use_atomic_writes = (ibool) innobase_use_atomic_writes;
+	if (innobase_use_atomic_writes) {
+		ib_logf(IB_LOG_LEVEL_INFO, "using atomic writes.");
+
+		/* Force doublewrite buffer off, atomic writes replace it. */
+		if (srv_use_doublewrite_buf) {
+			ib_logf(IB_LOG_LEVEL_INFO, "switching off doublewrite "
+				"buffer because of atomic writes.");
+			innobase_use_doublewrite = FALSE;
+			srv_use_doublewrite_buf = FALSE;
+		}
+
+		/* Force O_DIRECT on Unixes (on Windows writes are always
+		unbuffered)*/
+#ifndef _WIN32
+		if(!innobase_file_flush_method ||
+		   !strstr(innobase_file_flush_method, "O_DIRECT")) {
+			innobase_file_flush_method =
+				srv_file_flush_method_str = (char*)"O_DIRECT";
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"using O_DIRECT due to atomic writes.");
+		}
+#endif
+#ifdef HAVE_POSIX_FALLOCATE
+		/* Due to a bug in directFS, using atomics needs
+		posix_fallocate() to extend the file, because pwrite() past the
+		end of the file won't work */
+		srv_use_posix_fallocate = TRUE;
+#endif
+	}
+
+#ifdef HAVE_PSI_INTERFACE
+	/* Register keys with MySQL performance schema */
+	int	count;
+
+	count = array_elements(all_pthread_mutexes);
+	mysql_mutex_register("innodb", all_pthread_mutexes, count);
+
+# ifdef UNIV_PFS_MUTEX
+	count = array_elements(all_innodb_mutexes);
+	mysql_mutex_register("innodb", all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+	count = array_elements(all_innodb_rwlocks);
+	mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+	count = array_elements(all_innodb_threads);
+	mysql_thread_register("innodb", all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+	count = array_elements(all_innodb_files);
+	mysql_file_register("innodb", all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+
+	count = array_elements(all_innodb_conds);
+	mysql_cond_register("innodb", all_innodb_conds, count);
+#endif /* HAVE_PSI_INTERFACE */
+
+	/* NOTE(review): the comment that used to stand here described a
+	run-time check that ib_mutex_t has the same size in this module and
+	in the InnoDB modules. No such check is performed in this function;
+	InnoDB is simply started below. */
+
+	err = innobase_start_or_create_for_mysql();
+
+	if (err != DB_SUCCESS) {
+		goto mem_free_and_error;
+	}
+
+	/* Adjust the innodb_undo_logs config object */
+	innobase_undo_logs_init_default_max();
+
+	innobase_old_blocks_pct = static_cast<uint>(
+		buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE));
+
+	ibuf_max_size_update(innobase_change_buffer_max_size);
+
+	innobase_open_tables = hash_create(200);
+	mysql_mutex_init(innobase_share_mutex_key,
+			 &innobase_share_mutex,
+			 MY_MUTEX_INIT_FAST);
+	mysql_mutex_init(commit_cond_mutex_key,
+			 &commit_cond_m, MY_MUTEX_INIT_FAST);
+	mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+	mysql_mutex_init(pending_checkpoint_mutex_key,
+			 &pending_checkpoint_mutex,
+			 MY_MUTEX_INIT_FAST);
+	innodb_inited= 1;
+#ifdef MYSQL_DYNAMIC_PLUGIN
+	if (innobase_hton != p) {
+		innobase_hton = reinterpret_cast<handlerton*>(p);
+		*innobase_hton = *innodb_hton_ptr;
+	}
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+	/* Get the current high water mark format. */
+	innobase_file_format_max = (char*) trx_sys_file_format_max_get();
+
+	/* Currently, monitor counter information are not persistent. */
+	memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
+
+	memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+	/* Do this as late as possible, so that the server has fully
+	started up, since we might get some initial stats if the user
+	chose to turn on some counters at startup. */
+	if (innobase_enable_monitor_counter) {
+		innodb_enable_monitor_at_startup(
+			innobase_enable_monitor_counter);
+	}
+
+	/* Turn on monitor counters that are default on */
+	srv_mon_default_on();
+
+	DBUG_RETURN(FALSE);
+error:
+	DBUG_RETURN(TRUE);
+}
+
+/** Shut down the InnoDB storage engine.
+@return 0 */
+static
+int
+innobase_end(handlerton*, ha_panic_function)
{
- if (thd) {
- thd_storage_lock_wait((THD*)thd, value);
+ DBUG_ENTER("innobase_end");
+
+ if (innodb_inited) {
+
+ THD *thd= current_thd;
+ if (thd) { // may be UNINSTALL PLUGIN statement
+ trx_t* trx = thd_to_trx(thd);
+ if (trx) {
+ trx_free_for_mysql(trx);
+ }
+ }
+
+ srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+
+ innodb_inited = 0;
+ hash_table_free(innobase_open_tables);
+ innobase_open_tables = NULL;
+ innodb_shutdown();
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path);
+ mysql_mutex_destroy(&innobase_share_mutex);
+ mysql_mutex_destroy(&commit_cond_m);
+ mysql_cond_destroy(&commit_cond);
+ mysql_mutex_destroy(&pending_checkpoint_mutex);
}
+
+ DBUG_RETURN(0);
}
-/******************************************************************//**
-*/
-extern "C" UNIV_INTERN
-ulong
-thd_flush_log_at_trx_commit(
-/*================================*/
- void* thd)
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton) /*!< in/out: InnoDB handlerton */
{
- return(THDVAR((THD*) thd, flush_log_at_trx_commit));
+ bool result = 0;
+
+ DBUG_ENTER("innobase_flush_logs");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!srv_read_only_mode) {
+ log_buffer_flush_to_disk();
+ }
+
+ DBUG_RETURN(result);
}
-/******************************************************************//**
-Returns the merge-sort block size used for the secondary index creation
-for the current connection.
-@return the merge-sort block size, in bytes */
-extern "C" UNIV_INTERN
-ulong
-thd_merge_sort_block_size(
-/*================================*/
- void* thd) /*!< in: thread handle (THD*), or NULL to query
-+ the global merge_sort_block_size */
+/************************************************************//**
+Synchronously read and parse the redo log up to the last
+checkpoint to write the changed page bitmap.
+@return 0 to indicate success. Current implementation cannot fail. */
+static
+my_bool
+innobase_flush_changed_page_bitmaps()
+/*=================================*/
{
- return(THDVAR((THD*) thd, merge_sort_block_size));
+ if (srv_track_changed_pages) {
+ os_event_reset(srv_checkpoint_completed_event);
+ log_online_follow_redo_log();
+ }
+ return FALSE;
}
-/********************************************************************//**
-Obtain the InnoDB transaction of a MySQL thread.
-@return reference to transaction pointer */
-static inline
-trx_t*&
-thd_to_trx(
-/*=======*/
- THD* thd) /*!< in: MySQL thread */
+/************************************************************//**
+Delete all the bitmap files for data less than the specified LSN.
+If called with lsn == IB_ULONGLONG_MAX (i.e. set by RESET request),
+restart the bitmap file sequence, otherwise continue it.
+@return 0 to indicate success, 1 for failure. */
+static
+my_bool
+innobase_purge_changed_page_bitmaps(
+/*================================*/
+ ulonglong lsn) /*!< in: LSN to purge files up to */
{
- return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
+ return (my_bool)log_online_purge_changed_page_bitmaps(lsn);
}
-#ifdef WITH_WSREP
-ulonglong
-thd_to_trx_id(
-/*=======*/
- THD* thd) /*!< in: MySQL thread */
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx) /*!< in: transaction handle */
{
- return(thd_to_trx(thd)->id);
+#ifdef WITH_WSREP
+ THD* thd = (THD*)trx->mysql_thd;
+ const char* tmp = 0;
+ if (wsrep_on((void*)thd)) {
+#ifdef WSREP_PROC_INFO
+ char info[64];
+ info[sizeof(info) - 1] = '\0';
+ snprintf(info, sizeof(info) - 1,
+ "innobase_commit_low():trx_commit_for_mysql(%lld)",
+ (long long) wsrep_thd_trx_seqno(thd));
+ tmp = thd_proc_info(thd, info);
+
+#else
+ tmp = thd_proc_info(thd, "innobase_commit_low()");
+#endif /* WSREP_PROC_INFO */
+ }
+#endif /* WITH_WSREP */
+ if (trx_is_started(trx)) {
+
+ trx_commit_for_mysql(trx);
+ }
+#ifdef WITH_WSREP
+ if (wsrep_on((void*)thd)) { thd_proc_info(thd, tmp); }
+#endif /* WITH_WSREP */
}
-#endif
-my_bool
-ha_innobase::is_fake_change_enabled(THD* thd)
+#if NOT_USED
+/*****************************************************************//**
+Stores the current binlog coordinates in the trx system header. */
+static
+int
+innobase_store_binlog_info(
+/*=======================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd) /*!< in: MySQL thread handle */
+
{
- trx_t* trx = thd_to_trx(thd);
- return(trx && UNIV_UNLIKELY(trx->fake_changes));
+ const char* file_name;
+ unsigned long long pos;
+ mtr_t mtr;
+
+ DBUG_ENTER("innobase_store_binlog_info");
+
+ thd_binlog_pos(thd, &file_name, &pos);
+
+ mtr_start(&mtr);
+
+ trx_sys_update_mysql_binlog_offset(file_name, pos,
+ TRX_SYS_MYSQL_LOG_INFO, &mtr);
+
+ mtr_commit(&mtr);
+
+ innobase_flush_logs(hton);
+
+ DBUG_RETURN(0);
}
+#endif
-/********************************************************************//**
-Call this function when mysqld passes control to the client. That is to
-avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
-documentation, see handler.cc.
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
@return 0 */
static
int
@@@ -8000,2081 -4788,1847 +8004,2079 @@@ ha_innobase::build_template
}
}
- dict_table_autoinc_initialize(prebuilt->table, auto_inc);
-}
+ clust_index = dict_table_get_first_index(prebuilt->table);
-/*****************************************************************//**
-Creates and opens a handle to a table which already exists in an InnoDB
-database.
-@return 1 if error, 0 if success */
-UNIV_INTERN
-int
-ha_innobase::open(
-/*==============*/
- const char* name, /*!< in: table name */
- int mode, /*!< in: not used */
- uint test_if_locked) /*!< in: not used */
-{
- dict_table_t* ib_table;
- char norm_name[1000];
- THD* thd;
- char* is_part = NULL;
- ibool par_case_name_set = FALSE;
- char par_case_name[MAX_FULL_NAME_LEN + 1];
- dict_err_ignore_t ignore_err = DICT_ERR_IGNORE_NONE;
+ index = whole_row ? clust_index : prebuilt->index;
- DBUG_ENTER("ha_innobase::open");
+ prebuilt->need_to_access_clustered = (index == clust_index);
- UT_NOT_USED(mode);
- UT_NOT_USED(test_if_locked);
+ /* Either prebuilt->index should be a secondary index, or it
+ should be the clustered index. */
+ ut_ad(dict_index_is_clust(index) == (index == clust_index));
- thd = ha_thd();
+ /* Below we check column by column if we need to access
+ the clustered index. */
+
+ n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
- /* Under some cases MySQL seems to call this function while
- holding btr_search_latch. This breaks the latching order as
- we acquire dict_sys->mutex below and leads to a deadlock. */
- if (thd != NULL) {
- innobase_release_temporary_latches(ht, thd);
+ if (!prebuilt->mysql_template) {
+ prebuilt->mysql_template = (mysql_row_templ_t*)
+ mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
}
- normalize_table_name(norm_name, name);
+ prebuilt->template_type = whole_row
+ ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
+ prebuilt->null_bitmap_len = table->s->null_bytes;
- user_thd = NULL;
+ /* Prepare to build prebuilt->mysql_template[]. */
+ prebuilt->templ_contains_blob = FALSE;
+ prebuilt->mysql_prefix_len = 0;
+ prebuilt->n_template = 0;
+ prebuilt->idx_cond_n_cols = 0;
- if (!(share=get_share(name))) {
+ /* Note that in InnoDB, i is the column number in the table.
+ MySQL calls columns 'fields'. */
- DBUG_RETURN(1);
- }
+ if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
+ /* Push down an index condition or an end_range check. */
+ for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
- if (UNIV_UNLIKELY(share->ib_table &&
- share->ib_table->is_corrupt &&
- srv_pass_corrupt_table <= 1)) {
- free_share(share);
+ while (!table->field[sql_idx]->stored_in_db) {
+ sql_idx++;
+ }
- DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
- }
+ const ibool index_contains
+ = dict_index_contains_col_or_prefix(index, i);
- /* Will be allocated if it is needed in ::update_row() */
- upd_buf = NULL;
- upd_buf_size = 0;
+ /* Test if an end_range or an index condition
+ refers to the field. Note that "index" and
+ "index_contains" may refer to the clustered index.
+ Index condition pushdown is relative to prebuilt->index
+ (the index that is being looked up first). */
- /* We look for pattern #P# to see if the table is partitioned
- MySQL table. */
-#ifdef __WIN__
- is_part = strstr(norm_name, "#p#");
-#else
- is_part = strstr(norm_name, "#P#");
-#endif /* __WIN__ */
+ /* When join_read_always_key() invokes this
+ code via handler::ha_index_init() and
+ ha_innobase::index_init(), end_range is not
+ yet initialized. Because of that, we must
+ always check for index_contains, instead of
+ the subset
+ field->part_of_key.is_set(active_index)
+ which would be acceptable if end_range==NULL. */
+ if (build_template_needs_field_in_icp(
+ index, prebuilt, index_contains, i)) {
+ /* Needed in ICP */
+ const Field* field;
+ mysql_row_templ_t* templ;
- /* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table
- can be opened even if some FK indexes are missing. If not, the table
- can't be opened in the same situation */
- if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) {
- ignore_err = DICT_ERR_IGNORE_FK_NOKEY;
- }
+ if (whole_row) {
+ field = table->field[sql_idx];
+ } else {
+ field = build_template_needs_field(
+ index_contains,
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, sql_idx);
+ if (!field) {
+ continue;
+ }
+ }
+
+ templ = build_template_field(
+ prebuilt, clust_index, index,
+ table, field, i);
+ prebuilt->idx_cond_n_cols++;
+ ut_ad(prebuilt->idx_cond_n_cols
+ == prebuilt->n_template);
+
+ if (index == prebuilt->index) {
+ templ->icp_rec_field_no
+ = templ->rec_field_no;
+ } else {
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_pos(
+ prebuilt->index, i,
+ NULL);
+ }
+
+ if (dict_index_is_clust(prebuilt->index)) {
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+ /* If the primary key includes
+ a column prefix, use it in
+ index condition pushdown,
+ because the condition is
+ evaluated before fetching any
+ off-page (externally stored)
+ columns. */
+ if (templ->icp_rec_field_no
+ < prebuilt->index->n_uniq) {
+ /* This is a key column;
+ all set. */
+ continue;
+ }
+ } else if (templ->icp_rec_field_no
+ != ULINT_UNDEFINED) {
+ continue;
+ }
+
+ /* This is a column prefix index.
+ The column prefix can be used in
+ an end_range comparison. */
+
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_or_prefix_pos(
+ prebuilt->index, i, TRUE, NULL);
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+
+ /* Index condition pushdown can be used on
+ all columns of a secondary index, and on
+ the PRIMARY KEY columns. On the clustered
+ index, it must never be used on other than
+ PRIMARY KEY columns, because those columns
+ may be stored off-page, and we will not
+ fetch externally stored columns before
+ checking the index condition. */
+ /* TODO: test the above with an assertion
+ like this. Note that index conditions are
+ currently pushed down as part of the
+ "optimizer phase" while end_range is done
+ as part of the execution phase. Therefore,
+ we were unable to use an accurate condition
+ for end_range in the "if" condition above,
+ and the following assertion would fail.
+ ut_ad(!dict_index_is_clust(prebuilt->index)
+ || templ->rec_field_no
+ < prebuilt->index->n_uniq);
+ */
+ }
+ }
+
+ ut_ad(prebuilt->idx_cond_n_cols > 0);
+ ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
- /* Get pointer to a table object in InnoDB dictionary cache */
- ib_table = dict_table_get(norm_name, TRUE, ignore_err);
+ /* Include the fields that are not needed in index condition
+ pushdown. */
+ for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
- if (UNIV_UNLIKELY(ib_table &&
- ib_table->is_corrupt &&
- srv_pass_corrupt_table <= 1)) {
- free_share(share);
- my_free(upd_buf);
- upd_buf = NULL;
- upd_buf_size = 0;
+ while (!table->field[sql_idx]->stored_in_db) {
+ sql_idx++;
+ }
- DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
- }
+ const ibool index_contains
+ = dict_index_contains_col_or_prefix(index, i);
- share->ib_table = ib_table;
+ if (!build_template_needs_field_in_icp(
+ index, prebuilt, index_contains, i)) {
+ /* Not needed in ICP */
+ const Field* field;
- if (NULL == ib_table) {
- if (is_part) {
- /* MySQL partition engine hard codes the file name
- separator as "#P#". The text case is fixed even if
- lower_case_table_names is set to 1 or 2. This is true
- for sub-partition names as well. InnoDB always
- normalises file names to lower case on Windows, this
- can potentially cause problems when copying/moving
- tables between platforms.
+ if (whole_row) {
+ field = table->field[sql_idx];
+ } else {
+ field = build_template_needs_field(
+ index_contains,
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, sql_idx);
+ if (!field) {
+ continue;
+ }
+ }
- 1) If boot against an installation from Windows
- platform, then its partition table name could
- be all be in lower case in system tables. So we
- will need to check lower case name when load table.
+ build_template_field(prebuilt,
+ clust_index, index,
+ table, field, i);
+ }
+ }
- 2) If we boot an installation from other case
- sensitive platform in Windows, we might need to
- check the existence of table name without lowering
- case them in the system table. */
- if (innobase_get_lower_case_table_names() == 1) {
+ prebuilt->idx_cond = this;
+ } else {
+ /* No index condition pushdown */
+ prebuilt->idx_cond = NULL;
- if (!par_case_name_set) {
-#ifndef __WIN__
- /* Check for the table using lower
- case name, including the partition
- separator "P" */
- memcpy(par_case_name, norm_name,
- strlen(norm_name));
- par_case_name[strlen(norm_name)] = 0;
- innobase_casedn_str(par_case_name);
-#else
- /* On Windows platfrom, check
- whether there exists table name in
- system table whose name is
- not being normalized to lower case */
- normalize_table_name_low(
- par_case_name, name, FALSE);
-#endif
- par_case_name_set = TRUE;
- }
+ for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+ const Field* field;
- ib_table = dict_table_get(
- par_case_name, TRUE, ignore_err);
+ while (!table->field[sql_idx]->stored_in_db) {
+ sql_idx++;
}
- if (ib_table) {
-#ifndef __WIN__
- sql_print_warning("Partition table %s opened "
- "after converting to lower "
- "case. The table may have "
- "been moved from a case "
- "in-sensitive file system. "
- "Please recreate table in "
- "the current file system\n",
- norm_name);
-#else
- sql_print_warning("Partition table %s opened "
- "after skipping the step to "
- "lower case the table name. "
- "The table may have been "
- "moved from a case sensitive "
- "file system. Please "
- "recreate table in the "
- "current file system\n",
- norm_name);
-#endif
- /* We allow use of table if it is found.
- this is consistent to current behavior
- to innodb_plugin */
- share->ib_table = ib_table;
- goto table_opened;
+ if (whole_row) {
+ field = table->field[sql_idx];
+ } else {
+ field = build_template_needs_field(
+ dict_index_contains_col_or_prefix(
+ index, i),
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i, sql_idx);
+ if (!field) {
+ continue;
+ }
}
- }
- if (is_part) {
- sql_print_error("Failed to open table %s.\n",
- norm_name);
+ build_template_field(prebuilt, clust_index, index,
+ table, field, i);
}
-
- sql_print_error("Cannot find or open table %s from\n"
- "the internal data dictionary of InnoDB "
- "though the .frm file for the\n"
- "table exists. Maybe you have deleted and "
- "recreated InnoDB data\n"
- "files but have forgotten to delete the "
- "corresponding .frm files\n"
- "of InnoDB tables, or you have moved .frm "
- "files to another database?\n"
- "or, the table contains indexes that this "
- "version of the engine\n"
- "doesn't support.\n"
- "See " REFMAN "innodb-troubleshooting.html\n"
- "how you can resolve the problem.\n",
- norm_name);
- free_share(share);
- my_errno = ENOENT;
-
- DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
}
-table_opened:
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+ /* Change rec_field_no's to correspond to the clustered index
+ record */
+ for (i = 0; i < prebuilt->n_template; i++) {
- if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
- sql_print_error("MySQL is trying to open a table handle but "
- "the .ibd file for\ntable %s does not exist.\n"
- "Have you deleted the .ibd file from the "
- "database directory under\nthe MySQL datadir, "
- "or have you used DISCARD TABLESPACE?\n"
- "See " REFMAN "innodb-troubleshooting.html\n"
- "how you can resolve the problem.\n",
- norm_name);
- free_share(share);
- my_errno = ENOENT;
+ mysql_row_templ_t* templ
+ = &prebuilt->mysql_template[i];
- dict_table_decrement_handle_count(ib_table, FALSE);
- DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ templ->rec_field_no = templ->clust_rec_field_no;
+ }
}
+}
- prebuilt = row_create_prebuilt(ib_table, table->s->stored_rec_length);
-
- prebuilt->default_rec = table->s->default_values;
- ut_ad(prebuilt->default_rec);
+/********************************************************************//**
+This special handling is really to overcome the limitations of MySQL's
+binlogging. We need to eliminate the non-determinism that will arise in
+INSERT ... SELECT type of statements, since MySQL binlog only stores the
+min value of the autoinc interval. Once that is fixed we can get rid of
+the special lock handling.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_lock_autoinc(void)
+/*====================================*/
+{
+ DBUG_ENTER("ha_innobase::innobase_lock_autoinc");
+ dberr_t error = DB_SUCCESS;
- /* Looks like MySQL-3.23 sometimes has primary key number != 0 */
+ ut_ad(!srv_read_only_mode);
- primary_key = table->s->primary_key;
- key_used_on_scan = primary_key;
+ switch (innobase_autoinc_lock_mode) {
+ case AUTOINC_NO_LOCKING:
+ /* Acquire only the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ break;
- if (!innobase_build_index_translation(table, ib_table, share)) {
- sql_print_error("Build InnoDB index translation table for"
- " Table %s failed", name);
- }
+ case AUTOINC_NEW_STYLE_LOCKING:
+ /* For simple (single/multi) row INSERTs/REPLACEs and RBR
+ events, we fall back to the old style only if another
+ transaction has already acquired the AUTOINC lock on
+ behalf of a LOAD FILE or INSERT ... SELECT etc. type of
+ statement. */
+ if (thd_sql_command(user_thd) == SQLCOM_INSERT
+ || thd_sql_command(user_thd) == SQLCOM_REPLACE
+ || thd_sql_command(user_thd) == SQLCOM_END // RBR event
+ ) {
+ dict_table_t* ib_table = prebuilt->table;
- /* Allocate a buffer for a 'row reference'. A row reference is
- a string of bytes of length ref_length which uniquely specifies
- a row in our table. Note that MySQL may also compare two row
- references for equality by doing a simple memcmp on the strings
- of length ref_length! */
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(ib_table);
- if (!row_table_got_default_clust_index(ib_table)) {
+ /* We need to check that another transaction isn't
+ already holding the AUTOINC lock on the table. */
+ if (ib_table->n_waiting_or_granted_auto_inc_locks) {
+ /* Release the mutex to avoid deadlocks and
+ fall back to old style locking. */
+ dict_table_autoinc_unlock(ib_table);
+ } else {
+ /* Do not fall back to old style locking. */
+ break;
+ }
+ }
+ /* Use old style locking. */
+ /* fall through */
+ case AUTOINC_OLD_STYLE_LOCKING:
+ DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used",
+ ut_ad(0););
+ error = row_lock_table_autoinc_for_mysql(prebuilt);
- prebuilt->clust_index_was_generated = FALSE;
+ if (error == DB_SUCCESS) {
- if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
- sql_print_error("Table %s has a primary key in "
- "InnoDB data dictionary, but not "
- "in MySQL!", name);
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ }
+ break;
- /* This mismatch could cause further problems
- if not attended, bring this to the user's attention
- by printing a warning in addition to log a message
- in the errorlog */
- push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
- ER_NO_SUCH_INDEX,
- "InnoDB: Table %s has a "
- "primary key in InnoDB data "
- "dictionary, but not in "
- "MySQL!", name);
+ default:
+ ut_error;
+ }
- /* If primary_key >= MAX_KEY, its (primary_key)
- value could be out of bound if continue to index
- into key_info[] array. Find InnoDB primary index,
- and assign its key_length to ref_length.
- In addition, since MySQL indexes are sorted starting
- with primary index, unique index etc., initialize
- ref_length to the first index key length in
- case we fail to find InnoDB cluster index.
+ DBUG_RETURN(error);
+}
- Please note, this will not resolve the primary
- index mismatch problem, other side effects are
- possible if users continue to use the table.
- However, we allow this table to be opened so
- that user can adopt necessary measures for the
- mismatch while still being accessible to the table
- date. */
- ref_length = table->key_info[0].key_length;
+/********************************************************************//**
+Reset the autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+ ulonglong autoinc) /*!< in: value to store */
+{
+ dberr_t error;
- /* Find correspoinding cluster index
- key length in MySQL's key_info[] array */
- for (ulint i = 0; i < table->s->keys; i++) {
- dict_index_t* index;
- index = innobase_get_index(i);
- if (dict_index_is_clust(index)) {
- ref_length =
- table->key_info[i].key_length;
- }
- }
- } else {
- /* MySQL allocates the buffer for ref.
- key_info->key_length includes space for all key
- columns + one byte for each column that may be
- NULL. ref_length must be as exact as possible to
- save space, because all row reference buffers are
- allocated based on ref_length. */
+ error = innobase_lock_autoinc();
- ref_length = table->key_info[primary_key].key_length;
- }
- } else {
- if (primary_key != MAX_KEY) {
- sql_print_error(
- "Table %s has no primary key in InnoDB data "
- "dictionary, but has one in MySQL! If you "
- "created the table with a MySQL version < "
- "3.23.54 and did not define a primary key, "
- "but defined a unique key with all non-NULL "
- "columns, then MySQL internally treats that "
- "key as the primary key. You can fix this "
- "error by dump + DROP + CREATE + reimport "
- "of the table.", name);
+ if (error == DB_SUCCESS) {
- /* This mismatch could cause further problems
- if not attended, bring this to the user attention
- by printing a warning in addition to log a message
- in the errorlog */
- push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
- ER_NO_SUCH_INDEX,
- "InnoDB: Table %s has no "
- "primary key in InnoDB data "
- "dictionary, but has one in "
- "MySQL!", name);
- }
+ dict_table_autoinc_initialize(prebuilt->table, autoinc);
- prebuilt->clust_index_was_generated = TRUE;
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
- ref_length = DATA_ROW_ID_LEN;
+ return(error);
+}
- /* If we automatically created the clustered index, then
- MySQL does not know about it, and MySQL must NOT be aware
- of the index used on scan, to make it avoid checking if we
- update the column of the index. That is why we assert below
- that key_used_on_scan is the undefined value MAX_KEY.
- The column is the row id in the automatical generation case,
- and it will never be updated anyway. */
+/********************************************************************//**
+Store the autoinc value in the table. The autoinc value is only set if
+it's greater than the existing autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+ ulonglong auto_inc) /*!< in: value to store */
+{
+ dberr_t error;
- if (key_used_on_scan != MAX_KEY) {
- sql_print_warning(
- "Table %s key_used_on_scan is %lu even "
- "though there is no primary key inside "
- "InnoDB.", name, (ulong) key_used_on_scan);
- }
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
+
+ dict_table_autoinc_unlock(prebuilt->table);
}
- /* Index block size in InnoDB: used by MySQL in query optimization */
- stats.block_size = 16 * 1024;
+ return(error);
+}
- /* Init table lock structure */
- thr_lock_data_init(&share->lock,&lock,(void*) 0);
+/********************************************************************//**
+Stores a row in an InnoDB database, to the table specified in this
+handle.
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::write_row(
+/*===================*/
+ uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+ int error_result= 0;
+ ibool auto_inc_used= FALSE;
+#ifdef WITH_WSREP
+ ibool auto_inc_inserted= FALSE; /* if NULL was inserted */
+#endif
+ ulint sql_command;
+ trx_t* trx = thd_to_trx(user_thd);
- if (prebuilt->table) {
- /* We update the highest file format in the system table
- space, if this table has higher file format setting. */
+ DBUG_ENTER("ha_innobase::write_row");
- trx_sys_file_format_max_upgrade(
- (const char**) &innobase_file_format_max,
- dict_table_get_format(prebuilt->table));
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (prebuilt->trx != trx) {
+ sql_print_error("The transaction object for the table handle "
+ "is at %p, but for the current thread it is at "
+ "%p",
+ (const void*) prebuilt->trx, (const void*) trx);
+
+ fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
+ ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
+ fputs("\n"
+ "InnoDB: Dump of 200 bytes around ha_data: ",
+ stderr);
+ ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
+ putc('\n', stderr);
+ ut_error;
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
}
- /* Only if the table has an AUTOINC column. */
- if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
- dict_table_autoinc_lock(prebuilt->table);
+ ha_statistic_increment(&SSV::ha_write_count);
- /* Since a table can already be "open" in InnoDB's internal
- data dictionary, we only init the autoinc counter once, the
- first time the table is loaded. We can safely reuse the
- autoinc value from a previous MySQL open. */
- if (dict_table_autoinc_read(prebuilt->table) == 0) {
+ if (share->ib_table != prebuilt->table) {
+ fprintf(stderr,
+ "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
+ share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
+ }
- innobase_initialize_autoinc();
+ if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ sql_command = thd_sql_command(user_thd);
+
+ if ((sql_command == SQLCOM_ALTER_TABLE
+ || sql_command == SQLCOM_OPTIMIZE
+ || sql_command == SQLCOM_CREATE_INDEX
+#ifdef WITH_WSREP
+ || (wsrep_on(user_thd) && wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(
+ user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+#endif /* WITH_WSREP */
+ || sql_command == SQLCOM_DROP_INDEX)
+ && num_write_row >= 10000) {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) {
+ WSREP_DEBUG("forced trx split for LOAD: %s",
+ wsrep_thd_query(user_thd));
}
+#endif /* WITH_WSREP */
+ /* ALTER TABLE is COMMITted at every 10000 copied rows.
+ The IX table lock for the original table has to be re-issued.
+ As this method will be called on a temporary table where the
+ contents of the original table are being copied to, it is
+ a bit tricky to determine the source table. The cursor
+ position in the source table need not be adjusted after the
+ intermediate COMMIT, since writes by other transactions are
+ being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
- dict_table_autoinc_unlock(prebuilt->table);
- }
+ dict_table_t* src_table;
+ enum lock_mode mode;
- info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ num_write_row = 0;
- DBUG_RETURN(0);
-}
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
-UNIV_INTERN
-handler*
-ha_innobase::clone(
-/*===============*/
- const char* name, /*!< in: table name */
- MEM_ROOT* mem_root) /*!< in: memory context */
-{
- ha_innobase* new_handler;
+ /* Altering an InnoDB table */
+ /* Get the source table. */
+ src_table = lock_get_src_table(
+ prebuilt->trx, prebuilt->table, &mode);
+ if (!src_table) {
+no_commit:
+ /* Unknown situation: do not commit */
+ /*
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ALTER TABLE is holding lock"
+ " on %lu tables!\n",
+ prebuilt->trx->mysql_n_tables_locked);
+ */
+ ;
+ } else if (src_table == prebuilt->table) {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) &&
+ wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(user_thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ {
+ switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
+ {
+ case WSREP_TRX_OK:
+ break;
+ case WSREP_TRX_SIZE_EXCEEDED:
+ case WSREP_TRX_CERT_FAIL:
+ case WSREP_TRX_ERROR:
+ DBUG_RETURN(1);
+ }
- DBUG_ENTER("ha_innobase::clone");
+ if (binlog_hton->commit(binlog_hton, user_thd, 1))
+ DBUG_RETURN(1);
+ wsrep_post_commit(user_thd, TRUE);
+ }
+#endif /* WITH_WSREP */
+ /* Source table is not in InnoDB format:
+ no need to re-acquire locks on it. */
- new_handler = static_cast<ha_innobase*>(handler::clone(name,
- mem_root));
- if (new_handler) {
- new_handler->prebuilt->select_lock_type
- = prebuilt->select_lock_type;
+ /* Altering to InnoDB format */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ trx_register_for_2pc(prebuilt->trx);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ } else {
+#ifdef WITH_WSREP
+ if (wsrep_on(user_thd) &&
+ wsrep_load_data_splitting &&
+ sql_command == SQLCOM_LOAD &&
+ !thd_test_options(user_thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+ {
+ switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
+ {
+ case WSREP_TRX_OK:
+ break;
+ case WSREP_TRX_SIZE_EXCEEDED:
+ case WSREP_TRX_CERT_FAIL:
+ case WSREP_TRX_ERROR:
+ DBUG_RETURN(1);
+ }
+
+ if (binlog_hton->commit(binlog_hton, user_thd, 1))
+ DBUG_RETURN(1);
+ wsrep_post_commit(user_thd, TRUE);
+ }
+#endif /* WITH_WSREP */
+ /* Ensure that there are no other table locks than
+ LOCK_IX and LOCK_AUTO_INC on the destination table. */
+
+ if (!lock_is_table_exclusive(prebuilt->table,
+ prebuilt->trx)) {
+ goto no_commit;
+ }
+
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ trx_register_for_2pc(prebuilt->trx);
+ /* Re-acquire the table lock on the source table. */
+ row_lock_table_for_mysql(prebuilt, src_table, mode);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ }
}
- DBUG_RETURN(new_handler);
-}
+ num_write_row++;
-UNIV_INTERN
-uint
-ha_innobase::max_supported_key_part_length() const
-{
- /* A table format specific index column length check will be performed
- at ha_innobase::add_index() and row_create_index_for_mysql() */
- return(innobase_large_prefix
- ? REC_VERSION_56_MAX_INDEX_COL_LEN
- : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1);
-}
+ /* This is the case where the table has an auto-increment column */
+ if (table->next_number_field && record == table->record[0]) {
-/******************************************************************//**
-Closes a handle to an InnoDB table.
-@return 0 */
-UNIV_INTERN
-int
-ha_innobase::close(void)
-/*====================*/
-{
- THD* thd;
+ /* Reset the error code before calling
+ innobase_get_auto_increment(). */
+ prebuilt->autoinc_error = DB_SUCCESS;
+
+#ifdef WITH_WSREP
+ auto_inc_inserted= (table->next_number_field->val_int() == 0);
+#endif
+
+ if ((error_result = update_auto_increment())) {
+ /* We don't want to mask autoinc overflow errors. */
- DBUG_ENTER("ha_innobase::close");
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (prebuilt->autoinc_error != DB_SUCCESS) {
+ error = prebuilt->autoinc_error;
+ goto report_error;
+ }
- thd = ha_thd();
- if (thd != NULL) {
- innobase_release_temporary_latches(ht, thd);
+ /* MySQL errors are passed straight back, except for
+ ER_AUTOINC_READ_FAILED. This can only happen
+ for values out of range.
+ */
+ goto func_exit;
+ }
+
+ auto_inc_used = TRUE;
}
- row_prebuilt_free(prebuilt, FALSE);
+ if (prebuilt->mysql_template == NULL
+ || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
- if (upd_buf != NULL) {
- ut_ad(upd_buf_size != 0);
- my_free(upd_buf);
- upd_buf = NULL;
- upd_buf_size = 0;
+ /* Build the template used in converting quickly between
+ the two database formats */
+
+ build_template(true);
}
- free_share(share);
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
- /* Tell InnoDB server that there might be work for
- utility threads: */
+ error = row_insert_for_mysql((byte*) record, prebuilt);
+ DEBUG_SYNC(user_thd, "ib_after_row_insert");
- srv_active_wake_master_thread();
+ /* Handle duplicate key errors */
+ if (auto_inc_used) {
+ ulonglong auto_inc;
+ ulonglong col_max_value;
- DBUG_RETURN(0);
-}
+ /* Note the number of rows processed for this statement, used
+ by get_auto_increment() to determine the number of AUTO-INC
+ values to reserve. This is only useful for a multi-value INSERT
+ and is a statement level counter.*/
+ if (trx->n_autoinc_rows > 0) {
+ --trx->n_autoinc_rows;
+ }
-/* The following accessor functions should really be inside MySQL code! */
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
-/**************************************************************//**
-Gets field offset for a field in a table.
-@return offset */
-static inline
-uint
-get_field_offset(
-/*=============*/
- const TABLE* table, /*!< in: MySQL table object */
- const Field* field) /*!< in: MySQL field object */
-{
- return((uint) (field->ptr - table->record[0]));
-}
+ /* Get the value that MySQL attempted to store in the table.*/
+ auto_inc = table->next_number_field->val_uint();
-/**************************************************************//**
-Checks if a field in a record is SQL NULL. Uses the record format
-information in table to track the null bit in record.
-@return 1 if NULL, 0 otherwise */
-static inline
-uint
-field_in_record_is_null(
-/*====================*/
- TABLE* table, /*!< in: MySQL table object */
- Field* field, /*!< in: MySQL field object */
- char* record) /*!< in: a row in MySQL format */
-{
- int null_offset;
+ switch (error) {
+ case DB_DUPLICATE_KEY:
- if (!field->null_ptr) {
+ /* A REPLACE command and LOAD DATA INFILE REPLACE
+ handle a duplicate key error themselves, but we
+ must update the autoinc counter if we are performing
+ those statements. */
- return(0);
- }
+ switch (sql_command) {
+ case SQLCOM_LOAD:
+ if (trx->duplicates) {
- null_offset = (uint) ((char*) field->null_ptr
- - (char*) table->record[0]);
+ goto set_max_autoinc;
+ }
+ break;
- if (record[null_offset] & field->null_bit) {
+ case SQLCOM_REPLACE:
+ case SQLCOM_INSERT_SELECT:
+ case SQLCOM_REPLACE_SELECT:
+ goto set_max_autoinc;
- return(1);
- }
+#ifdef WITH_WSREP
+ /* workaround for LP bug #355000, retrying the insert */
+ case SQLCOM_INSERT:
- return(0);
-}
+ WSREP_DEBUG("DUPKEY error for autoinc\n"
+ "THD %ld, value %llu, off %llu inc %llu",
+ wsrep_thd_thread_id(current_thd),
+ auto_inc,
+ prebuilt->autoinc_offset,
+ prebuilt->autoinc_increment);
-/*************************************************************//**
-InnoDB uses this function to compare two data fields for which the data type
-is such that we must use MySQL code to compare them. NOTE that the prototype
-of this function is in rem0cmp.c in InnoDB source code! If you change this
-function, remember to update the prototype there!
-@return 1, 0, -1, if a is greater, equal, less than b, respectively */
-extern "C" UNIV_INTERN
-int
-innobase_mysql_cmp(
-/*===============*/
- int mysql_type, /*!< in: MySQL type */
- uint charset_number, /*!< in: number of the charset */
- const unsigned char* a, /*!< in: data field */
- unsigned int a_length, /*!< in: data field length,
- not UNIV_SQL_NULL */
- const unsigned char* b, /*!< in: data field */
- unsigned int b_length) /*!< in: data field length,
- not UNIV_SQL_NULL */
-{
- CHARSET_INFO* charset;
- enum_field_types mysql_tp;
- int ret;
+ if (wsrep_on(current_thd) &&
+ auto_inc_inserted &&
+ wsrep_drupal_282555_workaround &&
+ wsrep_thd_retry_counter(current_thd) == 0 &&
+ !thd_test_options(current_thd,
+ OPTION_NOT_AUTOCOMMIT |
+ OPTION_BEGIN)) {
+ WSREP_DEBUG(
+ "retrying insert: %s",
+ (*wsrep_thd_query(current_thd)) ?
+ wsrep_thd_query(current_thd) :
+ (char *)"void");
+ error= DB_SUCCESS;
+ wsrep_thd_set_conflict_state(
+ current_thd, MUST_ABORT);
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+ /* jump straight to func exit over
+ * later wsrep hooks */
+ goto func_exit;
+ }
+ break;
+#endif /* WITH_WSREP */
- DBUG_ASSERT(a_length != UNIV_SQL_NULL);
- DBUG_ASSERT(b_length != UNIV_SQL_NULL);
+ default:
+ break;
+ }
- mysql_tp = (enum_field_types) mysql_type;
+ break;
- switch (mysql_tp) {
+ case DB_SUCCESS:
+ /* If the actual value inserted is greater than
+ the upper limit of the interval, then we try and
+ update the table upper limit. Note: last_value
+ will be 0 if get_auto_increment() was not called.*/
- case MYSQL_TYPE_BIT:
- case MYSQL_TYPE_STRING:
- case MYSQL_TYPE_VAR_STRING:
- case MYSQL_TYPE_TINY_BLOB:
- case MYSQL_TYPE_MEDIUM_BLOB:
- case MYSQL_TYPE_BLOB:
- case MYSQL_TYPE_LONG_BLOB:
- case MYSQL_TYPE_VARCHAR:
- /* Use the charset number to pick the right charset struct for
- the comparison. Since the MySQL function get_charset may be
- slow before Bar removes the mutex operation there, we first
- look at 2 common charsets directly. */
+ if (auto_inc >= prebuilt->autoinc_last_value) {
+set_max_autoinc:
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (auto_inc <= col_max_value) {
+ ut_a(prebuilt->autoinc_increment > 0);
- if (charset_number == default_charset_info->number) {
- charset = default_charset_info;
- } else if (charset_number == my_charset_latin1.number) {
- charset = &my_charset_latin1;
- } else {
- charset = get_charset(charset_number, MYF(MY_WME));
+ ulonglong offset;
+ ulonglong increment;
+ dberr_t err;
- if (charset == NULL) {
- sql_print_error("InnoDB needs charset %lu for doing "
- "a comparison, but MySQL cannot "
- "find that charset.",
- (ulong) charset_number);
- ut_a(0);
+ offset = prebuilt->autoinc_offset;
+ increment = prebuilt->autoinc_increment;
+
+ auto_inc = innobase_next_autoinc(
+ auto_inc,
+ 1, increment, offset,
+ col_max_value);
+
+ err = innobase_set_max_autoinc(
+ auto_inc);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
}
+ break;
+ default:
+ break;
}
+ }
- /* Starting from 4.1.3, we use strnncollsp() in comparisons of
- non-latin1_swedish_ci strings. NOTE that the collation order
- changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users
- having indexes on such data need to rebuild their tables! */
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
- ret = charset->coll->strnncollsp(charset,
- a, a_length,
- b, b_length, 0);
- if (ret < 0) {
- return(-1);
- } else if (ret > 0) {
- return(1);
- } else {
- return(0);
- }
- default:
- ut_error;
+report_error:
+ if (error == DB_TABLESPACE_DELETED) {
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
}
- return(0);
-}
+ error_result = convert_error_code_to_mysql(error,
+ prebuilt->table->flags,
+ user_thd);
+
#ifdef WITH_WSREP
- if (!error_result &&
- wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
- wsrep_on(user_thd) &&
- !wsrep_consistency_check(user_thd) &&
- !wsrep_thd_skip_append_keys(user_thd))
- {
- if (wsrep_append_keys(user_thd, false, record, NULL))
- {
-extern "C" UNIV_INTERN
-int
-wsrep_innobase_mysql_sort(
-/*===============*/
- /* out: str contains sort string */
- int mysql_type, /* in: MySQL type */
- uint charset_number, /* in: number of the charset */
- unsigned char* str, /* in: data field */
- unsigned int str_length, /* in: data field length,
- not UNIV_SQL_NULL */
- unsigned int buf_length) /* in: total str buffer length */
++ if (!error_result
++ && wsrep_on(user_thd)
++ && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE
++ && !wsrep_consistency_check(user_thd)
++ && !wsrep_thd_skip_append_keys(user_thd)) {
++ if (wsrep_append_keys(user_thd, false, record, NULL)) {
+ DBUG_PRINT("wsrep", ("row key failed"));
+ error_result = HA_ERR_INTERNAL_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif /* WITH_WSREP */
+
+ if (error_result == HA_FTS_INVALID_DOCID) {
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
+ }
+
+func_exit:
+ innobase_active_small();
+
+ if (share->ib_table != prebuilt->table) {
+ fprintf(stderr,
+ "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
+ share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
+ }
+
+ if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ DBUG_RETURN(error_result);
+}
+/**********************************************************************//**
+Checks which fields have changed in a row and stores information
+of them to an update vector.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+calc_row_difference(
+/*================*/
+ upd_t* uvect, /*!< in/out: update vector */
+ uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row, /*!< in: new row in MySQL format */
+ TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ uchar* upd_buff, /*!< in: buffer to use */
+ ulint buff_len, /*!< in: buffer length */
+ row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */
+ THD* thd) /*!< in: user thread */
{
- CHARSET_INFO* charset;
- enum_field_types mysql_tp;
- int ret_length = str_length;
+ uchar* original_upd_buff = upd_buff;
+ Field* field;
+ enum_field_types field_mysql_type;
+ uint n_fields;
+ ulint o_len;
+ ulint n_len;
+ ulint col_pack_len;
+ const byte* new_mysql_row_col;
+ const byte* o_ptr;
+ const byte* n_ptr;
+ byte* buf;
+ upd_field_t* ufield;
+ ulint col_type;
+ ulint n_changed = 0;
+ dfield_t dfield;
+ dict_index_t* clust_index;
+ uint sql_idx, innodb_idx= 0;
+ ibool changes_fts_column = FALSE;
+ ibool changes_fts_doc_col = FALSE;
+ trx_t* trx = thd_to_trx(thd);
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
- DBUG_ASSERT(str_length != UNIV_SQL_NULL);
+ ut_ad(!srv_read_only_mode);
- mysql_tp = (enum_field_types) mysql_type;
+ n_fields = table->s->fields;
+ clust_index = dict_table_get_first_index(prebuilt->table);
- switch (mysql_tp) {
+ /* We use upd_buff to convert changed fields */
+ buf = (byte*) upd_buff;
- case MYSQL_TYPE_BIT:
- case MYSQL_TYPE_STRING:
- case MYSQL_TYPE_VAR_STRING:
- case MYSQL_TYPE_TINY_BLOB:
- case MYSQL_TYPE_MEDIUM_BLOB:
- case MYSQL_TYPE_BLOB:
- case MYSQL_TYPE_LONG_BLOB:
- case MYSQL_TYPE_VARCHAR:
- {
- uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN];
- uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
+ for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
+ field = table->field[sql_idx];
+ if (!field->stored_in_db)
+ continue;
- /* Use the charset number to pick the right charset struct for
- the comparison. Since the MySQL function get_charset may be
- slow before Bar removes the mutex operation there, we first
- look at 2 common charsets directly. */
+ o_ptr = (const byte*) old_row + get_field_offset(table, field);
+ n_ptr = (const byte*) new_row + get_field_offset(table, field);
- if (charset_number == default_charset_info->number) {
- charset = default_charset_info;
- } else if (charset_number == my_charset_latin1.number) {
- charset = &my_charset_latin1;
- } else {
- charset = get_charset(charset_number, MYF(MY_WME));
+ /* Use new_mysql_row_col and col_pack_len to save the values */
- if (charset == NULL) {
- sql_print_error("InnoDB needs charset %lu for doing "
- "a comparison, but MySQL cannot "
- "find that charset.",
- (ulong) charset_number);
- ut_a(0);
- }
- }
+ new_mysql_row_col = n_ptr;
+ col_pack_len = field->pack_length();
- ut_a(str_length <= tmp_length);
- memcpy(tmp_str, str, str_length);
+ o_len = col_pack_len;
+ n_len = col_pack_len;
- if (wsrep_protocol_version < 3) {
- tmp_length = charset->coll->strnxfrm(
- charset, str, str_length,
- tmp_str, str_length);
- DBUG_ASSERT(tmp_length <= str_length);
- } else {
- /* strnxfrm will expand the destination string,
- protocols < 3 truncated the sorted sring
- protocols > 3 gets full sorted sring
- */
- /* 5.5 strnxfrm pads the tail with spaces and
- always returns the full destination buffer lenght
- we cannot know how many characters were converted
- using 2 * str length here as best guess
- */
- uint dst_length = (str_length * 2 < tmp_length) ?
- (str_length * 2) : tmp_length;
- tmp_length = charset->coll->strnxfrm(
- charset, str, dst_length,
- tmp_str, str_length);
- DBUG_ASSERT(tmp_length <= buf_length);
- ret_length = tmp_length;
- }
-
- break;
- }
- case MYSQL_TYPE_DECIMAL :
- case MYSQL_TYPE_TINY :
- case MYSQL_TYPE_SHORT :
- case MYSQL_TYPE_LONG :
- case MYSQL_TYPE_FLOAT :
- case MYSQL_TYPE_DOUBLE :
- case MYSQL_TYPE_NULL :
- case MYSQL_TYPE_TIMESTAMP :
- case MYSQL_TYPE_LONGLONG :
- case MYSQL_TYPE_INT24 :
- case MYSQL_TYPE_DATE :
- case MYSQL_TYPE_TIME :
- case MYSQL_TYPE_DATETIME :
- case MYSQL_TYPE_YEAR :
- case MYSQL_TYPE_NEWDATE :
- case MYSQL_TYPE_NEWDECIMAL :
- case MYSQL_TYPE_ENUM :
- case MYSQL_TYPE_SET :
- case MYSQL_TYPE_GEOMETRY :
- break;
- default:
- break;
- }
+ /* We use o_ptr and n_ptr to dig up the actual data for
+ comparison. */
- return ret_length;
-}
-#endif // WITH_WSREP
-/**************************************************************//**
-Converts a MySQL type to an InnoDB type. Note that this function returns
-the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
-VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
-@return DATA_BINARY, DATA_VARCHAR, ... */
-extern "C" UNIV_INTERN
-ulint
-get_innobase_type_from_mysql_type(
-/*==============================*/
- ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an
- 'unsigned type';
- at least ENUM and SET,
- and unsigned integer
- types are 'unsigned types' */
- const void* f) /*!< in: MySQL Field */
-{
- const class Field* field = reinterpret_cast<const class Field*>(f);
+ field_mysql_type = field->type();
- /* The following asserts try to check that the MySQL type code fits in
- 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
- the type */
+ col_type = prebuilt->table->cols[innodb_idx].mtype;
- DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256);
- DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256);
- DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256);
- DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256);
- DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
+ switch (col_type) {
- if (field->flags & UNSIGNED_FLAG) {
+ case DATA_BLOB:
+ /* Do not compress blob column while comparing*/
+ o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
+ n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
- *unsigned_flag = DATA_UNSIGNED;
- } else {
- *unsigned_flag = 0;
- }
+ break;
- if (field->real_type() == MYSQL_TYPE_ENUM
- || field->real_type() == MYSQL_TYPE_SET) {
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
- /* MySQL has field->type() a string type for these, but the
- data is actually internally stored as an unsigned integer
- code! */
+ o_ptr = row_mysql_read_true_varchar(
+ &o_len, o_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
- *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned
- flag set to zero, even though
- internally this is an unsigned
- integer type */
- return(DATA_INT);
- }
+ n_ptr = row_mysql_read_true_varchar(
+ &n_len, n_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+ }
- switch (field->type()) {
- /* NOTE that we only allow string types in DATA_MYSQL and
- DATA_VARMYSQL */
- case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
- case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */
- if (field->binary()) {
- return(DATA_BINARY);
- } else if (strcmp(
- field->charset()->name,
- "latin1_swedish_ci") == 0) {
- return(DATA_VARCHAR);
- } else {
- return(DATA_VARMYSQL);
+ break;
+ default:
+ ;
}
- case MYSQL_TYPE_BIT:
- case MYSQL_TYPE_STRING: if (field->binary()) {
- return(DATA_FIXBINARY);
- } else if (strcmp(
- field->charset()->name,
- "latin1_swedish_ci") == 0) {
- return(DATA_CHAR);
- } else {
- return(DATA_MYSQL);
+ if (field_mysql_type == MYSQL_TYPE_LONGLONG
+ && prebuilt->table->fts
+ && innobase_strcasecmp(
+ field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
+ doc_id = (doc_id_t) mach_read_from_n_little_endian(
+ n_ptr, 8);
+ if (doc_id == 0) {
+ return(DB_FTS_INVALID_DOCID);
+ }
}
- case MYSQL_TYPE_NEWDECIMAL:
- return(DATA_FIXBINARY);
- case MYSQL_TYPE_LONG:
- case MYSQL_TYPE_LONGLONG:
- case MYSQL_TYPE_TINY:
- case MYSQL_TYPE_SHORT:
- case MYSQL_TYPE_INT24:
- case MYSQL_TYPE_DATE:
- case MYSQL_TYPE_YEAR:
- case MYSQL_TYPE_NEWDATE:
- return(DATA_INT);
-
- case MYSQL_TYPE_TIME:
- case MYSQL_TYPE_DATETIME:
- case MYSQL_TYPE_TIMESTAMP:
- /*
- XtraDB should ideally just check field->keytype() and never
- field->type(). The following check is here to only
- change the new hires datetime/timestamp/time fields to
- use DATA_FIXBINARY. We can't convert this function to
- just test for field->keytype() as then the check if a
- table is compatible will fail for old tables.
- */
- if (field->key_type() == HA_KEYTYPE_BINARY)
- return(DATA_FIXBINARY);
- return(DATA_INT);
- case MYSQL_TYPE_FLOAT:
- return(DATA_FLOAT);
- case MYSQL_TYPE_DOUBLE:
- return(DATA_DOUBLE);
- case MYSQL_TYPE_DECIMAL:
- return(DATA_DECIMAL);
- case MYSQL_TYPE_GEOMETRY:
- case MYSQL_TYPE_TINY_BLOB:
- case MYSQL_TYPE_MEDIUM_BLOB:
- case MYSQL_TYPE_BLOB:
- case MYSQL_TYPE_LONG_BLOB:
- return(DATA_BLOB);
- case MYSQL_TYPE_NULL:
- return(DATA_FIXBINARY);
- default:
- ut_error;
- }
-
- return(0);
-}
-
-/*******************************************************************//**
-Writes an unsigned integer value < 64k to 2 bytes, in the little-endian
-storage format. */
-static inline
-void
-innobase_write_to_2_little_endian(
-/*==============================*/
- byte* buf, /*!< in: where to store */
- ulint val) /*!< in: value to write, must be < 64k */
-{
- ut_a(val < 256 * 256);
- buf[0] = (byte)(val & 0xFF);
- buf[1] = (byte)(val / 256);
-}
-/*******************************************************************//**
-Reads an unsigned integer value < 64k from 2 bytes, in the little-endian
-storage format.
-@return value */
-static inline
-uint
-innobase_read_from_2_little_endian(
-/*===============================*/
- const uchar* buf) /*!< in: from where to read */
-{
- return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])));
-}
+ if (field->real_maybe_null()) {
+ if (field->is_null_in_record(old_row)) {
+ o_len = UNIV_SQL_NULL;
+ }
-/*******************************************************************//**
-Stores a key value for a row to a buffer.
-@return key value length as stored in buff */
-#ifdef WITH_WSREP
-UNIV_INTERN
-uint
-wsrep_store_key_val_for_row(
-/*===============================*/
- TABLE* table,
- uint keynr, /*!< in: key number */
- char* buff, /*!< in/out: buffer for the key value (in MySQL
- format) */
- uint buff_len,/*!< in: buffer length */
- const uchar* record,
- ibool* key_is_null)/*!< out: full key was null */
-{
- KEY* key_info = table->key_info + keynr;
- KEY_PART_INFO* key_part = key_info->key_part;
- KEY_PART_INFO* end = key_part + key_info->key_parts;
- char* buff_start = buff;
- enum_field_types mysql_type;
- Field* field;
-
- DBUG_ENTER("store_key_val_for_row");
+ if (field->is_null_in_record(new_row)) {
+ n_len = UNIV_SQL_NULL;
+ }
+ }
- bzero(buff, buff_len);
- *key_is_null = TRUE;
+ if (o_len != n_len || (o_len != 0 && o_len != UNIV_SQL_NULL
+ && 0 != memcmp(o_ptr, n_ptr, o_len))) {
+ /* The field has changed */
- for (; key_part != end; key_part++) {
- uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
- ibool part_is_null = FALSE;
+ ufield = uvect->fields + n_changed;
+ UNIV_MEM_INVALID(ufield, sizeof *ufield);
- if (key_part->null_bit) {
- if (record[key_part->null_offset] &
- key_part->null_bit) {
- *buff = 1;
- part_is_null = TRUE;
+ /* Let us use a dummy dfield to make the conversion
+ from the MySQL column format to the InnoDB format */
+
+ if (n_len != UNIV_SQL_NULL) {
+ dict_col_copy_type(prebuilt->table->cols + innodb_idx,
+ dfield_get_type(&dfield));
+
+ buf = row_mysql_store_col_in_innobase_format(
+ &dfield,
+ (byte*) buf,
+ TRUE,
+ new_mysql_row_col,
+ col_pack_len,
+ dict_table_is_comp(prebuilt->table));
+ dfield_copy(&ufield->new_val, &dfield);
} else {
- *buff = 0;
+ dfield_set_null(&ufield->new_val);
}
- buff++;
- }
- if (!part_is_null) *key_is_null = FALSE;
- field = key_part->field;
- mysql_type = field->type();
+ ufield->exp = NULL;
+ ufield->orig_len = 0;
+ ufield->field_no = dict_col_get_clust_pos(
+ &prebuilt->table->cols[innodb_idx], clust_index);
+ n_changed++;
- if (mysql_type == MYSQL_TYPE_VARCHAR) {
- /* >= 5.0.3 true VARCHAR */
- ulint lenlen;
- ulint len;
- const byte* data;
- ulint key_len;
- ulint true_len;
- CHARSET_INFO* cs;
- int error=0;
+ /* If an FTS indexed column was changed by this
+ UPDATE then we need to inform the FTS sub-system.
- key_len = key_part->length;
+ NOTE: Currently we re-index all FTS indexed columns
+ even if only a subset of the FTS indexed columns
+ have been updated. That is the reason we are
+ checking only once here. Later we will need to
+ note which columns have been updated and do
+ selective processing. */
+ if (prebuilt->table->fts != NULL) {
+ ulint offset;
+ dict_table_t* innodb_table;
- if (part_is_null) {
- buff += key_len + 2;
+ innodb_table = prebuilt->table;
- continue;
+ if (!changes_fts_column) {
+ offset = row_upd_changes_fts_column(
+ innodb_table, ufield);
+
+ if (offset != ULINT_UNDEFINED) {
+ changes_fts_column = TRUE;
+ }
+ }
+
+ if (!changes_fts_doc_col) {
+ changes_fts_doc_col =
+ row_upd_changes_doc_id(
+ innodb_table, ufield);
+ }
+ }
+ }
+ if (field->stored_in_db)
+ innodb_idx++;
+ }
+
+ /* If the update changes a column with an FTS index on it, we
+ then add an update column node with a new document id to the
+ other changes. We piggy back our changes on the normal UPDATE
+ to reduce processing and IO overhead. */
+ if (!prebuilt->table->fts) {
+ trx->fts_next_doc_id = 0;
+ } else if (changes_fts_column || changes_fts_doc_col) {
+ dict_table_t* innodb_table = prebuilt->table;
+
+ ufield = uvect->fields + n_changed;
+
+ if (!DICT_TF2_FLAG_IS_SET(
+ innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ /* If Doc ID is managed by user, and if any
+ FTS indexed column has been updated, its corresponding
+ Doc ID must also be updated. Otherwise, return
+ error */
+ if (changes_fts_column && !changes_fts_doc_col) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: A new Doc ID"
+ " must be supplied while updating"
+ " FTS indexed columns.\n");
+ return(DB_FTS_INVALID_DOCID);
}
- cs = field->charset();
- lenlen = (ulint)
- (((Field_varstring*)field)->length_bytes);
+ /* Doc ID must monotonically increase */
+ ut_ad(innodb_table->fts->cache);
+ if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
+ fprintf(stderr,
+ "InnoDB: FTS Doc ID must be larger than"
+ " " IB_ID_FMT " for table",
+ innodb_table->fts->cache->next_doc_id
+ - 1);
+ ut_print_name(stderr, trx,
+ TRUE, innodb_table->name);
+ putc('\n', stderr);
+
+ return(DB_FTS_INVALID_DOCID);
+ } else if ((doc_id
+ - prebuilt->table->fts->cache->next_doc_id)
+ >= FTS_DOC_ID_MAX_STEP) {
+ fprintf(stderr,
+ "InnoDB: Doc ID " UINT64PF " is too"
+ " big. Its difference with largest"
+ " Doc ID used " UINT64PF " cannot"
+ " exceed or equal to %d\n",
+ doc_id,
+ prebuilt->table->fts->cache->next_doc_id - 1,
+ FTS_DOC_ID_MAX_STEP);
+ }
- data = row_mysql_read_true_varchar(&len,
- (byte*) (record
- + (ulint)get_field_offset(table, field)),
- lenlen);
- true_len = len;
+ trx->fts_next_doc_id = doc_id;
+ } else {
+ /* If the Doc ID is a hidden column, it can't be
+ changed by user */
+ ut_ad(!changes_fts_doc_col);
- /* For multi byte character sets we need to calculate
- the true length of the key */
+ /* Doc ID column is hidden, a new Doc ID will be
+ generated by following fts_update_doc_id() call */
+ trx->fts_next_doc_id = 0;
+ }
- if (len > 0 && cs->mbmaxlen > 1) {
- true_len = (ulint) cs->cset->well_formed_len(cs,
- (const char *) data,
- (const char *) data + len,
- (uint) (key_len /
- cs->mbmaxlen),
- &error);
- }
+ fts_update_doc_id(
+ innodb_table, ufield, &trx->fts_next_doc_id);
- /* In a column prefix index, we may need to truncate
- the stored value: */
+ ++n_changed;
+ } else {
+ /* We have a Doc ID column, but none of FTS indexed
+ columns are touched, nor the Doc ID column, so set
+ fts_next_doc_id to UINT64_UNDEFINED, which means do not
+ update the Doc ID column */
+ trx->fts_next_doc_id = UINT64_UNDEFINED;
+ }
- if (true_len > key_len) {
- true_len = key_len;
- }
+ uvect->n_fields = n_changed;
+ uvect->info_bits = 0;
- memcpy(sorted, data, true_len);
- true_len = wsrep_innobase_mysql_sort(
- mysql_type, cs->number, sorted, true_len,
- REC_VERSION_56_MAX_INDEX_COL_LEN);
+ ut_a(buf <= (byte*) original_upd_buff + buff_len);
- if (wsrep_protocol_version > 1) {
- memcpy(buff, sorted, true_len);
- /* Note that we always reserve the maximum possible
- length of the true VARCHAR in the key value, though
- only len first bytes after the 2 length bytes contain
- actual data. The rest of the space was reset to zero
- in the bzero() call above. */
- buff += true_len;
- } else {
- buff += key_len;
- }
- } else if (mysql_type == MYSQL_TYPE_TINY_BLOB
- || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
- || mysql_type == MYSQL_TYPE_BLOB
- || mysql_type == MYSQL_TYPE_LONG_BLOB
- /* MYSQL_TYPE_GEOMETRY data is treated
- as BLOB data in innodb. */
- || mysql_type == MYSQL_TYPE_GEOMETRY) {
+ return(DB_SUCCESS);
+}
- CHARSET_INFO* cs;
- ulint key_len;
- ulint true_len;
- int error=0;
- ulint blob_len;
- const byte* blob_data;
+#ifdef WITH_WSREP
+static
+int
+wsrep_calc_row_hash(
+/*================*/
+ byte* digest, /*!< in/out: md5 sum */
+ const uchar* row, /*!< in: row in MySQL format */
+ TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */
+ THD* thd) /*!< in: user thread */
+{
+ Field* field;
+ enum_field_types field_mysql_type;
+ uint n_fields;
+ ulint len;
+ const byte* ptr;
+ ulint col_type;
+ uint i;
- ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+ void *ctx = wsrep_md5_init();
- key_len = key_part->length;
+ n_fields = table->s->fields;
- if (part_is_null) {
- buff += key_len + 2;
+ for (i = 0; i < n_fields; i++) {
+ byte null_byte=0;
+ byte true_byte=1;
- continue;
- }
+ field = table->field[i];
- cs = field->charset();
+ ptr = (const byte*) row + get_field_offset(table, field);
+ len = field->pack_length();
- blob_data = row_mysql_read_blob_ref(&blob_len,
- (byte*) (record
- + (ulint)get_field_offset(table, field)),
- (ulint) field->pack_length());
+ field_mysql_type = field->type();
- true_len = blob_len;
+ col_type = prebuilt->table->cols[i].mtype;
- ut_a(get_field_offset(table, field)
- == key_part->offset);
+ switch (col_type) {
- /* For multi byte character sets we need to calculate
- the true length of the key */
+ case DATA_BLOB:
+ ptr = row_mysql_read_blob_ref(&len, ptr, len);
+ break;
- if (blob_len > 0 && cs->mbmaxlen > 1) {
- true_len = (ulint) cs->cset->well_formed_len(cs,
- (const char *) blob_data,
- (const char *) blob_data
- + blob_len,
- (uint) (key_len /
- cs->mbmaxlen),
- &error);
- }
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
- /* All indexes on BLOB and TEXT are column prefix
- indexes, and we may need to truncate the data to be
- stored in the key value: */
+ ptr = row_mysql_read_true_varchar(
+ &len, ptr,
+ (ulint)
+ (((Field_varstring*)field)->length_bytes));
- if (true_len > key_len) {
- true_len = key_len;
}
- memcpy(sorted, blob_data, true_len);
- true_len = wsrep_innobase_mysql_sort(
- mysql_type, cs->number, sorted, true_len,
- REC_VERSION_56_MAX_INDEX_COL_LEN);
+ break;
+ default:
+ ;
+ }
+ /*
+ if (field->null_ptr &&
+ field_in_record_is_null(table, field, (char*) row)) {
+ */
+
+ if (field->is_null_in_record(row)) {
+ wsrep_md5_update(ctx, (char*)&null_byte, 1);
+ } else {
+ wsrep_md5_update(ctx, (char*)&true_byte, 1);
+ wsrep_md5_update(ctx, (char*)ptr, len);
+ }
+ }
+
+ wsrep_compute_md5_hash((char*)digest, ctx);
+
+ return(0);
+}
+#endif /* WITH_WSREP */
+/**********************************************************************//**
+Updates a row given as a parameter to a new value. Note that we are given
+whole rows, not just the fields which are updated: this incurs some
+overhead for CPU when we check which fields are actually updated.
+TODO: currently InnoDB does not prevent the 'Halloween problem':
+in a searched update a single row can get updated several times
+if its index columns are updated!
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::update_row(
+/*====================*/
+ const uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row) /*!< in: new row in MySQL format */
+{
+ upd_t* uvect;
+ dberr_t error;
+ trx_t* trx = thd_to_trx(user_thd);
- memcpy(buff, sorted, true_len);
+ DBUG_ENTER("ha_innobase::update_row");
- /* Note that we always reserve the maximum possible
- length of the BLOB prefix in the key value. */
- if (wsrep_protocol_version > 1) {
- buff += true_len;
- } else {
- buff += key_len;
- }
- } else {
- /* Here we handle all other data types except the
- true VARCHAR, BLOB and TEXT. Note that the column
- value we store may be also in a column prefix
- index. */
+ ut_a(prebuilt->trx == trx);
- CHARSET_INFO* cs;
- ulint true_len;
- ulint key_len;
- const uchar* src_start;
- int error=0;
- enum_field_types real_type;
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
- key_len = key_part->length;
+ if (upd_buf == NULL) {
+ ut_ad(upd_buf_size == 0);
- if (part_is_null) {
- buff += key_len;
+ /* Create a buffer for packing the fields of a record. Why did
+ table->stored_rec_length not work here? Obviously, because char
+ fields, when packed, actually became 1 byte longer when we also
+ stored the string length as the first byte. */
- continue;
- }
+ upd_buf_size = table->s->stored_rec_length + table->s->max_key_length
+ + MAX_REF_PARTS * 3;
+ upd_buf = (uchar*) my_malloc(upd_buf_size, MYF(MY_WME));
+ if (upd_buf == NULL) {
+ upd_buf_size = 0;
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+ }
- src_start = record + key_part->offset;
- real_type = field->real_type();
- true_len = key_len;
+ ha_statistic_increment(&SSV::ha_update_count);
- /* Character set for the field is defined only
- to fields whose type is string and real field
- type is not enum or set. For these fields check
- if character set is multi byte. */
+ if (share->ib_table != prebuilt->table) {
+ fprintf(stderr,
+ "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
+ share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
+ }
- if (real_type != MYSQL_TYPE_ENUM
- && real_type != MYSQL_TYPE_SET
- && ( mysql_type == MYSQL_TYPE_VAR_STRING
- || mysql_type == MYSQL_TYPE_STRING)) {
+ if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
- cs = field->charset();
+ if (prebuilt->upd_node) {
+ uvect = prebuilt->upd_node->update;
+ } else {
+ uvect = row_get_prebuilt_update_vector(prebuilt);
+ }
- /* For multi byte character sets we need to
- calculate the true length of the key */
+ /* Build an update vector from the modified fields in the rows
+ (uses upd_buf of the handle) */
- if (key_len > 0 && cs->mbmaxlen > 1) {
+ error = calc_row_difference(uvect, (uchar*) old_row, new_row, table,
+ upd_buf, upd_buf_size, prebuilt, user_thd);
- true_len = (ulint)
- cs->cset->well_formed_len(cs,
- (const char *)src_start,
- (const char *)src_start
- + key_len,
- (uint) (key_len /
- cs->mbmaxlen),
- &error);
- }
- memcpy(sorted, src_start, true_len);
- true_len = wsrep_innobase_mysql_sort(
- mysql_type, cs->number, sorted, true_len,
- REC_VERSION_56_MAX_INDEX_COL_LEN);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
- memcpy(buff, sorted, true_len);
- } else {
- memcpy(buff, src_start, true_len);
- }
- buff += true_len;
+ /* This is not a delete */
+ prebuilt->upd_node->is_delete = FALSE;
- /* Pad the unused space with spaces. */
+ ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
-#ifdef REMOVED
- if (true_len < key_len) {
- ulint pad_len = key_len - true_len;
- ut_a(!(pad_len % cs->mbminlen));
+ innobase_srv_conc_enter_innodb(trx);
- cs->cset->fill(cs, buff, pad_len,
- 0x20 /* space */);
- buff += pad_len;
- }
-#endif /* REMOVED */
- }
- }
+ error = row_update_for_mysql((byte*) old_row, prebuilt);
- ut_a(buff <= buff_start + buff_len);
+ /* We need to do some special AUTOINC handling for the following case:
- DBUG_RETURN((uint)(buff - buff_start));
-}
-#endif /* WITH_WSREP */
-UNIV_INTERN
-uint
-ha_innobase::store_key_val_for_row(
-/*===============================*/
- uint keynr, /*!< in: key number */
- char* buff, /*!< in/out: buffer for the key value (in MySQL
- format) */
- uint buff_len,/*!< in: buffer length */
- const uchar* record)/*!< in: row in MySQL format */
-{
- KEY* key_info = table->key_info + keynr;
- KEY_PART_INFO* key_part = key_info->key_part;
- KEY_PART_INFO* end = key_part + key_info->key_parts;
- char* buff_start = buff;
- enum_field_types mysql_type;
- Field* field;
- ibool is_null;
+ INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ...
- DBUG_ENTER("store_key_val_for_row");
+ We need to use the AUTOINC counter that was actually used by
+ MySQL in the UPDATE statement, which can be different from the
+ value used in the INSERT statement.*/
- /* The format for storing a key field in MySQL is the following:
+ if (error == DB_SUCCESS
+ && table->next_number_field
+ && new_row == table->record[0]
+ && thd_sql_command(user_thd) == SQLCOM_INSERT
+ && trx->duplicates) {
- 1. If the column can be NULL, then in the first byte we put 1 if the
- field value is NULL, 0 otherwise.
+ ulonglong auto_inc;
+ ulonglong col_max_value;
- 2. If the column is of a BLOB type (it must be a column prefix field
- in this case), then we put the length of the data in the field to the
- next 2 bytes, in the little-endian format. If the field is SQL NULL,
- then these 2 bytes are set to 0. Note that the length of data in the
- field is <= column prefix length.
+ auto_inc = table->next_number_field->val_uint();
- 3. In a column prefix field, prefix_len next bytes are reserved for
- data. In a normal field the max field length next bytes are reserved
- for data. For a VARCHAR(n) the max field length is n. If the stored
- value is the SQL NULL then these data bytes are set to 0.
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
- 4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that
- in the MySQL row format, the length is stored in 1 or 2 bytes,
- depending on the maximum allowed length. But in the MySQL key value
- format, the length always takes 2 bytes.
+ if (auto_inc <= col_max_value && auto_inc != 0) {
- We have to zero-fill the buffer so that MySQL is able to use a
- simple memcmp to compare two key values to determine if they are
- equal. MySQL does this to compare contents of two 'ref' values. */
+ ulonglong offset;
+ ulonglong increment;
- bzero(buff, buff_len);
+ offset = prebuilt->autoinc_offset;
+ increment = prebuilt->autoinc_increment;
- for (; key_part != end; key_part++) {
- is_null = FALSE;
+ auto_inc = innobase_next_autoinc(
+ auto_inc, 1, increment, offset, col_max_value);
- if (key_part->null_bit) {
- if (record[key_part->null_offset]
- & key_part->null_bit) {
- *buff = 1;
- is_null = TRUE;
- } else {
- *buff = 0;
- }
- buff++;
+ error = innobase_set_max_autoinc(auto_inc);
}
+ }
- field = key_part->field;
- mysql_type = field->type();
+ innobase_srv_conc_exit_innodb(trx);
- if (mysql_type == MYSQL_TYPE_VARCHAR) {
- /* >= 5.0.3 true VARCHAR */
- ulint lenlen;
- ulint len;
- const byte* data;
- ulint key_len;
- ulint true_len;
- CHARSET_INFO* cs;
- int error=0;
+func_exit:
+ int err = convert_error_code_to_mysql(error,
+ prebuilt->table->flags, user_thd);
- key_len = key_part->length;
+ /* If success and no columns were updated. */
+ if (err == 0 && uvect->n_fields == 0) {
- if (is_null) {
- buff += key_len + 2;
+ /* This is the same as success, but instructs
+ MySQL that the row is not really updated and it
+ should not increase the count of updated rows.
+ This is a fix for http://bugs.mysql.com/29157 */
+ err = HA_ERR_RECORD_IS_THE_SAME;
+ } else if (err == HA_FTS_INVALID_DOCID) {
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
+ }
- continue;
- }
- cs = field->charset();
+ /* Tell InnoDB server that there might be work for
+ utility threads: */
- lenlen = (ulint)
- (((Field_varstring*)field)->length_bytes);
+ innobase_active_small();
- data = row_mysql_read_true_varchar(&len,
- (byte*) (record
- + (ulint)get_field_offset(table, field)),
- lenlen);
+#ifdef WITH_WSREP
+ if (error == DB_SUCCESS &&
+ wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
+ wsrep_on(user_thd) &&
+ !wsrep_thd_skip_append_keys(user_thd))
+ {
+ DBUG_PRINT("wsrep", ("update row key"));
- true_len = len;
+ if (wsrep_append_keys(user_thd, false, old_row, new_row)) {
+ WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED");
+ DBUG_PRINT("wsrep", ("row key failed"));
+ err = HA_ERR_INTERNAL_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif /* WITH_WSREP */
- /* For multi byte character sets we need to calculate
- the true length of the key */
+ if (share->ib_table != prebuilt->table) {
+ fprintf(stderr,
+ "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
+ share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
+ }
- if (len > 0 && cs->mbmaxlen > 1) {
- true_len = (ulint) cs->cset->well_formed_len(cs,
- (const char *) data,
- (const char *) data + len,
- (uint) (key_len /
- cs->mbmaxlen),
- &error);
- }
+ if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
- /* In a column prefix index, we may need to truncate
- the stored value: */
+ DBUG_RETURN(err);
+}
- if (true_len > key_len) {
- true_len = key_len;
- }
+/**********************************************************************//**
+Deletes a row given as the parameter.
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::delete_row(
+/*====================*/
+ const uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+ trx_t* trx = thd_to_trx(user_thd);
- /* The length in a key value is always stored in 2
- bytes */
+ DBUG_ENTER("ha_innobase::delete_row");
- row_mysql_store_true_var_len((byte*)buff, true_len, 2);
- buff += 2;
+ ut_a(prebuilt->trx == trx);
- memcpy(buff, data, true_len);
+ if (high_level_read_only) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
- /* Note that we always reserve the maximum possible
- length of the true VARCHAR in the key value, though
- only len first bytes after the 2 length bytes contain
- actual data. The rest of the space was reset to zero
- in the bzero() call above. */
+ ha_statistic_increment(&SSV::ha_delete_count);
- buff += key_len;
+ if (UNIV_UNLIKELY(share && share->ib_table
+ && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
- } else if (mysql_type == MYSQL_TYPE_TINY_BLOB
- || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
- || mysql_type == MYSQL_TYPE_BLOB
- || mysql_type == MYSQL_TYPE_LONG_BLOB
- /* MYSQL_TYPE_GEOMETRY data is treated
- as BLOB data in innodb. */
- || mysql_type == MYSQL_TYPE_GEOMETRY) {
+ if (!prebuilt->upd_node) {
+ row_get_prebuilt_update_vector(prebuilt);
+ }
- CHARSET_INFO* cs;
- ulint key_len;
- ulint true_len;
- int error=0;
- ulint blob_len;
- const byte* blob_data;
+ /* This is a delete */
- ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+ prebuilt->upd_node->is_delete = TRUE;
- key_len = key_part->length;
+ innobase_srv_conc_enter_innodb(trx);
- if (is_null) {
- buff += key_len + 2;
+ error = row_update_for_mysql((byte*) record, prebuilt);
- continue;
- }
+ innobase_srv_conc_exit_innodb(trx);
- cs = field->charset();
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
- blob_data = row_mysql_read_blob_ref(&blob_len,
- (byte*) (record
- + (ulint)get_field_offset(table, field)),
- (ulint) field->pack_length());
+ innobase_active_small();
- true_len = blob_len;
+#ifdef WITH_WSREP
+ if (error == DB_SUCCESS &&
+ wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
+ wsrep_on(user_thd) &&
+ !wsrep_thd_skip_append_keys(user_thd))
+ {
+ if (wsrep_append_keys(user_thd, false, record, NULL)) {
+ DBUG_PRINT("wsrep", ("delete fail"));
+ error = DB_ERROR;
+ goto wsrep_error;
+ }
+ }
+wsrep_error:
+#endif /* WITH_WSREP */
- ut_a(get_field_offset(table, field)
- == key_part->offset);
+ if (UNIV_UNLIKELY(share && share->ib_table
+ && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
- /* For multi byte character sets we need to calculate
- the true length of the key */
+ DBUG_RETURN(convert_error_code_to_mysql(
+ error, prebuilt->table->flags, user_thd));
+}
- if (blob_len > 0 && cs->mbmaxlen > 1) {
- true_len = (ulint) cs->cset->well_formed_len(cs,
- (const char *) blob_data,
- (const char *) blob_data
- + blob_len,
- (uint) (key_len /
- cs->mbmaxlen),
- &error);
- }
+/**********************************************************************//**
+Removes a new lock set on a row, if it was not read optimistically. This can
+be called after a row has been read in the processing of an UPDATE or a DELETE
+query, if the option innodb_locks_unsafe_for_binlog is set. */
+UNIV_INTERN
+void
+ha_innobase::unlock_row(void)
+/*=========================*/
+{
+ DBUG_ENTER("ha_innobase::unlock_row");
- /* All indexes on BLOB and TEXT are column prefix
- indexes, and we may need to truncate the data to be
- stored in the key value: */
+ /* Consistent read does not take any locks, thus there is
+ nothing to unlock. */
- if (true_len > key_len) {
- true_len = key_len;
- }
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+ DBUG_VOID_RETURN;
+ }
- /* MySQL reserves 2 bytes for the length and the
- storage of the number is little-endian */
+ /* Ideally, this assert must be in the beginning of the function.
+ But there are some calls to this function from the SQL layer when the
+ transaction is in state TRX_STATE_NOT_STARTED. The check on
+ prebuilt->select_lock_type above gets around this issue. */
+ ut_ad(trx_state_eq(prebuilt->trx, TRX_STATE_ACTIVE));
- innobase_write_to_2_little_endian(
- (byte*)buff, true_len);
- buff += 2;
+ switch (prebuilt->row_read_type) {
+ case ROW_READ_WITH_LOCKS:
+ if (!srv_locks_unsafe_for_binlog
+ && prebuilt->trx->isolation_level
+ > TRX_ISO_READ_COMMITTED) {
+ break;
+ }
+ /* fall through */
+ case ROW_READ_TRY_SEMI_CONSISTENT:
+ row_unlock_for_mysql(prebuilt, FALSE);
+ break;
+ case ROW_READ_DID_SEMI_CONSISTENT:
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ break;
+ }
- memcpy(buff, blob_data, true_len);
+ DBUG_VOID_RETURN;
+}
- /* Note that we always reserve the maximum possible
- length of the BLOB prefix in the key value. */
+/* See handler.h and row0mysql.h for docs on this function. */
+UNIV_INTERN
+bool
+ha_innobase::was_semi_consistent_read(void)
+/*=======================================*/
+{
+ return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT);
+}
- buff += key_len;
- } else {
- /* Here we handle all other data types except the
- true VARCHAR, BLOB and TEXT. Note that the column
- value we store may be also in a column prefix
- index. */
+/* See handler.h and row0mysql.h for docs on this function. */
+UNIV_INTERN
+void
+ha_innobase::try_semi_consistent_read(bool yes)
+/*===========================================*/
+{
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
- CHARSET_INFO* cs;
- ulint true_len;
- ulint key_len;
- const uchar* src_start;
- int error=0;
- enum_field_types real_type;
+ /* Row read type is set to semi consistent read if this was
+ requested by MySQL and either the innodb_locks_unsafe_for_binlog
+ option is used or this session is using READ COMMITTED (or a lower)
+ isolation level. */
- key_len = key_part->length;
+ if (yes
+ && (srv_locks_unsafe_for_binlog
+ || prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_WITH_LOCKS;
+ }
+}
- if (is_null) {
- buff += key_len;
+/******************************************************************//**
+Initializes a handle to use an index.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::index_init(
+/*====================*/
+ uint keynr, /*!< in: key (index) number */
+ bool sorted) /*!< in: 1 if result MUST be sorted according to index */
+{
+ DBUG_ENTER("index_init");
- continue;
- }
+ DBUG_RETURN(change_active_index(keynr));
+}
- src_start = record + key_part->offset;
- real_type = field->real_type();
- true_len = key_len;
+/******************************************************************//**
+Ends index use: resets active_index, clears in_range_check_pushed_down, closes DS-MRR.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::index_end(void)
+/*========================*/
+{
+ int error = 0;
+ DBUG_ENTER("index_end");
+ active_index = MAX_KEY;
+ in_range_check_pushed_down = FALSE;
+ ds_mrr.dsmrr_close();
+ DBUG_RETURN(error);
+}
- /* Character set for the field is defined only
- to fields whose type is string and real field
- type is not enum or set. For these fields check
- if character set is multi byte. */
+/*********************************************************************//**
+Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB. */
+static inline
+ulint
+convert_search_mode_to_innobase(
+/*============================*/
+ enum ha_rkey_function find_flag)
+{
+ switch (find_flag) {
+ case HA_READ_KEY_EXACT:
+ /* this does not require the index to be UNIQUE */
+ return(PAGE_CUR_GE);
+ case HA_READ_KEY_OR_NEXT:
+ return(PAGE_CUR_GE);
+ case HA_READ_KEY_OR_PREV:
+ return(PAGE_CUR_LE);
+ case HA_READ_AFTER_KEY:
+ return(PAGE_CUR_G);
+ case HA_READ_BEFORE_KEY:
+ return(PAGE_CUR_L);
+ case HA_READ_PREFIX:
+ return(PAGE_CUR_GE);
+ case HA_READ_PREFIX_LAST:
+ return(PAGE_CUR_LE);
+ case HA_READ_PREFIX_LAST_OR_PREV:
+ return(PAGE_CUR_LE);
+ /* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always
+ pass a complete-field prefix of a key value as the search
+ tuple. I.e., it is not allowed that the last field would
+ just contain n first bytes of the full field value.
+ MySQL uses a 'padding' trick to convert LIKE 'abc%'
+ type queries so that it can use as a search tuple
+ a complete-field-prefix of a key value. Thus, the InnoDB
+ search mode PAGE_CUR_LE_OR_EXTENDS is never used.
+ TODO: when/if MySQL starts to use also partial-field
+ prefixes, we have to deal with stripping of spaces
+ and comparison of non-latin1 char type fields in
+ innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to
+ work correctly. */
+ case HA_READ_MBR_CONTAIN:
+ case HA_READ_MBR_INTERSECT:
+ case HA_READ_MBR_WITHIN:
+ case HA_READ_MBR_DISJOINT:
+ case HA_READ_MBR_EQUAL:
+ return(PAGE_CUR_UNSUPP);
+ /* do not use "default:" in order to produce a gcc warning:
+ enumeration value '...' not handled in switch
+ (if -Wswitch or -Wall is used) */
+ }
- if (real_type != MYSQL_TYPE_ENUM
- && real_type != MYSQL_TYPE_SET
- && ( mysql_type == MYSQL_TYPE_VAR_STRING
- || mysql_type == MYSQL_TYPE_STRING)) {
+ my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
- cs = field->charset();
+ return(PAGE_CUR_UNSUPP);
+}
+
+/*
+ BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED
+ ---------------------------------------------------
+The following does not cover all the details, but explains how we determine
+the start of a new SQL statement, and what is associated with it.
- /* For multi byte character sets we need to
- calculate the true length of the key */
+For each table in the database the MySQL interpreter may have several
+table handle instances in use, also in a single SQL query. For each table
+handle instance there is an InnoDB 'prebuilt' struct which contains most
+of the InnoDB data associated with this table handle instance.
- if (key_len > 0 && cs->mbmaxlen > 1) {
+ A) if the user has not explicitly set any MySQL table level locks:
- true_len = (ulint)
- cs->cset->well_formed_len(cs,
- (const char *)src_start,
- (const char *)src_start
- + key_len,
- (uint) (key_len /
- cs->mbmaxlen),
- &error);
- }
- }
+ 1) MySQL calls ::external_lock to set an 'intention' table level lock on
+the table of the handle instance. There we set
+prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
+true if we are taking this table handle instance to use in a new SQL
+statement issued by the user. We also increment trx->n_mysql_tables_in_use.
- memcpy(buff, src_start, true_len);
- buff += true_len;
+ 2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
+instructions to prebuilt->template of the table handle instance in
+::index_read. The template is used to save CPU time in large joins.
- /* Pad the unused space with spaces. */
+ 3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we
+allocate a new consistent read view for the trx if it does not yet have one,
+or in the case of a locking read, set an InnoDB 'intention' table level
+lock on the table.
- if (true_len < key_len) {
- ulint pad_len = key_len - true_len;
- ut_a(!(pad_len % cs->mbminlen));
+ 4) We do the SELECT. MySQL may repeatedly call ::index_read for the
+same table handle instance, if it is a join.
- cs->cset->fill(cs, buff, pad_len,
- 0x20 /* space */);
- buff += pad_len;
- }
- }
- }
+ 5) When the SELECT ends, MySQL removes its intention table level locks
+in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
+ (a) we execute a COMMIT there if the autocommit is on,
+ (b) we also release possible 'SQL statement level resources' InnoDB may
+have for this SQL statement. The MySQL interpreter does NOT execute
+autocommit for pure read transactions, though it should. That is why the
+table handler in that case has to execute the COMMIT in ::external_lock.
- ut_a(buff <= buff_start + buff_len);
+ B) If the user has explicitly set MySQL table level locks, then MySQL
+does NOT call ::external_lock at the start of the statement. To determine
+when we are at the start of a new SQL statement, at the start of
+::index_read we also compare the query id to the latest query id where the
+table handle instance was used. If it has changed, we know we are at the
+start of a new SQL statement. Since the query id can theoretically
+wrap around, we use this test only as a secondary way of determining the
+start of a new SQL statement. */
- DBUG_RETURN((uint)(buff - buff_start));
-}
-/**************************************************************//**
-Determines if a field is needed in a prebuilt struct 'template'.
-@return field to use, or NULL if the field is not needed */
-static
-const Field*
-build_template_needs_field(
-/*=======================*/
- ibool index_contains, /*!< in:
- dict_index_contains_col_or_prefix(
- index, i) */
- ibool read_just_key, /*!< in: TRUE when MySQL calls
- ha_innobase::extra with the
- argument HA_EXTRA_KEYREAD; it is enough
- to read just columns defined in
- the index (i.e., no read of the
- clustered index record necessary) */
- ibool fetch_all_in_key,
- /*!< in: true=fetch all fields in
- the index */
- ibool fetch_primary_key_cols,
- /*!< in: true=fetch the
- primary key columns */
- dict_index_t* index, /*!< in: InnoDB index to use */
- const TABLE* table, /*!< in: MySQL table object */
- ulint i, /*!< in: field index in InnoDB table */
- ulint sql_idx) /*!< in: field index in SQL table */
+/**********************************************************************//**
+Positions an index cursor to the index specified in the handle. Fetches the
+row if any.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_read(
+/*====================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ const uchar* key_ptr, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index; this can
+ also contain an InnoDB row id, in
+ which case key_len is the InnoDB
+ row id length; the key value can
+ also be a prefix of a full key value,
+ and the last column can be a prefix
+ of a full column */
+ uint key_len,/*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
{
- const Field* field = table->field[sql_idx];
+ ulint mode;
+ dict_index_t* index;
+ ulint match_mode = 0;
+ int error;
+ dberr_t ret;
- ut_ad(index_contains == dict_index_contains_col_or_prefix(index, i));
+ DBUG_ENTER("index_read");
+ DEBUG_SYNC_C("ha_innobase_index_read_begin");
- if (!index_contains) {
- if (read_just_key) {
- /* If this is a 'key read', we do not need
- columns that are not in the key */
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+ ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
- return(NULL);
- }
- } else if (fetch_all_in_key) {
- /* This field is needed in the query */
+ ha_statistic_increment(&SSV::ha_read_key_count);
- return(field);
+ if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+ && share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
}
- if (bitmap_is_set(table->read_set, sql_idx)
- || bitmap_is_set(table->write_set, sql_idx)) {
- /* This field is needed in the query */
+ index = prebuilt->index;
- return(field);
+ if (UNIV_UNLIKELY(index == NULL) || dict_index_is_corrupted(index)) {
+ prebuilt->index_usable = FALSE;
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ DBUG_RETURN(dict_index_is_corrupted(index)
+ ? HA_ERR_INDEX_CORRUPT
+ : HA_ERR_TABLE_DEF_CHANGED);
}
- if (fetch_primary_key_cols
- && dict_table_col_in_clustered_key(index->table, i)) {
- /* This field is needed in the query */
+ if (index->type & DICT_FTS) {
+ DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+ }
- return(field);
+ /* Note that the index for which the search template is built is not
+ necessarily prebuilt->index, but can also be the clustered index */
+
+ if (prebuilt->sql_stat_start) {
+ build_template(false);
}
- /* This field is not needed in the query, skip it */
+ if (key_ptr) {
+ /* Convert the search key value to InnoDB format into
+ prebuilt->search_tuple */
- return(NULL);
-}
+ row_sel_convert_mysql_key_to_innobase(
+ prebuilt->search_tuple,
+ prebuilt->srch_key_val1,
+ prebuilt->srch_key_val_len,
+ index,
+ (byte*) key_ptr,
+ (ulint) key_len,
+ prebuilt->trx);
+ DBUG_ASSERT(prebuilt->search_tuple->n_fields > 0);
+ } else {
+ /* We position the cursor to the last or the first entry
+ in the index */
-/**************************************************************//**
-Adds a field is to a prebuilt struct 'template'.
-@return the field template */
-static
-mysql_row_templ_t*
-build_template_field(
-/*=================*/
- row_prebuilt_t* prebuilt, /*!< in/out: template */
- dict_index_t* clust_index, /*!< in: InnoDB clustered index */
- dict_index_t* index, /*!< in: InnoDB index to use */
- TABLE* table, /*!< in: MySQL table object */
- const Field* field, /*!< in: field in MySQL table */
- ulint i) /*!< in: field index in InnoDB table */
-{
- mysql_row_templ_t* templ;
- const dict_col_t* col;
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+ }
- //ut_ad(field == table->field[i]);
- ut_ad(clust_index->table == index->table);
+ mode = convert_search_mode_to_innobase(find_flag);
- col = dict_table_get_nth_col(index->table, i);
+ match_mode = 0;
- templ = prebuilt->mysql_template + prebuilt->n_template++;
- UNIV_MEM_INVALID(templ, sizeof *templ);
- templ->col_no = i;
- templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index);
- ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+ if (find_flag == HA_READ_KEY_EXACT) {
- if (dict_index_is_clust(index)) {
- templ->rec_field_no = templ->clust_rec_field_no;
- } else {
- templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
- }
+ match_mode = ROW_SEL_EXACT;
- if (field->null_ptr) {
- templ->mysql_null_byte_offset =
- (ulint) ((char*) field->null_ptr
- - (char*) table->record[0]);
+ } else if (find_flag == HA_READ_PREFIX
+ || find_flag == HA_READ_PREFIX_LAST) {
- templ->mysql_null_bit_mask = (ulint) field->null_bit;
- } else {
- templ->mysql_null_bit_mask = 0;
+ match_mode = ROW_SEL_EXACT_PREFIX;
}
- templ->mysql_col_offset = (ulint) get_field_offset(table, field);
+ last_match_mode = (uint) match_mode;
- templ->mysql_col_len = (ulint) field->pack_length();
- templ->type = col->mtype;
- templ->mysql_type = (ulint)field->type();
+ if (mode != PAGE_CUR_UNSUPP) {
- if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
- templ->mysql_length_bytes = (ulint)
- (((Field_varstring*)field)->length_bytes);
- }
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
- templ->charset = dtype_get_charset_coll(col->prtype);
- templ->mbminlen = col->mbminlen;
- templ->mbmaxlen = col->mbmaxlen;
- templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+ ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
+ match_mode, 0);
- if (!dict_index_is_clust(index)
- && templ->rec_field_no == ULINT_UNDEFINED) {
- prebuilt->need_to_access_clustered = TRUE;
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+ } else {
+
+ ret = DB_UNSUPPORTED;
}
- if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
- + templ->mysql_col_len) {
- prebuilt->mysql_prefix_len = templ->mysql_col_offset
- + templ->mysql_col_len;
+ if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+ && share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
}
- if (templ->type == DATA_BLOB) {
- prebuilt->templ_contains_blob = TRUE;
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ if (prebuilt->table->is_system_db) {
+ srv_stats.n_system_rows_read.add(
+ (size_t) prebuilt->trx->id, 1);
+ } else {
+ srv_stats.n_rows_read.add(
+ (size_t) prebuilt->trx->id, 1);
+ }
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING, MYF(0),
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, prebuilt->table->flags, user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
}
- return(templ);
+ DBUG_RETURN(error);
}
-/**************************************************************//**
-Builds a 'template' to the prebuilt struct. The template is used in fast
-retrieval of just those column values MySQL needs in its processing. */
+/*******************************************************************//**
+The following function works like index_read, but it finds the last
+row with the current key value or prefix.
+@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */
UNIV_INTERN
-void
-ha_innobase::build_template(
-/*========================*/
- bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW,
- false=ROW_MYSQL_REC_FIELDS */
+int
+ha_innobase::index_read_last(
+/*=========================*/
+ uchar* buf, /*!< out: fetched row */
+ const uchar* key_ptr,/*!< in: key value, or a prefix of a full
+ key value */
+ uint key_len)/*!< in: length of the key val or prefix
+ in bytes */
{
- dict_index_t* index;
- dict_index_t* clust_index;
- ulint n_stored_fields;
- ibool fetch_all_in_key = FALSE;
- ibool fetch_primary_key_cols = FALSE;
- ulint i, sql_idx;
-
- if (prebuilt->select_lock_type == LOCK_X) {
- /* We always retrieve the whole clustered index record if we
- use exclusive row level locks, for example, if the read is
- done in an UPDATE statement. */
+ return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
- whole_row = true;
- } else if (!whole_row) {
- if (prebuilt->hint_need_to_fetch_extra_cols
- == ROW_RETRIEVE_ALL_COLS) {
+/********************************************************************//**
+Get the index for a handle. Does not change active index.
+@return NULL or index instance. */
+UNIV_INTERN
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always
+ clustered index, even if it was internally
+ generated by InnoDB */
+{
+ KEY* key = 0;
+ dict_index_t* index = 0;
- /* We know we must at least fetch all columns in the
- key, or all columns in the table */
+ DBUG_ENTER("innobase_get_index");
- if (prebuilt->read_just_key) {
- /* MySQL has instructed us that it is enough
- to fetch the columns in the key; looks like
- MySQL can set this flag also when there is
- only a prefix of the column in the key: in
- that case we retrieve the whole column from
- the clustered index */
+ if (keynr != MAX_KEY && table->s->keys > 0) {
+ key = table->key_info + keynr;
- fetch_all_in_key = TRUE;
- } else {
- whole_row = true;
- }
- } else if (prebuilt->hint_need_to_fetch_extra_cols
- == ROW_RETRIEVE_PRIMARY_KEY) {
- /* We must at least fetch all primary key cols. Note
- that if the clustered index was internally generated
- by InnoDB on the row id (no primary key was
- defined), then row_search_for_mysql() will always
- retrieve the row id to a special buffer in the
- prebuilt struct. */
+ index = innobase_index_lookup(share, keynr);
- fetch_primary_key_cols = TRUE;
- }
- }
+ if (index) {
- clust_index = dict_table_get_first_index(prebuilt->table);
+ if (!key || ut_strcmp(index->name, key->name) != 0) {
+ fprintf(stderr, "InnoDB: [Error] Index for key no %u"
+ " mysql name %s , InnoDB name %s for table %s\n",
+ keynr, key ? key->name : "NULL",
+ index->name,
+ prebuilt->table->name);
- index = whole_row ? clust_index : prebuilt->index;
+ for(ulint i=0; i < table->s->keys; i++) {
+ index = innobase_index_lookup(share, i);
+ key = table->key_info + keynr;
- prebuilt->need_to_access_clustered = (index == clust_index);
+ if (index) {
- /* Below we check column by column if we need to access
- the clustered index. */
+ fprintf(stderr, "InnoDB: [Note] Index for key no %u"
+ " mysql name %s , InnoDB name %s for table %s\n",
+ keynr, key ? key->name : "NULL",
+ index->name,
+ prebuilt->table->name);
+ }
+ }
+ }
- n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
+ ut_a(ut_strcmp(index->name, key->name) == 0);
+ } else {
+ /* Can't find index with keynr in the translation
+ table. Only print message if the index translation
+ table exists */
+ if (share->idx_trans_tbl.index_mapping) {
+ sql_print_warning("InnoDB could not find "
+ "index %s key no %u for "
+ "table %s through its "
+ "index translation table",
+ key ? key->name : "NULL",
+ keynr,
+ prebuilt->table->name);
+ }
- if (!prebuilt->mysql_template) {
- prebuilt->mysql_template = (mysql_row_templ_t*)
- mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
+ index = dict_table_get_index_on_name(prebuilt->table,
+ key->name);
+ }
+ } else {
+ index = dict_table_get_first_index(prebuilt->table);
}
- prebuilt->template_type = whole_row
- ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
- prebuilt->null_bitmap_len = table->s->null_bytes;
-
- /* Prepare to build prebuilt->mysql_template[]. */
- prebuilt->templ_contains_blob = FALSE;
- prebuilt->mysql_prefix_len = 0;
- prebuilt->n_template = 0;
- prebuilt->idx_cond_n_cols = 0;
+ if (!index) {
+ sql_print_error(
+ "Innodb could not find key n:o %u with name %s "
+ "from dict cache for table %s",
+ keynr, key ? key->name : "NULL",
+ prebuilt->table->name);
+ }
- /* Note that in InnoDB, i is the column number in the table.
- MySQL calls columns 'fields'. */
+ DBUG_RETURN(index);
+}
- if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
- /* Push down an index condition or an end_range check. */
- for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
+/********************************************************************//**
+Changes the active index of a handle.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::change_active_index(
+/*=============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always clustered
+ index, even if it was internally generated by
+ InnoDB */
+{
+ DBUG_ENTER("change_active_index");
- while (!table->field[sql_idx]->stored_in_db) {
- sql_idx++;
- }
-
- const ibool index_contains
- = dict_index_contains_col_or_prefix(index, i);
+ if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+ && share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
- /* Test if an end_range or an index condition
- refers to the field. Note that "index" and
- "index_contains" may refer to the clustered index.
- Index condition pushdown is relative to prebuilt->index
- (the index that is being looked up first). */
+ ut_ad(user_thd == ha_thd());
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
- /* When join_read_always_key() invokes this
- code via handler::ha_index_init() and
- ha_innobase::index_init(), end_range is not
- yet initialized. Because of that, we must
- always check for index_contains, instead of
- the subset
- field->part_of_key.is_set(active_index)
- which would be acceptable if end_range==NULL. */
- if (index == prebuilt->index
- ? index_contains
- : dict_index_contains_col_or_prefix(
- prebuilt->index, i)) {
- /* Needed in ICP */
- const Field* field;
- mysql_row_templ_t* templ;
+ active_index = keynr;
- if (whole_row) {
- field = table->field[sql_idx];
- } else {
- field = build_template_needs_field(
- index_contains,
- prebuilt->read_just_key,
- fetch_all_in_key,
- fetch_primary_key_cols,
- index, table, i, sql_idx);
- if (!field) {
- continue;
- }
- }
+ prebuilt->index = innobase_get_index(keynr);
- templ = build_template_field(
- prebuilt, clust_index, index,
- table, field, i);
- prebuilt->idx_cond_n_cols++;
- ut_ad(prebuilt->idx_cond_n_cols
- == prebuilt->n_template);
+ if (UNIV_UNLIKELY(!prebuilt->index)) {
+ sql_print_warning("InnoDB: change_active_index(%u) failed",
+ keynr);
+ prebuilt->index_usable = FALSE;
+ DBUG_RETURN(1);
+ }
- if (index == prebuilt->index) {
- templ->icp_rec_field_no
- = templ->rec_field_no;
- } else {
- templ->icp_rec_field_no
- = dict_index_get_nth_col_pos(
- prebuilt->index, i);
- }
+ prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx,
+ prebuilt->index);
- if (dict_index_is_clust(prebuilt->index)) {
- ut_ad(templ->icp_rec_field_no
- != ULINT_UNDEFINED);
- /* If the primary key includes
- a column prefix, use it in
- index condition pushdown,
- because the condition is
- evaluated before fetching any
- off-page (externally stored)
- columns. */
- if (templ->icp_rec_field_no
- < prebuilt->index->n_uniq) {
- /* This is a key column;
- all set. */
- continue;
- }
- } else if (templ->icp_rec_field_no
- != ULINT_UNDEFINED) {
- continue;
- }
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ if (dict_index_is_corrupted(prebuilt->index)) {
+ char index_name[MAX_FULL_NAME_LEN + 1];
+ char table_name[MAX_FULL_NAME_LEN + 1];
- /* This is a column prefix index.
- The column prefix can be used in
- an end_range comparison. */
+ innobase_format_name(
+ index_name, sizeof index_name,
+ prebuilt->index->name, TRUE);
- templ->icp_rec_field_no
- = dict_index_get_nth_col_or_prefix_pos(
- prebuilt->index, i, TRUE);
- ut_ad(templ->icp_rec_field_no
- != ULINT_UNDEFINED);
+ innobase_format_name(
+ table_name, sizeof table_name,
+ prebuilt->index->table->name, FALSE);
- /* Index condition pushdown can be used on
- all columns of a secondary index, and on
- the PRIMARY KEY columns. */
- /* TODO: enable this assertion
- (but first ensure that end_range is
- valid here and use an accurate condition
- for end_range)
- ut_ad(!dict_index_is_clust(prebuilt->index)
- || templ->rec_field_no
- < prebuilt->index->n_uniq);
- */
- }
+ push_warning_printf(
+ user_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_INDEX_CORRUPT,
+ "InnoDB: Index %s for table %s is"
+ " marked as corrupted",
+ index_name, table_name);
+ DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
+ } else {
+ push_warning_printf(
+ user_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: insufficient history for index %u",
+ keynr);
}
- ut_ad(prebuilt->idx_cond_n_cols > 0);
- ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
-
- /* Include the fields that are not needed in index condition
- pushdown. */
- for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
-
- while (!table->field[sql_idx]->stored_in_db) {
- sql_idx++;
- }
-
- const ibool index_contains
- = dict_index_contains_col_or_prefix(index, i);
-
- if (index == prebuilt->index
- ? !index_contains
- : !dict_index_contains_col_or_prefix(
- prebuilt->index, i)) {
- /* Not needed in ICP */
- const Field* field;
+ /* The caller seems to ignore this. Thus, we must check
+ this again in row_search_for_mysql(). */
+ DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
+ 0, NULL));
+ }
- if (whole_row) {
- field = table->field[sql_idx];
- } else {
- field = build_template_needs_field(
- index_contains,
- prebuilt->read_just_key,
- fetch_all_in_key,
- fetch_primary_key_cols,
- index, table, i, sql_idx);
- if (!field) {
- continue;
- }
- }
+ ut_a(prebuilt->search_tuple != 0);
- build_template_field(prebuilt,
- clust_index, index,
- table, field, i);
- }
- }
+ dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields);
- prebuilt->idx_cond = this;
- } else {
- /* No index condition pushdown */
- prebuilt->idx_cond = NULL;
+ dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
+ prebuilt->index->n_fields);
- for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
- const Field* field;
+ /* MySQL changes the active index for a handle also during some
+ queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
+ and then calculates the sum. Previously we played safe and used
+ the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
+ copying. Starting from MySQL-4.1 we use a more efficient flag here. */
- while (!table->field[sql_idx]->stored_in_db) {
- sql_idx++;
- }
+ build_template(false);
- if (whole_row) {
- field = table->field[sql_idx];
- } else {
- field = build_template_needs_field(
- dict_index_contains_col_or_prefix(
- index, i),
- prebuilt->read_just_key,
- fetch_all_in_key,
- fetch_primary_key_cols,
- index, table, i, sql_idx);
- if (!field) {
- continue;
- }
- }
+ DBUG_RETURN(0);
+}
- build_template_field(prebuilt, clust_index, index,
- table, field, i);
- }
- }
+/**********************************************************************//**
+Positions an index cursor to the index specified in keynr. Fetches the
+row if any.
+??? This is only used to read whole keys ???
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::index_read_idx(
+/*========================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ uint keynr, /*!< in: use this index */
+ const uchar* key, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index */
+ uint key_len, /*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+ if (change_active_index(keynr)) {
- if (index != clust_index && prebuilt->need_to_access_clustered) {
- /* Change rec_field_no's to correspond to the clustered index
- record */
- for (i = 0; i < prebuilt->n_template; i++) {
- mysql_row_templ_t* templ
- = &prebuilt->mysql_template[i];
- templ->rec_field_no = templ->clust_rec_field_no;
- }
+ return(1);
}
+
+ return(index_read(buf, key, key_len, find_flag));
}
-/********************************************************************//**
-This special handling is really to overcome the limitations of MySQL's
-binlogging. We need to eliminate the non-determinism that will arise in
-INSERT ... SELECT type of statements, since MySQL binlog only stores the
-min value of the autoinc interval. Once that is fixed we can get rid of
-the special lock handling.
-@return DB_SUCCESS if all OK else error code */
+/***********************************************************************//**
+Reads the next or previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
UNIV_INTERN
-ulint
-ha_innobase::innobase_lock_autoinc(void)
-/*====================================*/
+int
+ha_innobase::general_fetch(
+/*=======================*/
+ uchar* buf, /*!< in/out: buffer for next row in MySQL
+ format */
+ uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */
+ uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or
+ ROW_SEL_EXACT_PREFIX */
{
- ulint error = DB_SUCCESS;
+ dberr_t ret;
+ int error;
- switch (innobase_autoinc_lock_mode) {
- case AUTOINC_NO_LOCKING:
- /* Acquire only the AUTOINC mutex. */
- dict_table_autoinc_lock(prebuilt->table);
- break;
+ DBUG_ENTER("general_fetch");
- case AUTOINC_NEW_STYLE_LOCKING:
- /* For simple (single/multi) row INSERTs/REPLACEs and RBR
- events, we fallback to the old style only if another
- transaction has already acquired the AUTOINC lock on
- behalf of a LOAD FILE or INSERT ... SELECT etc. type of
- statement. */
- if (thd_sql_command(user_thd) == SQLCOM_INSERT
- || thd_sql_command(user_thd) == SQLCOM_REPLACE
- || thd_sql_command(user_thd) == SQLCOM_END // RBR event
- ) {
- dict_table_t* table = prebuilt->table;
+ /* If the transaction is not started, do not continue; return an error code instead. */
+ if(!(prebuilt->sql_stat_start || (prebuilt->trx && prebuilt->trx->state == 1))) {
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+ }
- /* Acquire the AUTOINC mutex. */
- dict_table_autoinc_lock(table);
+ if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+ && share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
- /* We need to check that another transaction isn't
- already holding the AUTOINC lock on the table. */
- if (table->n_waiting_or_granted_auto_inc_locks) {
- /* Release the mutex to avoid deadlocks and
- fall back to old style locking. */
- dict_table_autoinc_unlock(table);
- } else {
- /* Do not fall back to old style locking. */
- break;
- }
- }
- /* fall through */
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
- case AUTOINC_OLD_STYLE_LOCKING:
- error = row_lock_table_autoinc_for_mysql(prebuilt);
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
- if (error == DB_SUCCESS) {
+ ret = row_search_for_mysql(
+ (byte*) buf, 0, prebuilt, match_mode, direction);
- /* Acquire the AUTOINC mutex. */
- dict_table_autoinc_lock(prebuilt->table);
- }
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+
+ if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
+ && share->ib_table && share->ib_table->is_corrupt)) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
default:
- ut_error;
+ error = convert_error_code_to_mysql(
+ ret, prebuilt->table->flags, user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
}
- return(ulong(error));
+ DBUG_RETURN(error);
+}
+
+/***********************************************************************//**
+Reads the next row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for next row in MySQL
+ format */
+{
+ return(general_fetch(buf, ROW_SEL_NEXT, 0));
+}
+
+/*******************************************************************//**
+Reads the next row matching to the key value given as the parameter.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next_same(
+/*=========================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ const uchar* key, /*!< in: key value */
+ uint keylen) /*!< in: key value length */
+{
+ return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode));
+}
+
+/***********************************************************************//**
+Reads the previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_prev(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for previous row in MySQL format */
+{
+ return(general_fetch(buf, ROW_SEL_PREV, 0));
}
/********************************************************************//**
diff --cc storage/xtradb/log/log0online.cc
index 1a30501f266,00000000000..1e373c8345f
mode 100644,000000..100644
--- a/storage/xtradb/log/log0online.cc
+++ b/storage/xtradb/log/log0online.cc
@@@ -1,1924 -1,0 +1,1921 @@@
+/*****************************************************************************
+
+Copyright (c) 2011-2012 Percona Inc. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0online.cc
+Online database log parsing for changed page tracking
+
+*******************************************************/
+
+#include "log0online.h"
+
+#include "my_dbug.h"
+
+#include "log0recv.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "ut0rbt.h"
+
+enum { FOLLOW_SCAN_SIZE = 4 * (UNIV_PAGE_SIZE_MAX) };
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register log_bmp_sys->mutex with PFS */
+UNIV_INTERN mysql_pfs_key_t log_bmp_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** Log parsing and bitmap output data structure */
+struct log_bitmap_struct {
+ byte* read_buf_ptr; /*!< Unaligned log read buffer */
+ byte* read_buf; /*!< log read buffer */
+ byte parse_buf[RECV_PARSING_BUF_SIZE];
+ /*!< log parse buffer */
+ byte* parse_buf_end; /*!< parse buffer position where the
+ next read log data should be copied to.
+ If the previous log records were fully
+ parsed, it points to the start,
+ otherwise points immediately past the
+ end of the incomplete log record. */
+ char bmp_file_home[FN_REFLEN];
+ /*!< directory for bitmap files */
+ log_online_bitmap_file_t out; /*!< The current bitmap file */
+ ulint out_seq_num; /*!< the bitmap file sequence number */
+ lsn_t start_lsn; /*!< the LSN of the next unparsed
+ record and the start of the next LSN
+ interval to be parsed. */
+ lsn_t end_lsn; /*!< the end of the LSN interval to be
+ parsed, equal to the next checkpoint
+ LSN at the time of parse */
+ lsn_t next_parse_lsn; /*!< the LSN of the next unparsed
+ record in the current parse */
+ ib_rbt_t* modified_pages; /*!< the current modified page set,
+ organized as the RB-tree with the keys
+ of (space, 4KB-block-start-page-id)
+ pairs */
+ ib_rbt_node_t* page_free_list; /*!< Singly-linked list of freed nodes
+ of modified_pages tree for later
+ reuse. Nodes are linked through
+ ib_rbt_node_t.left as this field has
+ both the correct type and the tree does
+ not mind its overwrite during
+ rbt_next() tree traversal. */
+};
+
+/* The log parsing and bitmap output struct instance */
+static struct log_bitmap_struct* log_bmp_sys;
+
+/* Mutex protecting log_bmp_sys */
+static ib_mutex_t log_bmp_sys_mutex;
+
+/** File name stem for bitmap files. */
+static const char* bmp_file_name_stem = "ib_modified_log_";
+
+/** File name template for bitmap files. The 1st format tag is a directory
+name, the 2nd tag is the stem, the 3rd tag is a file sequence number, the 4th
+tag is the start LSN for the file. */
+static const char* bmp_file_name_template = "%s%s%lu_%llu.xdb";
+
+/* On server startup with empty database srv_start_lsn == 0, in
+which case the first LSN of actual log records will be this. */
+#define MIN_TRACKED_LSN ((LOG_START_LSN) + (LOG_BLOCK_HDR_SIZE))
+
+/* Tests if num bit of bitmap is set */
+#define IS_BIT_SET(bitmap, num) \
+ (*((bitmap) + ((num) >> 3)) & (1UL << ((num) & 7UL)))
+
+/** The bitmap file block size in bytes. All writes will be multiples of this.
+ */
+enum {
+ MODIFIED_PAGE_BLOCK_SIZE = 4096
+};
+
+
+/** Offsets in a file bitmap block */
+enum {
+ MODIFIED_PAGE_IS_LAST_BLOCK = 0,/* 1 if last block in the current
+ write, 0 otherwise. */
+ MODIFIED_PAGE_START_LSN = 4, /* The starting tracked LSN of this and
+ other blocks in the same write */
+ MODIFIED_PAGE_END_LSN = 12, /* The ending tracked LSN of this and
+ other blocks in the same write */
+ MODIFIED_PAGE_SPACE_ID = 20, /* The space ID of tracked pages in
+ this block */
+ MODIFIED_PAGE_1ST_PAGE_ID = 24, /* The page ID of the first tracked
+ page in this block */
+ MODIFIED_PAGE_BLOCK_UNUSED_1 = 28,/* Unused in order to align the start
+ of bitmap at 8 byte boundary */
+ MODIFIED_PAGE_BLOCK_BITMAP = 32,/* Start of the bitmap itself */
+ MODIFIED_PAGE_BLOCK_UNUSED_2 = MODIFIED_PAGE_BLOCK_SIZE - 8,
+ /* Unused in order to align the end of
+ bitmap at 8 byte boundary */
+ MODIFIED_PAGE_BLOCK_CHECKSUM = MODIFIED_PAGE_BLOCK_SIZE - 4
+ /* The checksum of the current block */
+};
+
+/** Length of the bitmap data in a block in bytes */
+enum { MODIFIED_PAGE_BLOCK_BITMAP_LEN
+ = MODIFIED_PAGE_BLOCK_UNUSED_2 - MODIFIED_PAGE_BLOCK_BITMAP };
+
+/** Length of the bitmap data in a block in page ids */
+enum { MODIFIED_PAGE_BLOCK_ID_COUNT = MODIFIED_PAGE_BLOCK_BITMAP_LEN * 8 };
+
+/****************************************************************//**
+Provide a comparison function for the RB-tree of (space,
+block_start_page) pairs. The actual implementation does not matter as
+long as the ordering is total.
+@return -1 if p1 < p2, 0 if p1 == p2, 1 if p1 > p2
+*/
+static
+int
+log_online_compare_bmp_keys(
+/*========================*/
+ const void* p1, /*!<in: 1st key to compare */
+ const void* p2) /*!<in: 2nd key to compare */
+{
+ const byte *k1 = (const byte *)p1;
+ const byte *k2 = (const byte *)p2;
+
+ ulint k1_space = mach_read_from_4(k1 + MODIFIED_PAGE_SPACE_ID);
+ ulint k2_space = mach_read_from_4(k2 + MODIFIED_PAGE_SPACE_ID);
+ if (k1_space == k2_space) {
+ ulint k1_start_page
+ = mach_read_from_4(k1 + MODIFIED_PAGE_1ST_PAGE_ID);
+ ulint k2_start_page
+ = mach_read_from_4(k2 + MODIFIED_PAGE_1ST_PAGE_ID);
+ return k1_start_page < k2_start_page
+ ? -1 : k1_start_page > k2_start_page ? 1 : 0;
+ }
+ return k1_space < k2_space ? -1 : 1;
+}
+
+/****************************************************************//**
+Set a bit for tracked page in the bitmap. Expand the bitmap tree as
+necessary. */
+static
+void
+log_online_set_page_bit(
+/*====================*/
+ ulint space, /*!<in: log record space id */
+ ulint page_no)/*!<in: log record page id */
+{
+ ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+ ut_a(space != ULINT_UNDEFINED);
+ ut_a(page_no != ULINT_UNDEFINED);
+
+ ulint block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT
+ * MODIFIED_PAGE_BLOCK_ID_COUNT;
+ ulint block_pos = block_start_page ? (page_no % block_start_page / 8)
+ : (page_no / 8);
+ uint bit_pos = page_no % 8;
+
+ byte search_page[MODIFIED_PAGE_BLOCK_SIZE];
+ mach_write_to_4(search_page + MODIFIED_PAGE_SPACE_ID, space);
+ mach_write_to_4(search_page + MODIFIED_PAGE_1ST_PAGE_ID,
+ block_start_page);
+
+ byte *page_ptr;
+ ib_rbt_bound_t tree_search_pos;
+ if (!rbt_search(log_bmp_sys->modified_pages, &tree_search_pos,
+ search_page)) {
+ page_ptr = rbt_value(byte, tree_search_pos.last);
+ }
+ else {
+ ib_rbt_node_t *new_node;
+
+ if (log_bmp_sys->page_free_list) {
+ new_node = log_bmp_sys->page_free_list;
+ log_bmp_sys->page_free_list = new_node->left;
+ }
+ else {
+ new_node = static_cast<ib_rbt_node_t *>
+ (ut_malloc
+ (SIZEOF_NODE(log_bmp_sys->modified_pages)));
+ }
+ memset(new_node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages));
+
+ page_ptr = rbt_value(byte, new_node);
+ mach_write_to_4(page_ptr + MODIFIED_PAGE_SPACE_ID, space);
+ mach_write_to_4(page_ptr + MODIFIED_PAGE_1ST_PAGE_ID,
+ block_start_page);
+
+ rbt_add_preallocated_node(log_bmp_sys->modified_pages,
+ &tree_search_pos, new_node);
+ }
+ page_ptr[MODIFIED_PAGE_BLOCK_BITMAP + block_pos] |= (1U << bit_pos);
+}
+
+/****************************************************************//**
+Calculate a bitmap block checksum. Algorithm borrowed from
+log_block_calc_checksum.
+@return checksum */
+UNIV_INLINE
+ulint
+log_online_calc_checksum(
+/*=====================*/
+ const byte* block) /*!<in: bitmap block */
+{
+ ulint sum;
+ ulint sh;
+ ulint i;
+
+ sum = 1;
+ sh = 0;
+
+ for (i = 0; i < MODIFIED_PAGE_BLOCK_CHECKSUM; i++) {
+
+ ulint b = block[i];
+ sum &= 0x7FFFFFFFUL;
+ sum += b;
+ sum += b << sh;
+ sh++;
+ if (sh > 24) {
+ sh = 0;
+ }
+ }
+
+ return sum;
+}
+
+/****************************************************************//**
+Read one bitmap data page and check it for corruption.
+
+@return TRUE if page read OK, FALSE if I/O error */
+static
+ibool
+log_online_read_bitmap_page(
+/*========================*/
+ log_online_bitmap_file_t *bitmap_file, /*!<in/out: bitmap
+ file */
+ byte *page, /*!<out: read page.
+ Must be at least
+ MODIFIED_PAGE_BLOCK_SIZE
+ bytes long */
+ ibool *checksum_ok) /*!<out: TRUE if page
+ checksum OK */
+{
+ ulint checksum;
+ ulint actual_checksum;
+ ibool success;
+
+ ut_a(bitmap_file->size >= MODIFIED_PAGE_BLOCK_SIZE);
+ ut_a(bitmap_file->offset
+ <= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
+ ut_a(bitmap_file->offset % MODIFIED_PAGE_BLOCK_SIZE == 0);
+
+ success = os_file_read(bitmap_file->file, page, bitmap_file->offset,
+ MODIFIED_PAGE_BLOCK_SIZE);
+
+ if (UNIV_UNLIKELY(!success)) {
+
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "failed reading changed page bitmap file \'%s\'",
+ bitmap_file->name);
+ return FALSE;
+ }
+
+ bitmap_file->offset += MODIFIED_PAGE_BLOCK_SIZE;
+ ut_ad(bitmap_file->offset <= bitmap_file->size);
+
+ checksum = mach_read_from_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM);
+ actual_checksum = log_online_calc_checksum(page);
+ *checksum_ok = (checksum == actual_checksum);
+
+ return TRUE;
+}
+
+/****************************************************************//**
+Get the last fully tracked LSN from the bitmap file by reading
+backwards until a correct end page is found. Detects incomplete
+writes and corrupted data. Sets the start output position for the
+written bitmap data.
+
+Multiple bitmap files are handled using the following assumptions:
+1) Only the last file might be corrupted. In case where no good data was found
+in the last file, assume that the next to last file is OK. This assumption
+does not limit crash recovery capability in any way.
+2) If the whole of the last file was corrupted, assume that the start LSN in
+its name is correct and use it for (re-)tracking start.
+
+@return the last fully tracked LSN */
+static
+lsn_t
+log_online_read_last_tracked_lsn(void)
+/*==================================*/
+{
+ byte page[MODIFIED_PAGE_BLOCK_SIZE];
+ ibool is_last_page = FALSE;
+ ibool checksum_ok = FALSE;
+ lsn_t result;
+ os_offset_t read_offset = log_bmp_sys->out.offset;
+
+ while ((!checksum_ok || !is_last_page) && read_offset > 0)
+ {
+ read_offset -= MODIFIED_PAGE_BLOCK_SIZE;
+ log_bmp_sys->out.offset = read_offset;
+
+ if (!log_online_read_bitmap_page(&log_bmp_sys->out, page,
+ &checksum_ok)) {
+ checksum_ok = FALSE;
+ result = 0;
+ break;
+ }
+
+ if (checksum_ok) {
+ is_last_page
+ = mach_read_from_4
+ (page + MODIFIED_PAGE_IS_LAST_BLOCK);
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "corruption detected in \'%s\' at offset "
+ UINT64PF,
+ log_bmp_sys->out.name, read_offset);
+ }
+ };
+
+ result = (checksum_ok && is_last_page)
+ ? mach_read_from_8(page + MODIFIED_PAGE_END_LSN) : 0;
+
+ /* Truncate the output file to discard the corrupted bitmap data, if
+ any */
+ if (!os_file_set_eof_at(log_bmp_sys->out.file,
+ log_bmp_sys->out.offset)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "failed truncating changed page bitmap file \'%s\' to "
+ UINT64PF " bytes",
+ log_bmp_sys->out.name, log_bmp_sys->out.offset);
+ result = 0;
+ }
+ return result;
+}
+
+/****************************************************************//**
+Safely write the log_sys->tracked_lsn value. Uses atomic operations
+if available, otherwise this field is protected with the log system
+mutex. The reader counterpart function is log_get_tracked_lsn() in
+log0log.c. */
+UNIV_INLINE
+void
+log_set_tracked_lsn(
+/*================*/
+ lsn_t tracked_lsn) /*!<in: new value */
+{
+ log_sys->tracked_lsn = tracked_lsn;
+ os_wmb;
+}
+
+/*********************************************************************//**
+Check if the missing LSN interval, if any, can be read and tracked, given the
+current LSN value, the LSN value where the tracking stopped, and the log group
+capacity.
+
+@return TRUE if the missing interval can be tracked or if there's no missing
+data. */
+static
+ibool
+log_online_can_track_missing(
+/*=========================*/
+ lsn_t last_tracked_lsn, /*!<in: last tracked LSN */
+ lsn_t tracking_start_lsn) /*!<in: current LSN */
+{
+ /* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty
+ bitmap file, handle this too. */
+ last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN);
+
+ if (last_tracked_lsn > tracking_start_lsn) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "last tracked LSN " LSN_PF " is ahead of tracking "
+ "start LSN " LSN_PF ". This can be caused by "
+ "mismatched bitmap files.",
+ last_tracked_lsn, tracking_start_lsn);
+ exit(1);
+ }
+
+ return (last_tracked_lsn == tracking_start_lsn)
+ || (log_sys->lsn - last_tracked_lsn
+ <= log_sys->log_group_capacity);
+}
+
+
+/****************************************************************//**
+Diagnose a gap in tracked LSN range on server startup due to crash or
+very fast shutdown and try to close it by tracking the data
+immediately, if possible. */
+static
+void
+log_online_track_missing_on_startup(
+/*================================*/
+ lsn_t last_tracked_lsn, /*!<in: last tracked LSN read from the
+ bitmap file */
+ lsn_t tracking_start_lsn) /*!<in: last checkpoint LSN of the
+ current server startup */
+{
+ ut_ad(last_tracked_lsn != tracking_start_lsn);
+ ut_ad(srv_track_changed_pages);
+
+ ib_logf(IB_LOG_LEVEL_WARN, "last tracked LSN in \'%s\' is " LSN_PF
+ ", but the last checkpoint LSN is " LSN_PF ". This might be "
+ "due to a server crash or a very fast shutdown.",
+ log_bmp_sys->out.name, last_tracked_lsn, tracking_start_lsn);
+
+ /* See if we can fully recover the missing interval */
+ if (log_online_can_track_missing(last_tracked_lsn,
+ tracking_start_lsn)) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "reading the log to advance the last tracked LSN.");
+
+ log_bmp_sys->start_lsn = ut_max(last_tracked_lsn,
+ MIN_TRACKED_LSN);
+ log_set_tracked_lsn(log_bmp_sys->start_lsn);
+ if (!log_online_follow_redo_log()) {
+ exit(1);
+ }
+ ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "continuing tracking changed pages from LSN " LSN_PF,
+ log_bmp_sys->end_lsn);
+ }
+ else {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "the age of last tracked LSN exceeds log capacity, "
+ "tracking-based incremental backups will work only "
+ "from the higher LSN!");
+
+ log_bmp_sys->end_lsn = log_bmp_sys->start_lsn
+ = tracking_start_lsn;
+ log_set_tracked_lsn(log_bmp_sys->start_lsn);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "starting tracking changed pages from LSN " LSN_PF,
+ log_bmp_sys->end_lsn);
+ }
+}
+
+/*********************************************************************//**
+Format a bitmap output file name to log_bmp_sys->out.name. */
+static
+void
+log_online_make_bitmap_name(
+/*=========================*/
+ lsn_t start_lsn) /*!< in: the start LSN name part */
+{
+ ut_snprintf(log_bmp_sys->out.name, sizeof(log_bmp_sys->out.name),
+ bmp_file_name_template, log_bmp_sys->bmp_file_home,
+ bmp_file_name_stem, log_bmp_sys->out_seq_num, start_lsn);
+}
+
- /*********************************************************************//**
- }
-
+/*********************************************************************//**
+Check if an old file that has the name of a new bitmap file we are about to
+create should be overwritten. */
+static
+ibool
+log_online_should_overwrite(
+/*========================*/
+ const char *path) /*!< in: path to file */
+{
+ dberr_t err;
+ os_file_stat_t file_info;
+
+ /* Currently, it's OK to overwrite 0-sized files only */
+ err = os_file_get_status(path, &file_info, false);
+ return err == DB_SUCCESS && file_info.type == OS_FILE_TYPE_FILE
+ && file_info.size == 0LL;
+}
+
+/*********************************************************************//**
+Create a new empty bitmap output file.
+
+@return TRUE if operation succeeded, FALSE if I/O error */
+static
+ibool
+log_online_start_bitmap_file(void)
+/*==============================*/
+{
+ ibool success = TRUE;
+
+ /* Check for an old file that should be deleted first */
+ if (log_online_should_overwrite(log_bmp_sys->out.name)) {
+
+ success = static_cast<ibool>(
+ os_file_delete_if_exists(innodb_file_bmp_key,
+ log_bmp_sys->out.name));
+ }
+
+ if (UNIV_LIKELY(success)) {
+ log_bmp_sys->out.file
+ = os_file_create_simple_no_error_handling(
+ innodb_file_bmp_key,
+ log_bmp_sys->out.name,
+ OS_FILE_CREATE,
+ OS_FILE_READ_WRITE_CACHED,
+ &success);
+ }
+ if (UNIV_UNLIKELY(!success)) {
+
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "cannot create \'%s\'", log_bmp_sys->out.name);
+ return FALSE;
+ }
+
+ log_bmp_sys->out.offset = 0;
+ return TRUE;
+}
+
+/*********************************************************************//**
+Close the current bitmap output file and create the next one.
+
+@return TRUE if operation succeeded, FALSE if I/O error */
+static
+ibool
+log_online_rotate_bitmap_file(
+/*===========================*/
+ lsn_t next_file_start_lsn) /*!<in: the start LSN name
+ part */
+{
+ if (!os_file_is_invalid(log_bmp_sys->out.file)) {
+ os_file_close(log_bmp_sys->out.file);
+ os_file_mark_invalid(&log_bmp_sys->out.file);
+ }
+ log_bmp_sys->out_seq_num++;
+ log_online_make_bitmap_name(next_file_start_lsn);
+ return log_online_start_bitmap_file();
+}
+
+/*********************************************************************//**
+Check the name of a given file if it's a changed page bitmap file and
+return file sequence and start LSN name components if it is. If it is not,
+the values of output parameters are undefined.
+
+@return TRUE if a given file is a changed page bitmap file. */
+static
+ibool
+log_online_is_bitmap_file(
+/*======================*/
+ const os_file_stat_t* file_info, /*!<in: file to
+ check */
+ ulong* bitmap_file_seq_num, /*!<out: bitmap file
+ sequence number */
+ lsn_t* bitmap_file_start_lsn) /*!<out: bitmap file
+ start LSN */
+{
+ char stem[FN_REFLEN];
+
+ ut_ad (strlen(file_info->name) < OS_FILE_MAX_PATH);
+
+ return ((file_info->type == OS_FILE_TYPE_FILE
+ || file_info->type == OS_FILE_TYPE_LINK)
+ && (sscanf(file_info->name, "%[a-z_]%lu_%llu.xdb", stem,
+ bitmap_file_seq_num,
+ (unsigned long long *)bitmap_file_start_lsn) == 3)
+ && (!strcmp(stem, bmp_file_name_stem)));
+}
+
+/** Initialize the constant part of the log tracking subsystem */
+UNIV_INTERN
+void
+log_online_init(void)
+{
+ mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys_mutex,
+ SYNC_LOG_ONLINE);
+}
+
+/** Initialize the dynamic part of the log tracking subsystem */
+UNIV_INTERN
+void
+log_online_read_init(void)
+{
+ ibool success;
+ lsn_t tracking_start_lsn
+ = ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN);
+ os_file_dir_t bitmap_dir;
+ os_file_stat_t bitmap_dir_file_info;
+ lsn_t last_file_start_lsn = MIN_TRACKED_LSN;
+ size_t srv_data_home_len;
+
+ /* Bitmap data start and end in a bitmap block must be 8-byte
+ aligned. */
+ compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0);
+ compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0);
+
+ ut_ad(srv_track_changed_pages);
+
+ log_bmp_sys = static_cast<log_bitmap_struct *>
+ (ut_malloc(sizeof(*log_bmp_sys)));
+ log_bmp_sys->read_buf_ptr = static_cast<byte *>
+ (ut_malloc(FOLLOW_SCAN_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+ log_bmp_sys->read_buf = static_cast<byte *>
+ (ut_align(log_bmp_sys->read_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
+ /* Initialize bitmap file directory from srv_data_home and add a path
+ separator if needed. */
+ srv_data_home_len = strlen(srv_data_home);
+ ut_a (srv_data_home_len < FN_REFLEN);
+ strcpy(log_bmp_sys->bmp_file_home, srv_data_home);
+ if (srv_data_home_len
+ && log_bmp_sys->bmp_file_home[srv_data_home_len - 1]
+ != SRV_PATH_SEPARATOR) {
+
+ ut_a (srv_data_home_len < FN_REFLEN - 1);
+ log_bmp_sys->bmp_file_home[srv_data_home_len]
+ = SRV_PATH_SEPARATOR;
+ log_bmp_sys->bmp_file_home[srv_data_home_len + 1] = '\0';
+ }
+
+ /* Enumerate existing bitmap files, either to open the last one and get
+ the last tracked LSN, or to find that there are none and start
+ tracking from scratch. */
+ log_bmp_sys->out.name[0] = '\0';
+ log_bmp_sys->out_seq_num = 0;
+
+ bitmap_dir = os_file_opendir(log_bmp_sys->bmp_file_home, TRUE);
+ ut_a(bitmap_dir);
+ while (!os_file_readdir_next_file(log_bmp_sys->bmp_file_home,
+ bitmap_dir, &bitmap_dir_file_info)) {
+
+ ulong file_seq_num;
+ lsn_t file_start_lsn;
+
+ if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
+ &file_seq_num,
+ &file_start_lsn)) {
+ continue;
+ }
+
+ if (file_seq_num > log_bmp_sys->out_seq_num
+ && bitmap_dir_file_info.size > 0) {
+ log_bmp_sys->out_seq_num = file_seq_num;
+ last_file_start_lsn = file_start_lsn;
+ /* No dir component (log_bmp_sys->bmp_file_home) here,
+ because that's the cwd */
+ strncpy(log_bmp_sys->out.name,
+ bitmap_dir_file_info.name, FN_REFLEN - 1);
+ log_bmp_sys->out.name[FN_REFLEN - 1] = '\0';
+ }
+ }
+
+ if (os_file_closedir(bitmap_dir)) {
+ os_file_get_last_error(TRUE);
+ ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
+ log_bmp_sys->bmp_file_home);
+ exit(1);
+ }
+
+ if (!log_bmp_sys->out_seq_num) {
+ log_bmp_sys->out_seq_num = 1;
+ log_online_make_bitmap_name(0);
+ }
+
+ log_bmp_sys->modified_pages = rbt_create(MODIFIED_PAGE_BLOCK_SIZE,
+ log_online_compare_bmp_keys);
+ log_bmp_sys->page_free_list = NULL;
+
+ log_bmp_sys->out.file
+ = os_file_create_simple_no_error_handling
+ (innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN,
+ OS_FILE_READ_WRITE_CACHED, &success);
+
+ if (!success) {
+
+ /* New file, tracking from scratch */
+ if (!log_online_start_bitmap_file()) {
+ exit(1);
+ }
+ }
+ else {
+
+ /* Read the last tracked LSN from the last file */
+ lsn_t last_tracked_lsn;
+ lsn_t file_start_lsn;
+
+ log_bmp_sys->out.size
+ = os_file_get_size(log_bmp_sys->out.file);
+ log_bmp_sys->out.offset = log_bmp_sys->out.size;
+
+ if (log_bmp_sys->out.offset % MODIFIED_PAGE_BLOCK_SIZE != 0) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "truncated block detected in \'%s\' at offset "
+ UINT64PF,
+ log_bmp_sys->out.name,
+ log_bmp_sys->out.offset);
+ log_bmp_sys->out.offset -=
+ log_bmp_sys->out.offset
+ % MODIFIED_PAGE_BLOCK_SIZE;
+ }
+
+ last_tracked_lsn = log_online_read_last_tracked_lsn();
+ /* Do not rotate if we truncated the file to zero length - we
+ can just start writing there */
+ const bool need_rotate = (last_tracked_lsn != 0);
+ if (!last_tracked_lsn) {
+
+ last_tracked_lsn = last_file_start_lsn;
+ }
+
+ /* Start a new file. Choose the LSN value in its name based on
+ if we can retrack any missing data. */
+ if (log_online_can_track_missing(last_tracked_lsn,
+ tracking_start_lsn)) {
+ file_start_lsn = last_tracked_lsn;
+ } else {
+ file_start_lsn = tracking_start_lsn;
+ }
+
+ if (need_rotate
+ && !log_online_rotate_bitmap_file(file_start_lsn)) {
+
+ exit(1);
+ }
+
+ if (last_tracked_lsn < tracking_start_lsn) {
+
+ log_online_track_missing_on_startup
+ (last_tracked_lsn, tracking_start_lsn);
+ return;
+ }
+
+ if (last_tracked_lsn > tracking_start_lsn) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "last tracked LSN is " LSN_PF ", but the last "
+ "checkpoint LSN is " LSN_PF ". The "
+ "tracking-based incremental backups will work "
+ "only from the latter LSN!",
+ last_tracked_lsn, tracking_start_lsn);
+ }
+
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO, "starting tracking changed pages from LSN "
+ LSN_PF, tracking_start_lsn);
+ log_bmp_sys->start_lsn = tracking_start_lsn;
+ log_set_tracked_lsn(tracking_start_lsn);
+}
+
+/** Shut down the dynamic part of the log tracking subsystem */
+UNIV_INTERN
+void
+log_online_read_shutdown(void)
+{
+ mutex_enter(&log_bmp_sys_mutex);
+
+ srv_track_changed_pages = FALSE;
+
+ ib_rbt_node_t *free_list_node = log_bmp_sys->page_free_list;
+
+ if (!os_file_is_invalid(log_bmp_sys->out.file)) {
+ os_file_close(log_bmp_sys->out.file);
+ os_file_mark_invalid(&log_bmp_sys->out.file);
+ }
+
+ rbt_free(log_bmp_sys->modified_pages);
+
+ while (free_list_node) {
+ ib_rbt_node_t *next = free_list_node->left;
+ ut_free(free_list_node);
+ free_list_node = next;
+ }
+
+ ut_free(log_bmp_sys->read_buf_ptr);
+ ut_free(log_bmp_sys);
+ log_bmp_sys = NULL;
+
+ srv_redo_log_thread_started = false;
+
+ mutex_exit(&log_bmp_sys_mutex);
+}
+
+/** Shut down the constant part of the log tracking subsystem by destroying
+log_bmp_sys_mutex, the mutex other functions in this file hold while they
+work on log_bmp_sys. */
+UNIV_INTERN
+void
+log_online_shutdown(void)
+{
+ mutex_free(&log_bmp_sys_mutex);
+}
+
+/*********************************************************************//**
+Tell whether a minilog record of the given type carries a (space; page)
+pair. Every type does except MLOG_MULTI_REC_END and MLOG_DUMMY_RECORD.
+@return TRUE if the record has (space; page) in it */
+static
+ibool
+log_online_rec_has_page(
+/*====================*/
+	byte	type)	/*!<in: the minilog record type */
+{
+	const bool	pageless = (type == MLOG_MULTI_REC_END
+				    || type == MLOG_DUMMY_RECORD);
+
+	return !pageless;
+}
+
+/*********************************************************************//**
+Tell whether the page field of a log record of the given type holds a real
+page id. Records without a (space; page) pair never do, and neither do the
+file operation records (create, rename, delete, create2) nor, in
+UNIV_LOG_LSN_DEBUG builds, MLOG_LSN.
+@return TRUE if page field contains actual page id, FALSE otherwise */
+static
+ibool
+log_online_rec_page_means_page(
+/*===========================*/
+	byte	type)	/*!<in: log record type */
+{
+	if (!log_online_rec_has_page(type)) {
+
+		return FALSE;
+	}
+
+	switch (type) {
+#ifdef UNIV_LOG_LSN_DEBUG
+	case MLOG_LSN:
+#endif
+	case MLOG_FILE_CREATE:
+	case MLOG_FILE_RENAME:
+	case MLOG_FILE_DELETE:
+	case MLOG_FILE_CREATE2:
+		/* The page field is reused for something else here */
+		return FALSE;
+	default:
+		return TRUE;
+	}
+}
+
+/*********************************************************************//**
+Parse the log data in the parse buffer for the (space, page) pairs and add
+them to the modified page set as necessary. Removes the fully-parsed records
+from the buffer. If an incomplete record is found, moves it to the beginning
+of the parse buffer so that the next read can complete it. Parsing stops when
+the buffer is exhausted or when next_parse_lsn reaches end_lsn. The caller
+must hold log_bmp_sys_mutex. */
+static
+void
+log_online_parse_redo_log(void)
+/*===========================*/
+{
+ ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+ byte *ptr = log_bmp_sys->parse_buf;
+ byte *end = log_bmp_sys->parse_buf_end;
+ /* Length of the most recently parsed record. It stays 0 if nothing was
+ parsed or if the last parse attempt found an incomplete record. */
+ ulint len = 0;
+
+ while (ptr != end
+ && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
+
+ byte type;
+ ulint space;
+ ulint page_no;
+ byte* body;
+
+ /* recv_sys is not initialized, so on corrupt log we will
+ SIGSEGV. But the log of a live database should not be
+ corrupt. */
+ len = recv_parse_log_rec(ptr, end, &type, &space, &page_no,
+ &body);
+ if (len > 0) {
+
+ if (log_online_rec_page_means_page(type)) {
+
+ /* NOTE(review): presumably the minimum size of a
+ record that carries a type byte plus space and page
+ fields - confirm against recv_parse_log_rec(). */
+ ut_a(len >= 3);
+ log_online_set_page_bit(space, page_no);
+ }
+
+ /* Consume the record and advance the parse LSN by its
+ length. */
+ ptr += len;
+ ut_ad(ptr <= end);
+ log_bmp_sys->next_parse_lsn
+ = recv_calc_lsn_on_data_add
+ (log_bmp_sys->next_parse_lsn, len);
+ }
+ else {
+
+ /* Incomplete log record. Shift it to the
+ beginning of the parse buffer and leave it to be
+ completed on the next read. */
+ ut_memmove(log_bmp_sys->parse_buf, ptr, end - ptr);
+ log_bmp_sys->parse_buf_end
+ = log_bmp_sys->parse_buf + (end - ptr);
+ /* Force the loop to terminate */
+ ptr = end;
+ }
+ }
+
+ /* The last parse attempt consumed a complete record, so no partial
+ record was shifted to the buffer start: mark the buffer as empty. Any
+ bytes still left past ptr at this point are dropped. */
+ if (len > 0) {
+
+ log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
+ }
+}
+
+/*********************************************************************//**
+Verify the checksum of one log block using
+log_block_checksum_is_ok_or_old_format(). On a mismatch an error with the
+stored and the recalculated checksum is written to the log.
+@return TRUE if the log block checksum is OK, FALSE otherwise. */
+static
+ibool
+log_online_is_valid_log_seg(
+/*========================*/
+	const byte*	log_block)	/*!< in: read log data */
+{
+	const ibool	block_ok
+		= log_block_checksum_is_ok_or_old_format(log_block);
+
+	if (block_ok) {
+
+		return block_ok;
+	}
+
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"log block checksum mismatch: expected " ULINTPF ", "
+		"calculated checksum " ULINTPF,
+		log_block_get_checksum(log_block),
+		log_block_calc_checksum(log_block));
+
+	return block_ok;
+}
+
+/*********************************************************************//**
+Copy new log data to the parse buffer while skipping log block header,
+trailer and already parsed data.
+
+Copying starts at skip_len if it is non-zero, otherwise right after the log
+block header. For a completely filled block (data_len equal to
+OS_FILE_LOG_BLOCK_SIZE) the trailer is excluded from the copy. Nothing is
+copied if the start offset is past the end of the data.
+
+The free space in the parse buffer is asserted BEFORE the copy, so that an
+overflow aborts the server instead of first overwriting memory past the
+buffer. The caller must hold log_bmp_sys_mutex. */
+static
+void
+log_online_add_to_parse_buf(
+/*========================*/
+	const byte*	log_block,	/*!< in: read log data */
+	ulint		data_len,	/*!< in: length of read log data */
+	ulint		skip_len)	/*!< in: how much of log data to
+					skip */
+{
+	ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+	ulint	start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE;
+	ulint	end_offset
+		= (data_len == OS_FILE_LOG_BLOCK_SIZE)
+		? data_len - LOG_BLOCK_TRL_SIZE
+		: data_len;
+	ulint	actual_data_len = (end_offset >= start_offset)
+		? end_offset - start_offset : 0;
+
+	/* Bytes already queued in the parse buffer */
+	ulint	used_len = (ulint)(log_bmp_sys->parse_buf_end
+				   - log_bmp_sys->parse_buf);
+
+	/* Check the capacity in terms of sizes rather than by forming a
+	pointer past the end of the buffer. */
+	ut_a(used_len <= RECV_PARSING_BUF_SIZE);
+	ut_a(actual_data_len <= RECV_PARSING_BUF_SIZE - used_len);
+
+	ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset,
+		  actual_data_len);
+
+	log_bmp_sys->parse_buf_end += actual_data_len;
+}
+
+/*********************************************************************//**
+Process one log block in two steps: append its payload (without the block
+header, the trailer and the part that was parsed earlier) to the parse
+buffer, then run the parser over the buffer so that the modified page bitmap
+is updated. The caller must hold log_bmp_sys_mutex. */
+static
+void
+log_online_parse_redo_log_block(
+/*============================*/
+	const byte*	log_block,	/*!< in: read log data */
+	ulint		skip_already_parsed_len) /*!< in: how many bytes of
+					log data should be skipped as
+					they were parsed before */
+{
+	ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+	const ulint	payload_len = log_block_get_data_len(log_block);
+
+	/* The data length is either below one block or a whole number of
+	blocks. */
+	ut_ad(payload_len < OS_FILE_LOG_BLOCK_SIZE
+	      || payload_len % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+	log_online_add_to_parse_buf(log_block, payload_len,
+				    skip_already_parsed_len);
+
+	log_online_parse_redo_log();
+}
+
+/*********************************************************************//**
+Read and parse one redo log chunk and updates the modified page bitmap.
+
+The chunk [block_start_lsn, block_end_lsn) is read into
+log_bmp_sys->read_buf and then processed one OS_FILE_LOG_BLOCK_SIZE-sized
+block at a time until the chunk is exhausted, next_parse_lsn reaches
+end_lsn, or a block fails the checksum check. The caller must hold
+log_bmp_sys_mutex. */
+static
+void
+log_online_follow_log_seg(
+/*======================*/
+ log_group_t* group, /*!< in: the log group to use */
+ lsn_t block_start_lsn, /*!< in: the LSN to read from */
+ lsn_t block_end_lsn) /*!< in: the LSN to read to */
+{
+ ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+ /* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log
+ data to parse */
+ byte* log_block = log_bmp_sys->read_buf;
+ byte* log_block_end = log_bmp_sys->read_buf
+ + (block_end_lsn - block_start_lsn);
+
+ mutex_enter(&log_sys->mutex);
+ log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf,
+ group, block_start_lsn, block_end_lsn, TRUE);
+ /* log_group_read_log_seg will release the log_sys->mutex for us */
+
+ while (log_block < log_block_end
+ && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
+
+ /* How many bytes of log data should we skip in the current log
+ block. Skipping is necessary because we round down the next
+ parse LSN thus it is possible to read the already-processed log
+ data many times */
+ ulint skip_already_parsed_len = 0;
+
+ /* A block with a bad checksum ends the processing of this
+ chunk; the error itself is logged by the callee. */
+ if (!log_online_is_valid_log_seg(log_block)) {
+ break;
+ }
+
+ if ((block_start_lsn <= log_bmp_sys->next_parse_lsn)
+ && (block_start_lsn + OS_FILE_LOG_BLOCK_SIZE
+ > log_bmp_sys->next_parse_lsn)) {
+
+ /* The next parse LSN is inside the current block, skip
+ data preceding it. */
+ skip_already_parsed_len
+ = (ulint)(log_bmp_sys->next_parse_lsn
+ - block_start_lsn);
+ }
+ else {
+
+ /* If the next parse LSN is not inside the current
+ block, then the only option is that we have processed
+ ahead already. */
+ ut_a(block_start_lsn > log_bmp_sys->next_parse_lsn);
+ }
+
+ /* TODO: merge the copying to the parse buf code with
+ skip_already_len calculations */
+ log_online_parse_redo_log_block(log_block,
+ skip_already_parsed_len);
+
+ /* Step to the next block both in the buffer and in LSN terms */
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+ block_start_lsn += OS_FILE_LOG_BLOCK_SIZE;
+ }
+
+ return;
+}
+
+/*********************************************************************//**
+Read and parse the redo log in a given group in FOLLOW_SCAN_SIZE-sized
+chunks and updates the modified page bitmap.
+
+Parsing restarts from log_bmp_sys->start_lsn with an empty parse buffer and
+continues chunk by chunk until the end of the last read chunk reaches
+log_bmp_sys->end_lsn. At least one chunk is always read. The caller must
+hold log_bmp_sys_mutex. */
+static
+void
+log_online_follow_log_group(
+/*========================*/
+ log_group_t* group, /*!< in: the log group to use */
+ lsn_t contiguous_lsn) /*!< in: the LSN of log block start
+ containing the log_parse_start_lsn */
+{
+ ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+ lsn_t block_start_lsn = contiguous_lsn;
+ lsn_t block_end_lsn;
+
+ /* Reset the parser state: start from the tracking start LSN with
+ nothing buffered. */
+ log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn;
+ log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
+
+ do {
+ block_end_lsn = block_start_lsn + FOLLOW_SCAN_SIZE;
+
+ log_online_follow_log_seg(group, block_start_lsn,
+ block_end_lsn);
+
+ /* Next parse LSN can become higher than the last read LSN
+ only in the case when the read LSN falls right on the block
+ boundary, in which case next parse lsn is bumped to the actual
+ data LSN on the next (not yet read) block. This assert is
+ slightly conservative. */
+ ut_a(log_bmp_sys->next_parse_lsn
+ <= block_end_lsn + LOG_BLOCK_HDR_SIZE
+ + LOG_BLOCK_TRL_SIZE);
+
+ block_start_lsn = block_end_lsn;
+ } while (block_end_lsn < log_bmp_sys->end_lsn);
+
+ /* Assert that the last read log record is a full one */
+ ut_a(log_bmp_sys->parse_buf_end == log_bmp_sys->parse_buf);
+}
+
+/*********************************************************************//**
+Write, flush one bitmap block to disk and advance the output position if
+successful.
+
+The block is written at log_bmp_sys->out.offset of the current output file
+and flushed; afterwards the OS is advised that the written range is no
+longer needed and the output offset is moved past the block. On a write or
+flush failure the error is logged and the offset is left unchanged. Two debug
+injection points ("bitmap_page_write_error", "crash_before_bitmap_write") can
+fail or crash the call for blocks whose space ID is non-zero. The caller
+must hold log_bmp_sys_mutex.
+
+@return TRUE if page written OK, FALSE if I/O error */
+static
+ibool
+log_online_write_bitmap_page(
+/*=========================*/
+	const byte	*block)	/*!< in: block to write */
+{
+	ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+	/* Simulate a write error */
+	DBUG_EXECUTE_IF("bitmap_page_write_error",
+			{
+				ulint space_id
+					= mach_read_from_4(block
+					+ MODIFIED_PAGE_SPACE_ID);
+				if (space_id > 0) {
+					/* space_id is an ulint, so it must
+					be printed with ULINTPF rather than
+					with %lu. */
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"simulating bitmap write "
+						"error in "
+						"log_online_write_bitmap_page "
+						"for space ID " ULINTPF,
+						space_id);
+					return FALSE;
+				}
+			});
+
+	/* A crash injection site that ensures last checkpoint LSN > last
+	tracked LSN, so that LSN tracking for this interval is tested. */
+	DBUG_EXECUTE_IF("crash_before_bitmap_write",
+			{
+				ulint space_id
+					= mach_read_from_4(block
+					+ MODIFIED_PAGE_SPACE_ID);
+				if (space_id > 0)
+					DBUG_SUICIDE();
+			});
+
+
+	ibool	success = os_file_write(log_bmp_sys->out.name,
+					log_bmp_sys->out.file, block,
+					log_bmp_sys->out.offset,
+					MODIFIED_PAGE_BLOCK_SIZE);
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "failed writing changed page "
+			"bitmap file \'%s\'", log_bmp_sys->out.name);
+		return FALSE;
+	}
+
+	success = os_file_flush(log_bmp_sys->out.file);
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "failed flushing changed page "
+			"bitmap file \'%s\'", log_bmp_sys->out.name);
+		return FALSE;
+	}
+
+	/* The block has been written and flushed: advise the OS that the
+	range is not going to be needed again. */
+	os_file_advise(log_bmp_sys->out.file, log_bmp_sys->out.offset,
+		       MODIFIED_PAGE_BLOCK_SIZE, OS_FILE_ADVISE_DONTNEED);
+
+	log_bmp_sys->out.offset += MODIFIED_PAGE_BLOCK_SIZE;
+	return TRUE;
+}
+
+/*********************************************************************//**
+Append the current changed page bitmap to the bitmap file. Clears the
+bitmap tree and recycles its nodes to the free list.
+
+If the output file has reached srv_max_bitmap_file_size, a new file named
+after the current start LSN is started first. Each tree node's page then gets
+its start/end LSN and checksum filled in and is written out in tree order;
+the last node is flagged as the last block of the run. The caller must hold
+log_bmp_sys_mutex.
+
+@return TRUE if bitmap written OK, FALSE if I/O error*/
+static
+ibool
+log_online_write_bitmap(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+ if (log_bmp_sys->out.offset >= srv_max_bitmap_file_size) {
+ if (!log_online_rotate_bitmap_file(log_bmp_sys->start_lsn)) {
+ return FALSE;
+ }
+ }
+
+ ib_rbt_node_t *bmp_tree_node
+ = (ib_rbt_node_t *)rbt_first(log_bmp_sys->modified_pages);
+ const ib_rbt_node_t * const last_bmp_tree_node
+ = rbt_last(log_bmp_sys->modified_pages);
+
+ ibool success = TRUE;
+
+ while (bmp_tree_node) {
+
+ byte *page = rbt_value(byte, bmp_tree_node);
+
+ /* In case of a bitmap page write error keep on looping over
+ the tree to reclaim its memory through the free list instead of
+ returning immediately. */
+ if (UNIV_LIKELY(success)) {
+ if (bmp_tree_node == last_bmp_tree_node) {
+ mach_write_to_4(page
+ + MODIFIED_PAGE_IS_LAST_BLOCK,
+ 1);
+ }
+
+ mach_write_to_8(page + MODIFIED_PAGE_START_LSN,
+ log_bmp_sys->start_lsn);
+ mach_write_to_8(page + MODIFIED_PAGE_END_LSN,
+ log_bmp_sys->end_lsn);
+ mach_write_to_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM,
+ log_online_calc_checksum(page));
+
+ success = log_online_write_bitmap_page(page);
+ }
+
+ /* Push the node onto the free list, which is chained through
+ the 'left' pointers.
+ NOTE(review): the node's 'left' link is overwritten before
+ rbt_next() is called on it below, so this relies on rbt_next()
+ not reading the current node's left child - confirm. */
+ bmp_tree_node->left = log_bmp_sys->page_free_list;
+ log_bmp_sys->page_free_list = bmp_tree_node;
+
+ bmp_tree_node = (ib_rbt_node_t*)
+ rbt_next(log_bmp_sys->modified_pages, bmp_tree_node);
+
+ /* Debug injection: if another page follows, arm the write error
+ so that it hits the next page, and disarm this trigger. */
+ DBUG_EXECUTE_IF("bitmap_page_2_write_error",
+ if (bmp_tree_node)
+ {
+ DBUG_SET("+d,bitmap_page_write_error");
+ DBUG_SET("-d,bitmap_page_2_write_error");
+ });
+ }
+
+ /* All nodes are on the free list now; reset the tree itself */
+ rbt_reset(log_bmp_sys->modified_pages);
+ return success;
+}
+
+/*********************************************************************//**
+Read and parse the redo log up to last checkpoint LSN to build the changed
+page bitmap which is then written to disk.
+
+Does nothing and reports success if tracking is disabled or if the last
+checkpoint LSN equals the tracking start LSN. Otherwise every log group is
+followed from the log block that contains the start LSN, the bitmap is
+written, and the tracked LSN is advanced to the checkpoint LSN. Note that the
+tracked LSN is advanced even when the bitmap write fails.
+
+@return TRUE if log tracking succeeded, FALSE if bitmap write I/O error */
+UNIV_INTERN
+ibool
+log_online_follow_redo_log(void)
+/*============================*/
+{
+ lsn_t contiguous_start_lsn;
+ log_group_t* group;
+ ibool result;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* Cheap check without the mutex; repeated under the mutex below */
+ if (!srv_track_changed_pages)
+ return TRUE;
+
+ DEBUG_SYNC_C("log_online_follow_redo_log");
+
+ mutex_enter(&log_bmp_sys_mutex);
+
+ /* Tracking may have been switched off while we were waiting for the
+ mutex: log_online_read_shutdown() clears the flag while holding it. */
+ if (!srv_track_changed_pages) {
+ mutex_exit(&log_bmp_sys_mutex);
+ return TRUE;
+ }
+
+ /* Grab the LSN of the last checkpoint, we will parse up to it */
+ mutex_enter(&(log_sys->mutex));
+ log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn;
+ mutex_exit(&(log_sys->mutex));
+
+ /* Nothing new to track since the previous run */
+ if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) {
+ mutex_exit(&log_bmp_sys_mutex);
+ return TRUE;
+ }
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+ ut_a(group);
+
+ /* Reads are done in whole log blocks, so start from the beginning of
+ the block that contains the start LSN. */
+ contiguous_start_lsn = ut_uint64_align_down(log_bmp_sys->start_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ while (group) {
+ log_online_follow_log_group(group, contiguous_start_lsn);
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ result = log_online_write_bitmap();
+ /* The next run continues from where this one stopped, regardless of
+ the write result. */
+ log_bmp_sys->start_lsn = log_bmp_sys->end_lsn;
+ log_set_tracked_lsn(log_bmp_sys->start_lsn);
+
+ mutex_exit(&log_bmp_sys_mutex);
+ return result;
+}
+
+/*********************************************************************//**
+Diagnose a bitmap file range setup failure and free the partially-initialized
+bitmap file range.
+
+The file array is allocated with ut_malloc(), so it is released with the
+matching ut_free(). The pointer is reset afterwards so that the caller is
+not left with a dangling reference to the freed array. */
+UNIV_COLD
+static
+void
+log_online_diagnose_inconsistent_dir(
+/*=================================*/
+	log_online_bitmap_file_range_t	*bitmap_files)	/*!<in/out: bitmap file
+							range */
+{
+	ib_logf(IB_LOG_LEVEL_WARN,
+		"InnoDB: Warning: inconsistent bitmap file "
+		"directory for a "
+		"INFORMATION_SCHEMA.INNODB_CHANGED_PAGES query");
+
+	if (bitmap_files->files) {
+
+		ut_free(bitmap_files->files);
+		bitmap_files->files = NULL;
+	}
+}
+
+/*********************************************************************//**
+List the bitmap files in srv_data_home and setup their range that contains the
+specified LSN interval. This range, if non-empty, will start with a file that
+has the greatest LSN equal to or less than the start LSN and will include all
+the files up to the one with the greatest LSN less than the end LSN. Caller
+must free bitmap_files->files (with ut_free()) when done if bitmap_files set
+to non-NULL and this function returned TRUE. Field bitmap_files->count might
+be set to a larger value than the actual count of the files, and space for
+the unused array slots will be allocated but cleared to zeroes.
+
+The directory is scanned twice: the 1st pass finds the first and the last
+file sequence numbers to size the array, the 2nd pass fills the array in
+sequence number order. If this function returns FALSE, any array it allocated
+has already been released and bitmap_files->files is NULL.
+
+@return TRUE if succeeded
+*/
+static
+ibool
+log_online_setup_bitmap_file_range(
+/*===============================*/
+	log_online_bitmap_file_range_t	*bitmap_files,	/*!<in/out: bitmap file
+							range */
+	lsn_t				range_start,	/*!<in: start LSN */
+	lsn_t				range_end)	/*!<in: end LSN */
+{
+	os_file_dir_t	bitmap_dir;
+	os_file_stat_t	bitmap_dir_file_info;
+	ulong		first_file_seq_num	= ULONG_MAX;
+	ulong		last_file_seq_num	= 0;
+	lsn_t		first_file_start_lsn	= LSN_MAX;
+
+	ut_ad(range_end >= range_start);
+
+	bitmap_files->count = 0;
+	bitmap_files->files = NULL;
+
+	/* 1st pass: size the info array */
+
+	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
+	if (UNIV_UNLIKELY(!bitmap_dir)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"failed to open bitmap directory \'%s\'",
+			srv_data_home);
+		return FALSE;
+	}
+
+	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
+					  &bitmap_dir_file_info)) {
+
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
+
+		/* Skip everything that is not a bitmap file or that starts
+		at or past the end of the requested interval. */
+		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
+					       &file_seq_num,
+					       &file_start_lsn)
+		    || file_start_lsn >= range_end) {
+
+			continue;
+		}
+
+		if (file_seq_num > last_file_seq_num) {
+
+			last_file_seq_num = file_seq_num;
+		}
+
+		if (file_start_lsn >= range_start
+		    || file_start_lsn == first_file_start_lsn
+		    || first_file_start_lsn > range_start) {
+
+			/* A file that falls into the range */
+
+			if (file_start_lsn < first_file_start_lsn) {
+
+				first_file_start_lsn = file_start_lsn;
+			}
+			if (file_seq_num < first_file_seq_num) {
+
+				first_file_seq_num = file_seq_num;
+			}
+		} else if (file_start_lsn > first_file_start_lsn) {
+
+			/* A file that has LSN closer to the range start
+			but smaller than it, replacing another such file */
+			first_file_start_lsn = file_start_lsn;
+			first_file_seq_num = file_seq_num;
+		}
+	}
+
+	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
+
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
+			srv_data_home);
+		return FALSE;
+	}
+
+	/* No matching bitmap file was seen: an empty range is a success */
+	if (first_file_seq_num == ULONG_MAX && last_file_seq_num == 0) {
+
+		bitmap_files->count = 0;
+		return TRUE;
+	}
+
+	bitmap_files->count = last_file_seq_num - first_file_seq_num + 1;
+
+	DEBUG_SYNC_C("setup_bitmap_range_middle");
+
+	/* 2nd pass: get the file names in the file_seq_num order */
+
+	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
+	if (UNIV_UNLIKELY(!bitmap_dir)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"failed to open bitmap directory \'%s\'",
+			srv_data_home);
+		return FALSE;
+	}
+
+	bitmap_files->files
+		= static_cast<log_online_bitmap_file_range_struct::files_t *>
+		(ut_malloc(bitmap_files->count
+			   * sizeof(bitmap_files->files[0])));
+	memset(bitmap_files->files, 0,
+	       bitmap_files->count * sizeof(bitmap_files->files[0]));
+
+	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
+					  &bitmap_dir_file_info)) {
+
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
+		size_t	array_pos;
+
+		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
+					       &file_seq_num,
+					       &file_start_lsn)
+		    || file_start_lsn >= range_end
+		    || file_start_lsn < first_file_start_lsn) {
+
+			continue;
+		}
+
+		/* The directory may have changed between the two passes,
+		hence the slot has to be validated. */
+		array_pos = file_seq_num - first_file_seq_num;
+		if (UNIV_UNLIKELY(array_pos >= bitmap_files->count)) {
+
+			log_online_diagnose_inconsistent_dir(bitmap_files);
+			/* The array has been freed by the call above */
+			bitmap_files->files = NULL;
+			os_file_closedir(bitmap_dir);
+			return FALSE;
+		}
+
+
+		if (file_seq_num > bitmap_files->files[array_pos].seq_num) {
+
+			bitmap_files->files[array_pos].seq_num = file_seq_num;
+			strncpy(bitmap_files->files[array_pos].name,
+				bitmap_dir_file_info.name, FN_REFLEN);
+			bitmap_files->files[array_pos].name[FN_REFLEN - 1]
+				= '\0';
+			bitmap_files->files[array_pos].start_lsn
+				= file_start_lsn;
+		}
+	}
+
+	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
+
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
+			srv_data_home);
+		/* The array comes from ut_malloc(), so it must be released
+		with ut_free() and not with free(). */
+		ut_free(bitmap_files->files);
+		bitmap_files->files = NULL;
+		return FALSE;
+	}
+
+	if (!bitmap_files->files[0].seq_num
+	    || bitmap_files->files[0].seq_num != first_file_seq_num) {
+
+		log_online_diagnose_inconsistent_dir(bitmap_files);
+		/* The array has been freed by the call above */
+		bitmap_files->files = NULL;
+		return FALSE;
+	}
+
+	{
+		size_t	i;
+		for (i = 1; i < bitmap_files->count; i++) {
+			/* An empty slot ends the checked part of the array */
+			if (!bitmap_files->files[i].seq_num) {
+				break;
+			}
+			if ((bitmap_files->files[i].seq_num
+			     <= bitmap_files->files[i - 1].seq_num)
+			    || (bitmap_files->files[i].start_lsn
+				< bitmap_files->files[i - 1].start_lsn)) {
+
+				log_online_diagnose_inconsistent_dir(
+					bitmap_files);
+				/* The array has been freed by the call
+				above */
+				bitmap_files->files = NULL;
+				return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/****************************************************************//**
+Open a bitmap file for reading.
+
+The full path is built in bitmap_file->name from srv_data_home and the given
+name, with a path separator inserted unless srv_data_home is empty or already
+ends with one. On success the file size is stored, the read offset is reset
+to 0 and the OS is advised that the file will be read sequentially and not
+reused.
+
+@return TRUE if opened successfully */
+static
+ibool
+log_online_open_bitmap_file_read_only(
+/*==================================*/
+	const char*			name,		/*!<in: bitmap file
+							name without directory,
+							which is assumed to be
+							srv_data_home */
+	log_online_bitmap_file_t*	bitmap_file)	/*!<out: opened bitmap
+							file */
+{
+	ut_ad(name[0] != '\0');
+
+	const size_t	home_len = strlen(srv_data_home);
+	const bool	add_separator
+		= home_len
+		&& srv_data_home[home_len - 1] != SRV_PATH_SEPARATOR;
+
+	if (add_separator) {
+		ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%c%s",
+			    srv_data_home, SRV_PATH_SEPARATOR, name);
+	} else {
+		ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%s",
+			    srv_data_home, name);
+	}
+
+	ibool	opened = FALSE;
+
+	bitmap_file->file
+		= os_file_create_simple_no_error_handling(innodb_file_bmp_key,
+							  bitmap_file->name,
+							  OS_FILE_OPEN,
+							  OS_FILE_READ_ONLY,
+							  &opened);
+	if (UNIV_UNLIKELY(!opened)) {
+
+		/* Here and below assume that bitmap file names do not
+		contain apostrophes, thus no need for ut_print_filename(). */
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"error opening the changed page bitmap \'%s\'",
+			bitmap_file->name);
+		return FALSE;
+	}
+
+	bitmap_file->size = os_file_get_size(bitmap_file->file);
+	bitmap_file->offset = 0;
+
+	os_file_advise(bitmap_file->file, 0, 0, OS_FILE_ADVISE_SEQUENTIAL);
+	os_file_advise(bitmap_file->file, 0, 0, OS_FILE_ADVISE_NOREUSE);
+
+	return TRUE;
+}
+
+/****************************************************************//**
+Diagnose the state of a bitmap file whose read position is too close to the
+end of file for another full page to be read:
+1) Warn about junk if the position is not exactly at the end of file.
+2) Report a failure if the last page read did not have the last-in-run flag
+set, i.e. the file ends in the middle of a run.
+Nothing is diagnosed while at least one more full page can be read.
+
+@return FALSE for the error */
+static
+ibool
+log_online_diagnose_bitmap_eof(
+/*===========================*/
+	const log_online_bitmap_file_t*	bitmap_file,	/*!< in: bitmap file */
+	ibool				last_page_in_run)/*!< in: "last page in
+							run" flag value in the
+							last read page */
+{
+	/* Is there no room left for a full page at the current offset? */
+	const bool	near_eof
+		= (bitmap_file->size < MODIFIED_PAGE_BLOCK_SIZE)
+		|| (bitmap_file->offset
+		    > bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
+
+	if (!near_eof) {
+
+		return TRUE;
+	}
+
+	if (UNIV_UNLIKELY(bitmap_file->offset != bitmap_file->size)) {
+
+		/* Less than one page remains and we are not at EOF, so the
+		remainder is junk. This is not fatal in itself. */
+
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"junk at the end of changed page bitmap file "
+			"\'%s\'.", bitmap_file->name);
+	}
+
+	if (UNIV_LIKELY(last_page_in_run)) {
+
+		return TRUE;
+	}
+
+	/* The file ended but the last read page did not finish a run.
+	Logged as a warning because it is not a fatal error for the whole
+	server. */
+	ib_logf(IB_LOG_LEVEL_WARN,
+		"changed page bitmap file \'%s\', size "
+		UINT64PF " bytes, does not "
+		"contain a complete run at the next read "
+		"offset " UINT64PF,
+		bitmap_file->name, bitmap_file->size,
+		bitmap_file->offset);
+
+	return FALSE;
+}
+
+/*********************************************************************//**
+Initialize the log bitmap iterator for a given range. The records are
+processed at a bitmap block granularity, i.e. all the records in the same block
+share the same start and end LSN values, the exact LSN of each record is
+unavailable (nor is it defined for blocks that are touched more than once in
+the LSN interval contained in the block). Thus min_lsn and max_lsn should be
+set at block boundaries or bigger, otherwise the records at the 1st and the
+last blocks will not be returned. Also note that there might be returned
+records with LSN < min_lsn, as min_lsn is used to select the correct starting
+file but not block.
+
+An empty range (min_lsn > max_lsn, or no matching bitmap files) is a success
+that leaves the iterator without an open file or page buffer. On failure the
+iterator is marked as failed and owns no memory: the file array pointer and
+the page pointer are both NULL.
+
+@return TRUE if the iterator is initialized OK, FALSE otherwise. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_init(
+/*============================*/
+	log_bitmap_iterator_t	*i,	/*!<in/out: iterator */
+	lsn_t			min_lsn,/*!< in: start LSN */
+	lsn_t			max_lsn)/*!< in: end LSN */
+{
+	ut_a(i);
+
+	i->max_lsn = max_lsn;
+
+	if (UNIV_UNLIKELY(min_lsn > max_lsn)) {
+
+		/* Empty range */
+		i->in_files.count = 0;
+		i->in_files.files = NULL;
+		os_file_mark_invalid(&i->in.file);
+		i->page = NULL;
+		i->failed = FALSE;
+		return TRUE;
+	}
+
+	if (!log_online_setup_bitmap_file_range(&i->in_files, min_lsn,
+						max_lsn)) {
+
+		/* Every failure path of the range setup has either freed
+		the file array or never allocated it, so only the stale
+		pointers need clearing here. */
+		i->in_files.files = NULL;
+		os_file_mark_invalid(&i->in.file);
+		i->page = NULL;
+		i->failed = TRUE;
+		return FALSE;
+	}
+
+	i->in_i = 0;
+
+	if (i->in_files.count == 0) {
+
+		/* Empty range */
+		os_file_mark_invalid(&i->in.file);
+		i->page = NULL;
+		i->failed = FALSE;
+		return TRUE;
+	}
+
+	/* Open the 1st bitmap file */
+	if (UNIV_UNLIKELY(!log_online_open_bitmap_file_read_only(
+				i->in_files.files[i->in_i].name,
+				&i->in))) {
+
+		i->in_i = i->in_files.count;
+		/* The array comes from ut_malloc(), so it must be released
+		with ut_free() and not with free(). */
+		ut_free(i->in_files.files);
+		i->in_files.files = NULL;
+		i->page = NULL;
+		i->failed = TRUE;
+		return FALSE;
+	}
+
+	i->page = static_cast<byte *>(ut_malloc(MODIFIED_PAGE_BLOCK_SIZE));
+	/* Start past the last bit so that the first
+	log_online_bitmap_iterator_next() call reads the first page. */
+	i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN;
+	i->start_lsn = i->end_lsn = 0;
+	i->space_id = 0;
+	i->first_page_id = 0;
+	i->last_page_in_run = TRUE;
+	i->changed = FALSE;
+	i->failed = FALSE;
+
+	return TRUE;
+}
+
+/*********************************************************************//**
+Releases log bitmap iterator: closes the input file if one is open and frees
+the file array and the page buffer if they were allocated. The freed
+pointers are reset, in the same way as the file handle is invalidated, so
+that releasing the same iterator again does not free anything twice. The
+iterator is marked as failed afterwards. */
+UNIV_INTERN
+void
+log_online_bitmap_iterator_release(
+/*===============================*/
+	log_bitmap_iterator_t	*i)	/*!<in/out: iterator */
+{
+	ut_a(i);
+
+	if (!os_file_is_invalid(i->in.file)) {
+
+		os_file_close(i->in.file);
+		os_file_mark_invalid(&i->in.file);
+	}
+	if (i->in_files.files) {
+
+		ut_free(i->in_files.files);
+		i->in_files.files = NULL;
+	}
+	if (i->page) {
+
+		ut_free(i->page);
+		i->page = NULL;
+	}
+	i->failed = TRUE;
+}
+
+/*********************************************************************//**
+Iterates through bits of saved bitmap blocks.
+Sequentially reads blocks from bitmap file(s) and iterates through
+their bits. Ignores blocks with wrong checksum.
+
+While bits remain in the current page, a call only advances to the next bit.
+Otherwise the next page with a valid checksum is read, moving on to the next
+bitmap file of the range when the current one has no full page left, and the
+iterator fields are reloaded from the page header. On an I/O error or an
+inconsistency i->failed is set to TRUE before FALSE is returned.
+@return TRUE if iteration is successful, FALSE if all bits are iterated. */
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_next(
+/*============================*/
+ log_bitmap_iterator_t *i) /*!<in/out: iterator */
+{
+ ibool checksum_ok = FALSE;
+ ibool success;
+
+ ut_a(i);
+
+ /* An empty file range has nothing to iterate over */
+ if (UNIV_UNLIKELY(i->in_files.count == 0)) {
+
+ return FALSE;
+ }
+
+ /* NOTE(review): when bit_offset is MODIFIED_PAGE_BLOCK_BITMAP_LEN - 1
+ this increments it to MODIFIED_PAGE_BLOCK_BITMAP_LEN and tests that bit,
+ which looks like one bit past the bitmap - confirm against the block
+ layout. */
+ if (UNIV_LIKELY(i->bit_offset < MODIFIED_PAGE_BLOCK_BITMAP_LEN))
+ {
+ ++i->bit_offset;
+ i->changed =
+ IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP,
+ i->bit_offset);
+ return TRUE;
+ }
+
+ /* The current page is used up. Stop if it completed a run whose end
+ LSN already covers the requested maximum. */
+ if (i->end_lsn >= i->max_lsn && i->last_page_in_run)
+ return FALSE;
+
+ /* Read pages until one with a valid checksum is found */
+ while (!checksum_ok)
+ {
+ /* No full page is left in the current file: move on to the
+ next file of the range. */
+ while (i->in.size < MODIFIED_PAGE_BLOCK_SIZE
+ || (i->in.offset
+ > i->in.size - MODIFIED_PAGE_BLOCK_SIZE)) {
+
+ /* Advance file */
+ i->in_i++;
+ success = os_file_close_no_error_handling(
+ i->in.file);
+ os_file_mark_invalid(&i->in.file);
+ if (UNIV_UNLIKELY(!success)) {
+
+ os_file_get_last_error(TRUE);
+ i->failed = TRUE;
+ return FALSE;
+ }
+
+ /* Check that the file just closed did not end with junk
+ or in the middle of a run. */
+ success = log_online_diagnose_bitmap_eof(
+ &i->in, i->last_page_in_run);
+ if (UNIV_UNLIKELY(!success)) {
+
+ i->failed = TRUE;
+ return FALSE;
+
+ }
+
+ /* That was the last file of the range */
+ if (i->in_i == i->in_files.count) {
+
+ return FALSE;
+ }
+
+ /* A zero sequence number marks an array slot for which
+ no file was found. */
+ if (UNIV_UNLIKELY(i->in_files.files[i->in_i].seq_num
+ == 0)) {
+
+ i->failed = TRUE;
+ return FALSE;
+ }
+
+ success = log_online_open_bitmap_file_read_only(
+ i->in_files.files[i->in_i].name,
+ &i->in);
+ if (UNIV_UNLIKELY(!success)) {
+
+ i->failed = TRUE;
+ return FALSE;
+ }
+ }
+
+ success = log_online_read_bitmap_page(&i->in, i->page,
+ &checksum_ok);
+ if (UNIV_UNLIKELY(!success)) {
+
+ os_file_get_last_error(TRUE);
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "failed reading changed page bitmap file "
+ "\'%s\'", i->in_files.files[i->in_i].name);
+ i->failed = TRUE;
+ return FALSE;
+ }
+ }
+
+ /* Load the iterator state from the header of the page just read and
+ position it on the first bit. */
+ i->start_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_START_LSN);
+ i->end_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_END_LSN);
+ i->space_id = mach_read_from_4(i->page + MODIFIED_PAGE_SPACE_ID);
+ i->first_page_id = mach_read_from_4(i->page
+ + MODIFIED_PAGE_1ST_PAGE_ID);
+ i->last_page_in_run = mach_read_from_4(i->page
+ + MODIFIED_PAGE_IS_LAST_BLOCK);
+ i->bit_offset = 0;
+ i->changed = IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP,
+ i->bit_offset);
+
+ return TRUE;
+}
+
+/************************************************************//**
+Delete all the bitmap files for data less than the specified LSN.
+If called with lsn == 0 (i.e. set by RESET request) or LSN_MAX,
+restart the bitmap file sequence, otherwise continue it.
+
+If the log tracking thread is running, the whole operation is performed
+under log_bmp_sys_mutex, and log_bmp_sys is only touched in that case. When
+the current output file is among the purged ones, it is closed first and a
+new output file is started afterwards; if that fails, log tracking is
+stopped.
+
+@return FALSE to indicate success, TRUE for failure. */
+UNIV_INTERN
+ibool
+log_online_purge_changed_page_bitmaps(
+/*==================================*/
+	lsn_t	lsn)	/*!< in: LSN to purge files up to */
+{
+	log_online_bitmap_file_range_t	bitmap_files;
+	size_t				i;
+	ibool				result = FALSE;
+
+	if (lsn == 0) {
+		lsn = LSN_MAX;
+	}
+
+	bool	log_bmp_sys_inited = false;
+	if (srv_redo_log_thread_started) {
+		/* User requests might happen with both enabled and disabled
+		tracking */
+		log_bmp_sys_inited = true;
+		mutex_enter(&log_bmp_sys_mutex);
+		/* Re-check under the mutex: the thread may have been shut
+		down while we were waiting for it. */
+		if (!srv_redo_log_thread_started) {
+			log_bmp_sys_inited = false;
+			mutex_exit(&log_bmp_sys_mutex);
+		}
+	}
+
+	if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) {
+		if (log_bmp_sys_inited) {
+			mutex_exit(&log_bmp_sys_mutex);
+		}
+		return TRUE;
+	}
+
+	/* Use the decision made under the mutex instead of reading
+	srv_redo_log_thread_started again: log_bmp_sys must not be accessed
+	unless we hold log_bmp_sys_mutex, and the file closed here is only
+	reopened below when log_bmp_sys_inited is set. */
+	if (log_bmp_sys_inited && lsn > log_bmp_sys->end_lsn) {
+		/* If we have to delete the current output file, close it
+		first. */
+		os_file_close(log_bmp_sys->out.file);
+		os_file_mark_invalid(&log_bmp_sys->out.file);
+	}
+
+	for (i = 0; i < bitmap_files.count; i++) {
+
+		/* We consider the end LSN of the current bitmap, derived from
+		the start LSN of the subsequent bitmap file, to determine
+		whether to remove the current bitmap.  Note that bitmap_files
+		does not contain an entry for the bitmap past the given LSN so
+		we must check the boundary conditions as well.  For example,
+		consider 1_0.xdb and 2_10.xdb and querying LSN 5.  bitmap_files
+		will only contain 1_0.xdb and we must not delete it since it
+		represents LSNs 0-9. */
+		if ((i + 1 == bitmap_files.count
+		     || bitmap_files.files[i + 1].seq_num == 0
+		     || bitmap_files.files[i + 1].start_lsn > lsn)
+		    && (lsn != LSN_MAX)) {
+
+			break;
+		}
+		if (!os_file_delete_if_exists(innodb_file_bmp_key,
+					      bitmap_files.files[i].name)) {
+
+			os_file_get_last_error(TRUE);
+			result = TRUE;
+			break;
+		}
+	}
+
+	if (log_bmp_sys_inited) {
+		if (lsn > log_bmp_sys->end_lsn) {
+			lsn_t	new_file_lsn;
+			if (lsn == LSN_MAX) {
+				/* RESET restarts the sequence */
+				log_bmp_sys->out_seq_num = 0;
+				new_file_lsn = 0;
+			} else {
+				new_file_lsn = log_bmp_sys->end_lsn;
+			}
+			if (!log_online_rotate_bitmap_file(new_file_lsn)) {
+				/* If file create failed, stop log tracking */
+				srv_track_changed_pages = FALSE;
+			}
+		}
+
+		mutex_exit(&log_bmp_sys_mutex);
+	}
+
+	/* The array comes from ut_malloc(), so it must be released with
+	ut_free() and not with free(). It is NULL for an empty range. */
+	if (bitmap_files.files) {
+
+		ut_free(bitmap_files.files);
+	}
+	return result;
+}
diff --cc storage/xtradb/os/os0file.cc
index 89013d9068f,00000000000..b4fafb127ec
mode 100644,000000..100644
--- a/storage/xtradb/os/os0file.cc
+++ b/storage/xtradb/os/os0file.cc
@@@ -1,6177 -1,0 +1,6177 @@@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2019, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.cc
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+
+#ifdef UNIV_NONINL
+#include "os0file.ic"
+#endif
+#include "ha_prototypes.h"
+#include "ut0mem.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "btr0types.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+# include "os0thread.h"
+#else /* !UNIV_HOTBACKUP */
+# ifdef __WIN__
+/* Add includes for the _stat() call to compile on Windows */
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <errno.h>
+# endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(LINUX_NATIVE_AIO)
+#include <libaio.h>
+#endif
+
+#ifdef _WIN32
+#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
+#endif
+
+#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
+# include <sys/ioctl.h>
+# ifndef DFS_IOCTL_ATOMIC_WRITE_SET
+# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
+# endif
+#endif
+
+/** Insert buffer segment id */
+static const ulint IO_IBUF_SEGMENT = 0;
+
+/** Log segment id */
+static const ulint IO_LOG_SEGMENT = 1;
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+# define os_file_invalid (-1)
+#else
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = 0;
+# define os_file_invalid INVALID_HANDLE_VALUE
+#endif /* __WIN__ */
+
+#ifndef UNIV_HOTBACKUP
+/* We use these mutexes to protect lseek + file i/o operation, if the
+OS does not provide an atomic pread or pwrite, or similar */
+#define OS_FILE_N_SEEK_MUTEXES 16
+UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
+
+/* In simulated aio, merge at most this many consecutive i/os */
+#define OS_AIO_MERGE_N_CONSECUTIVE 64
+
+#ifdef WITH_INNODB_DISALLOW_WRITES
+#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
+#else
+#define WAIT_ALLOW_WRITES() do { } while (0)
+#endif /* WITH_INNODB_DISALLOW_WRITES */
+
+/**********************************************************************
+
+InnoDB AIO Implementation:
+=========================
+
+We support native AIO for windows and linux. For rest of the platforms
+we simulate AIO by special io-threads servicing the IO-requests.
+
+Simulated AIO:
+==============
+
+In platforms where we 'simulate' AIO following is a rough explanation
+of the high level design.
+There are four io-threads (for ibuf, log, read, write).
+All synchronous IO requests are serviced by the calling thread using
+os_file_write/os_file_read. The Asynchronous requests are queued up
+in an array (there are four such arrays) by the calling thread.
+Later these requests are picked up by the io-thread and are serviced
+synchronously.
+
+Windows native AIO:
+==================
+
+If srv_use_native_aio is not set then windows follow the same
+code as simulated AIO. If the flag is set then native AIO interface
+is used. On windows, one of the limitation is that if a file is opened
+for AIO no synchronous IO can be done on it. Therefore we have an
+extra fifth array to queue up synchronous IO requests.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO. No thread is
+required for the sync array.
+If a synchronous IO request is made, it is first queued in the sync
+array. Then the calling thread itself waits on the request, thus
+making the call synchronous.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the requests. The helper thread then collects
+the completed IO request and calls completion routine on it.
+
+Linux native AIO:
+=================
+
+If we have libaio installed on the system and innodb_use_native_aio
+is set to TRUE we follow the code path of native AIO, otherwise we
+do simulated AIO.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO.
+If a synchronous IO request is made, it is handled by calling
+os_file_write/os_file_read.
+If an AIO request is made the calling thread not only queues it in the
+array but also submits the requests. The helper thread then collects
+the completed IO request and calls completion routine on it.
+
+**********************************************************************/
+
+/** Flag: enable debug printout for asynchronous i/o */
+UNIV_INTERN ibool os_aio_print_debug = FALSE;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_bmp_key;
+#endif /* UNIV_PFS_IO */
+
+/** The asynchronous i/o array slot structure. One slot describes a single
+pending (or just completed) aio request inside an os_aio_array_t. */
+struct os_aio_slot_t{
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED control; /*!< Windows control block for the
+ aio request, MUST be first element in the structure*/
+ void *arr; /*!< Array this slot belongs to*/
+#endif
+
+ ibool is_read; /*!< TRUE if a read operation */
+ ulint pos; /*!< index of the slot in the aio
+ array */
+ ibool reserved; /*!< TRUE if this slot is reserved */
+ time_t reservation_time;/*!< time when reserved */
+ ulint len; /*!< length of the block to read or
+ write */
+ byte* buf; /*!< buffer used in i/o */
+ ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
+ os_offset_t offset; /*!< file offset in bytes */
+ pfs_os_file_t file; /*!< file where to read or write */
+ const char* name; /*!< file name or path */
+ ibool io_already_done;/*!< used only in simulated aio:
+ TRUE if the physical i/o already
+ made and only the slot message
+ needs to be passed to the caller
+ of os_aio_simulated_handle */
+ ulint space_id; /*!< NOTE(review): presumably the id
+ of the tablespace the request
+ belongs to; not established by the
+ code visible here -- confirm */
+ fil_node_t* message1; /*!< first message given by the
+ requester of an aio operation; see
+ message2 */
+ void* message2; /*!< second message given by the
+ requester of an aio operation
+ and which can be used to identify
+ which pending aio operation was
+ completed */
+#ifdef LINUX_NATIVE_AIO
+ struct iocb control; /* Linux control block for aio */
+ int n_bytes; /* bytes written/read. */
+ int ret; /* AIO return code */
+#endif /* LINUX_NATIVE_AIO */
+};
+
+/** The asynchronous i/o array structure. It owns the slots of one class of
+requests, the mutex and events that guard them and, with Linux native AIO,
+the per-segment kernel contexts and submission buffers. */
+struct os_aio_array_t{
+ os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */
+ os_event_t not_full;
+ /*!< The event which is set to the
+ signaled state when there is space in
+ the aio outside the ibuf segment;
+ os_event_set() and os_event_reset()
+ are protected by os_aio_array_t::mutex */
+ os_event_t is_empty;
+ /*!< The event which is set to the
+ signaled state when there are no
+ pending i/os in this array;
+ os_event_set() and os_event_reset()
+ are protected by os_aio_array_t::mutex */
+ ulint n_slots;/*!< Total number of slots in the aio
+ array. This must be divisible by
+ n_threads. */
+ ulint n_segments;
+ /*!< Number of segments in the aio
+ array of pending aio requests. A
+ thread can wait separately for any one
+ of the segments. */
+ ulint cur_seg;/*!< We reserve IO requests in round
+ robin fashion to different segments.
+ This points to the segment that is to
+ be used to service next IO request. */
+ ulint n_reserved;
+ /*!< Number of reserved slots in the
+ aio array outside the ibuf segment */
+ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
+
+#if defined(LINUX_NATIVE_AIO)
+ io_context_t* aio_ctx;
+ /* completion queue for IO. There is
+ one such queue per segment. Each thread
+ will work on one ctx exclusively. */
+ struct io_event* aio_events;
+ /* The array to collect completed IOs.
+ There is one such event for each
+ possible pending IO. The size of the
+ array is equal to n_slots. */
+ struct iocb** pending;
+ /* Array to buffer the not-submitted aio
+ requests. The array length is n_slots.
+ It is divided into n_segments segments.
+ pending requests on each segment are buffered
+ separately.*/
+ ulint* count;
+ /* Array of length n_segments. Each element
+ counts the number of not-submitted aio
+ request on that segment.*/
+#endif /* LINUX_NATIVE_AIO */
+};
+
+#if defined(LINUX_NATIVE_AIO)
+/** timeout for each io_getevents() call = 500ms. */
+#define OS_AIO_REAP_TIMEOUT (500000000UL)
+
+/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
+#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
+
+/** number of attempts before giving up on io_setup(). */
+#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
+#endif
+
+/** Array of events used in simulated aio. */
+static os_event_t* os_aio_segment_wait_events;
+
+/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+are NULL when the module has not yet been initialized. @{ */
+static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
+static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
+static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
+static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
+static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
+/* @} */
+
+/** Number of asynchronous I/O segments. Set by os_aio_init(). */
+static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+/** If the following is TRUE, read i/o handler threads try to
+wait until a batch of new read requests have been posted */
+static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+/* Global file i/o statistics.
+NOTE(review): the code that updates and prints these counters is outside
+this part of the file. Judging by the names, the *_old variables and
+os_last_printout hold the values and the time of the previous statistics
+printout -- confirm against their users. */
+UNIV_INTERN ulint os_n_file_reads = 0;
+UNIV_INTERN ulint os_bytes_read_since_printout = 0;
+UNIV_INTERN ulint os_n_file_writes = 0;
+UNIV_INTERN ulint os_n_fsyncs = 0;
+UNIV_INTERN ulint os_n_file_reads_old = 0;
+UNIV_INTERN ulint os_n_file_writes_old = 0;
+UNIV_INTERN ulint os_n_fsyncs_old = 0;
+UNIV_INTERN time_t os_last_printout;
+
+/* Set to TRUE by os_file_handle_error_cond_exit() when it reports a
+disk-full error, so that the "Disk is full" warning is printed only once. */
+UNIV_INTERN ibool os_has_said_disk_full = FALSE;
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Runs the expensive os_aio_validate() consistency check only on every
+OS_AIO_VALIDATE_SKIP-th invocation; all other invocations report success
+without checking anything.
+@return TRUE if the check passed or was skipped this time */
+UNIV_INTERN
+ibool
+os_aio_validate_skip(void)
+/*======================*/
+{
+/** Try os_aio_validate() every this many times */
+# define OS_AIO_VALIDATE_SKIP 13
+
+	/** Calls left until the next real validation. The type is signed
+	because concurrent unsynchronized decrements may push the value
+	below zero. */
+	static int	os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+
+	/* The counter is deliberately not protected: it is only a
+	heuristic that throttles the costly os_aio_validate() call in
+	debug builds, so an occasional lost update is harmless. */
+	const int	calls_left = --os_aio_validate_count;
+
+	if (calls_left <= 0) {
+		os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+		return(os_aio_validate());
+	}
+
+	return(TRUE);
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+
+#ifdef _WIN32
+/** IO completion port used by background io threads */
+static HANDLE completion_port;
+/** IO completion port used by background io READ threads */
+static HANDLE read_completion_port;
+/** Thread local storage index for the per-thread event used for synchronous IO */
+static DWORD tls_sync_io = TLS_OUT_OF_INDEXES;
+#endif
+
+#ifdef __WIN__
+/***********************************************************************//**
+Determines which Windows generation the server is running on, using
+GetVersionEx(). Implemented only for Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7. */
+UNIV_INTERN
+ulint
+os_get_os_version(void)
+/*===================*/
+{
+	OSVERSIONINFO	version;
+
+	version.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+	ut_a(GetVersionEx(&version));
+
+	/* Handle the non-NT platforms first. */
+	switch (version.dwPlatformId) {
+	case VER_PLATFORM_WIN32s:
+		return(OS_WIN31);
+	case VER_PLATFORM_WIN32_WINDOWS:
+		return(OS_WIN95);
+	case VER_PLATFORM_WIN32_NT:
+		break;
+	default:
+		ut_error;
+		return(0);
+	}
+
+	/* NT family: the major version selects the generation and the
+	minor version distinguishes the two releases that share a major
+	number. */
+	const bool	first_minor = (version.dwMinorVersion == 0);
+
+	switch (version.dwMajorVersion) {
+	case 3:
+	case 4:
+		return(OS_WINNT);
+	case 5:
+		if (first_minor) {
+			return(OS_WIN2000);
+		}
+		return(OS_WINXP);
+	case 6:
+		if (first_minor) {
+			return(OS_WINVISTA);
+		}
+		return(OS_WIN7);
+	default:
+		/* Anything else is reported as OS_WIN7. */
+		return(OS_WIN7);
+	}
+}
+#endif /* __WIN__ */
+
+
+#ifdef _WIN32
+/*
+Windows : Handling synchronous IO on files opened asynchronously.
+
+If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to
+a completion port, then every IO on this file would normally be enqueued to the
+completion port. Sometimes however we would like to do a synchronous IO. This is
+possible if we initialize overlapped.hEvent with a valid event and set its
+lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)
+
+We'll create this special event once for each thread and store in thread local
+storage.
+*/
+
+
+/***********************************************************************//**
+Allocates the TLS index that holds the per-thread event handle used for
+synchronous IO on files that might be opened with FILE_FLAG_OVERLAPPED.
+
+The function is idempotent. It is reached both from os_io_init_simple() and
+lazily from win_get_syncio_event(); allocating a second index would leak the
+first one and orphan every event already stored under it.
+*/
+static void win_init_syncio_event()
+{
+	if (tls_sync_io != TLS_OUT_OF_INDEXES) {
+		/* Already initialized; keep the existing index. */
+		return;
+	}
+
+	tls_sync_io = TlsAlloc();
+	ut_a(tls_sync_io != TLS_OUT_OF_INDEXES);
+}
+
+/***********************************************************************//**
+Returns the calling thread's event for doing synchronous io on
+asynchronously opened files. The event is created on first use and cached
+in thread local storage. The returned handle has its lowest order bit set.
+*/
+static HANDLE win_get_syncio_event()
+{
+	if (tls_sync_io == TLS_OUT_OF_INDEXES) {
+		win_init_syncio_event();
+	}
+
+	HANDLE	cached = (HANDLE)TlsGetValue(tls_sync_io);
+
+	if (!cached) {
+		/* First use on this thread: create an auto-reset,
+		initially non-signaled, unnamed event. */
+		HANDLE	event = CreateEventA(NULL, FALSE, FALSE, NULL);
+		ut_a(event);
+
+		/* Tag the handle with the low bit before caching it. */
+		cached = (HANDLE)((uintptr_t)event | 1);
+		TlsSetValue(tls_sync_io, cached);
+	}
+
+	return cached;
+}
+
+/*
+ TLS destructor, inspired by Chromium code
+ http://src.chromium.org/svn/trunk/src/base/threading/thread_local_storage_w…
+*/
+
+/* Closes the calling thread's synchronous-IO event, if the thread ever
+created one. The TLS slot is read directly rather than through
+win_get_syncio_event(), because that getter creates an event (and, if
+needed, the TLS index) on demand; a thread that never did synchronous IO
+would otherwise allocate an event at exit only to close it again. */
+static void win_free_syncio_event()
+{
+	if (tls_sync_io == TLS_OUT_OF_INDEXES) {
+		/* No TLS index was ever allocated, so nothing is cached. */
+		return;
+	}
+
+	HANDLE h = (HANDLE)TlsGetValue(tls_sync_io);
+	if (h) {
+		CloseHandle(h);
+		/* Do not leave a closed handle behind in the slot. */
+		TlsSetValue(tls_sync_io, NULL);
+	}
+}
+
+/* TLS callback: frees the per-thread synchronous-IO event when a thread or
+the whole process detaches. The module and reserved arguments are unused. */
+static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved) {
+	if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason)
+		win_free_syncio_event();
+}
+
+/* Register win_tls_thread_exit by placing a pointer to it into the
+.CRT$XLB section and forcing the linker to keep both the TLS directory
+symbol and the pointer itself. */
+extern "C" {
+#ifdef _WIN64
+#pragma comment(linker, "/INCLUDE:_tls_used")
+#pragma comment(linker, "/INCLUDE:p_thread_callback_base")
+#pragma const_seg(".CRT$XLB")
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_base;
+const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
+/* Reset the const segment that was selected above. The matching reset for
+const_seg is const_seg(), not data_seg(); otherwise later const data of this
+file would still be directed to .CRT$XLB. */
+#pragma const_seg()
+#else
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_base")
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
+/* Reset the default data segment. */
+#pragma data_seg()
+#endif
+}
+#endif /*_WIN32 */
+
+/***********************************************************************//**
+Prints a set of informational hints when an I/O call failed with EINVAL
+while innodb_flush_method is ALL_O_DIRECT. In any other situation nothing
+is printed.
+@return true if the diagnostic message was printed
+@return false if the diagnostic message does not apply */
+static
+bool
+os_diagnose_all_o_direct_einval(
+/*============================*/
+	ulint	err)	/*!< in: C error code */
+{
+	const bool	applies = (err == EINVAL)
+		&& (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT);
+
+	if (!applies) {
+		return(false);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"The error might be caused by redo log I/O not "
+		"satisfying innodb_flush_method=ALL_O_DIRECT "
+		"requirements by the underlying file system.");
+
+	/* A non-default log block size gets an extra hint. */
+	if (srv_log_block_size != 512) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"This might be caused by an incompatible "
+			"non-default innodb_log_block_size value %lu.",
+			srv_log_block_size);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Please file a bug at https://bugs.percona.com and "
+		"include this error message, my.cnf settings, and "
+		"information about the file system where the redo log "
+		"resides.");
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"A possible workaround is to change "
+		"innodb_flush_method value to something else "
+		"than ALL_O_DIRECT.");
+
+	return(true);
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). The OS error (GetLastError() on Windows, errno
+elsewhere) is optionally described on stderr and then translated to one of
+the OS_FILE_* codes. If the number is not known to this program,
+OS_FILE_ERROR_MAX + the OS error number is returned.
+@return 0 if the OS reports no error, otherwise an OS_FILE_* error number,
+or OS_FILE_ERROR_MAX + OS error number */
+static
+ulint
+os_file_get_last_error_low(
+/*=======================*/
+ bool report_all_errors, /*!< in: TRUE if we want an error
+ message printed of all errors */
+ bool on_error_silent) /*!< in: TRUE then don't print any
+ diagnostic to the log; ignored when
+ report_all_errors is TRUE */
+{
+#ifdef __WIN__
+
+ ulint err = (ulint) GetLastError();
+ if (err == ERROR_SUCCESS) {
+ return(0);
+ }
+
+ /* Disk-full and file-exists errors are printed only when
+ report_all_errors is set; everything else is printed unless the
+ caller asked for silence. */
+ if (report_all_errors
+ || (!on_error_silent
+ && err != ERROR_DISK_FULL
+ && err != ERROR_FILE_EXISTS)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %lu"
+ " in a file operation.\n", (ulong) err);
+
+ if (err == ERROR_PATH_NOT_FOUND) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == ERROR_ACCESS_DENIED) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory. It may also be"
+ " you have created a subdirectory\n"
+ "InnoDB: of the same name as a data file.\n");
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ fprintf(stderr,
+ "InnoDB: The error means that another program"
+ " is using InnoDB's files.\n"
+ "InnoDB: This might be a backup or antivirus"
+ " software or another instance\n"
+ "InnoDB: of MySQL."
+ " Please close it to get rid of this error.\n");
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ fprintf(stderr,
+ "InnoDB: The error means that there are no"
+ " sufficient system resources or quota to"
+ " complete the operation.\n");
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ fprintf(stderr,
+ "InnoDB: The error means that the I/O"
+ " operation has been aborted\n"
+ "InnoDB: because of either a thread exit"
+ " or an application request.\n"
+ "InnoDB: Retry attempt is made.\n");
+ } else {
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ /* Translate the Windows error to the portable OS_FILE_* code. */
+ if (err == ERROR_FILE_NOT_FOUND) {
+ return(OS_FILE_NOT_FOUND);
+ } else if (err == ERROR_DISK_FULL) {
+ return(OS_FILE_DISK_FULL);
+ } else if (err == ERROR_FILE_EXISTS) {
+ return(OS_FILE_ALREADY_EXISTS);
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ return(OS_FILE_SHARING_VIOLATION);
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ return(OS_FILE_INSUFFICIENT_RESOURCE);
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ return(OS_FILE_OPERATION_ABORTED);
+ } else if (err == ERROR_ACCESS_DENIED) {
+ return(OS_FILE_ACCESS_VIOLATION);
+ } else if (err == ERROR_BUFFER_OVERFLOW) {
+ return(OS_FILE_NAME_TOO_LONG);
+ } else {
+ return(OS_FILE_ERROR_MAX + err);
+ }
+#else
+ int err = errno;
+ if (err == 0) {
+ return(0);
+ }
+
+ /* ENOSPC and EEXIST are printed only when report_all_errors is
+ set; everything else is printed unless the caller asked for
+ silence. */
+ if (report_all_errors
+ || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %d"
+ " in a file operation.\n", err);
+
+ if (err == ENOENT) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == EACCES) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory.\n");
+ } else if (!os_diagnose_all_o_direct_einval(err)) {
+ /* No ALL_O_DIRECT specific hint was printed: fall
+ back to the generic description of the error. */
+ if (strerror(err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d"
+ " means '%s'.\n",
+ err, strerror(err));
+ }
+
+
+ fprintf(stderr,
+ "InnoDB: Some operating system"
+ " error numbers are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ /* Translate errno to the portable OS_FILE_* code. */
+ switch (err) {
+ case ENOSPC:
+ return(OS_FILE_DISK_FULL);
+ case ENOENT:
+ return(OS_FILE_NOT_FOUND);
+ case EEXIST:
+ return(OS_FILE_ALREADY_EXISTS);
+ case ENAMETOOLONG:
+ return(OS_FILE_NAME_TOO_LONG);
+ case EXDEV:
+ case ENOTDIR:
+ case EISDIR:
+ return(OS_FILE_PATH_ERROR);
+ case EAGAIN:
+ /* EAGAIN and EINTR get a dedicated code only when native
+ AIO is in use; otherwise they are reported as unknown
+ errors below. */
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_RESOURCES_RESERVED);
+ }
+ break;
+ case EINTR:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_INTERRUPTED);
+ }
+ break;
+ case EACCES:
+ return(OS_FILE_ACCESS_VIOLATION);
+ }
+ return(OS_FILE_ERROR_MAX + err);
+#endif
+}
+
+/***********************************************************************//**
+Public wrapper around os_file_get_last_error_low() that never suppresses
+diagnostics. The error number should be retrieved before any other OS calls
+(because they may overwrite it).
+@return error number as returned by os_file_get_last_error_low() */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+	bool	report_all_errors)	/*!< in: TRUE if we want an error
+					message printed of all errors */
+{
+	/* Callers of the public entry point are never silent. */
+	const bool	on_error_silent = false;
+
+	const ulint	last_error = os_file_get_last_error_low(
+		report_all_errors, on_error_silent);
+
+	return(last_error);
+}
+
+/****************************************************************//**
+Decides what to do after a failed file operation: retry, give up, or
+terminate the server. For an error that has no dedicated handling the
+process calls exit(3) when should_exit is TRUE; in that case
+on_error_silent is ignored and the error is always logged.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_cond_exit(
+/*===========================*/
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	ibool		should_exit,	/*!< in: call exit(3) if unknown error
+					and this parameter is TRUE */
+	ibool		on_error_silent)/*!< in: if TRUE then don't print
+					any message to the log iff it is
+					an unknown non-fatal error */
+{
+	const ulint	os_err = os_file_get_last_error_low(
+		false, on_error_silent);
+
+	switch (os_err) {
+	case OS_FILE_AIO_RESOURCES_RESERVED:
+	case OS_FILE_AIO_INTERRUPTED:
+		/* Retry immediately. */
+		return(TRUE);
+
+	case OS_FILE_SHARING_VIOLATION:
+		/* Wait before retrying. */
+		os_thread_sleep(10000000);	/* 10 sec */
+		return(TRUE);
+
+	case OS_FILE_OPERATION_ABORTED:
+	case OS_FILE_INSUFFICIENT_RESOURCE:
+		/* Back off briefly, then retry. */
+		os_thread_sleep(100000);	/* 100 ms */
+		return(TRUE);
+
+	case OS_FILE_PATH_ERROR:
+	case OS_FILE_ALREADY_EXISTS:
+	case OS_FILE_ACCESS_VIOLATION:
+		/* Retrying would not help. */
+		return(FALSE);
+
+	case OS_FILE_DISK_FULL:
+		/* The disk-full warning is printed only once, and it is
+		printed irrespective of the on_error_silent setting. */
+		if (!os_has_said_disk_full) {
+			if (name) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: Encountered a problem with"
+					" file %s\n", name);
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Disk is full. Try to clean the disk"
+				" to free space.\n");
+
+			os_has_said_disk_full = TRUE;
+
+			fflush(stderr);
+
+			ut_error;
+		}
+
+		return(FALSE);
+
+	default:
+		break;
+	}
+
+	/* Error without dedicated handling. If the operation can crash on
+	error it is better to ignore on_error_silent and print an error
+	message to the log. */
+	const bool	must_log = should_exit || !on_error_silent;
+
+	if (must_log) {
+		ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
+			"error " ULINTPF ".%s", name ? name : "(unknown)",
+			operation, os_err, should_exit
+			? " Cannot continue operation" : "");
+	}
+
+	if (should_exit) {
+		exit(1);
+	}
+
+	return(FALSE);
+}
+
+/****************************************************************//**
+Handles a failed file operation in the strict mode: diagnostics are never
+suppressed and an error without dedicated handling terminates the server.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error(
+/*=================*/
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation)	/*!< in: operation */
+{
+	/* exit in case of unknown error, and always report it */
+	const ibool	retry = os_file_handle_error_cond_exit(
+		name, operation, TRUE, FALSE);
+
+	return(retry);
+}
+
+/****************************************************************//**
+Handles a failed file operation in the lenient mode: an error without
+dedicated handling never makes this function call exit(3).
+@return TRUE if we should retry the operation */
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	ibool		on_error_silent)/*!< in: if TRUE then don't print
+					any message to the log. */
+{
+	/* don't exit in case of unknown error */
+	const ibool	should_exit = FALSE;
+
+	const ibool	retry = os_file_handle_error_cond_exit(
+		name, operation, should_exit, on_error_silent);
+
+	return(retry);
+}
+
+#undef USE_FILE_LOCK
+#define USE_FILE_LOCK
+#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
+/* InnoDB Hot Backup does not lock the data files.
+ * On Windows, mandatory locking is used.
+ */
+# undef USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/****************************************************************//**
+Obtain an exclusive lock on a file. A write lock covering the whole file
+(start 0, length 0) is requested with fcntl(F_SETLK). On failure the reason
+is logged; for EAGAIN and EACCES a hint about another mysqld process using
+the same files is logged as well.
+@return 0 on success, -1 if the lock could not be obtained */
+static
+int
+os_file_lock(
+/*=========*/
+	int		fd,	/*!< in: file descriptor */
+	const char*	name)	/*!< in: file name */
+{
+	struct flock lk;
+
+	ut_ad(!srv_read_only_mode);
+
+	lk.l_type = F_WRLCK;
+	lk.l_whence = SEEK_SET;
+	lk.l_start = lk.l_len = 0;
+
+	if (fcntl(fd, F_SETLK, &lk) == -1) {
+		/* Remember the reason at once: the logging call below is
+		free to overwrite errno, which would make the test for
+		EAGAIN/EACCES unreliable. */
+		const int	lock_errno = errno;
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Unable to lock %s, error: %d", name, lock_errno);
+
+		if (lock_errno == EAGAIN || lock_errno == EACCES) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Check that you do not already have "
+				"another mysqld process using the "
+				"same InnoDB data or log files.");
+		}
+
+		return(-1);
+	}
+
+	return(0);
+}
+#endif /* USE_FILE_LOCK */
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates every mutex of os_file_seek_mutexes, which protect positioned
+reads and writes. On Windows it additionally sets up the TLS index used
+for synchronous IO events. */
+static
+void
+os_io_init_simple(void)
+/*===================*/
+{
+	ulint	n_created = 0;
+
+	while (n_created < OS_FILE_N_SEEK_MUTEXES) {
+		os_file_seek_mutexes[n_created] = os_mutex_create();
+		n_created++;
+	}
+#ifdef _WIN32
+	win_init_syncio_event();
+#endif
+}
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the given parameter path. If the path
+is null then it will create the file in the mysql server configuration
+parameter (--tmpdir).
+On failure an error message with the errno of the failed step is printed
+to stderr, and a descriptor that was already obtained is closed again.
+@param[in]	path	location for creating temporary file
+@return temporary file handle, or NULL on error */
+UNIV_INTERN
+FILE*
+os_file_create_tmpfile(
+	const char*	path)
+{
+	FILE*	file = NULL;
+	int	fd;
+	WAIT_ALLOW_WRITES();
+	fd = innobase_mysql_tmpfile(path);
+
+	ut_ad(!srv_read_only_mode);
+
+	if (fd >= 0) {
+		file = fdopen(fd, "w+b");
+	}
+
+	if (!file) {
+		/* Capture errno before printing anything: the timestamp
+		and stdio calls below may overwrite it, and the message
+		must report the error of the step that actually failed. */
+		const int	create_errno = errno;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error: unable to create temporary file;"
+			" errno: %d\n", create_errno);
+		if (fd >= 0) {
+			/* fdopen() failed, so the descriptor is still
+			ours to close. */
+			close(fd);
+		}
+	}
+
+	return(file);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Opens a directory stream for the directory named by dirname, positioned at
+the first entry. In both Unix and Windows the '.' and '..' items at the
+start of the directory listing are skipped automatically when the stream is
+read.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+	const char*	dirname,	/*!< in: directory name; it must not
+					contain a trailing '\' or '/' */
+	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
+					error as a fatal error; if we try to
+					open symlinks then we do not wish a
+					fatal error if it happens not to be
+					a directory */
+{
+#ifdef __WIN__
+	char		pattern[OS_FILE_MAX_PATH + 3];
+	const size_t	dirname_len = strlen(dirname);
+
+	ut_a(dirname_len < OS_FILE_MAX_PATH);
+
+	/* Build the search pattern "<dirname>\*". */
+	strcpy(pattern, dirname);
+	strcpy(pattern + dirname_len, "\\*");
+
+	/* On Windows, opening the 'directory stream' also retrieves the
+	first entry of the directory. That entry is '.', which would be
+	skipped together with '..' anyway, so it is simply discarded. */
+
+	LPWIN32_FIND_DATA	first_entry = static_cast<LPWIN32_FIND_DATA>(
+		ut_malloc(sizeof(WIN32_FIND_DATA)));
+
+	os_file_dir_t	stream = FindFirstFile((LPCTSTR) pattern, first_entry);
+
+	ut_free(first_entry);
+
+	if (stream != INVALID_HANDLE_VALUE) {
+		return(stream);
+	}
+
+	if (error_is_fatal) {
+		os_file_handle_error(dirname, "opendir");
+	}
+
+	return(NULL);
+#else
+	os_file_dir_t	stream = opendir(dirname);
+
+	if (stream != NULL) {
+		return(stream);
+	}
+
+	if (error_is_fatal) {
+		os_file_handle_error(dirname, "opendir");
+	}
+
+	return(NULL);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Closes a directory stream. A failure is reported through
+os_file_handle_error_no_exit(), so the server is not terminated here.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+	os_file_dir_t	dir)	/*!< in: directory stream */
+{
+#ifdef __WIN__
+	if (FindClose(dir)) {
+		return(0);
+	}
+
+	os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+
+	return(-1);
+#else
+	const int	rc = closedir(dir);
+
+	if (rc != 0) {
+		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+	}
+
+	/* closedir() result is passed through unchanged. */
+	return(rc);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Returns information about the next entry of an open directory stream.
+The '.' and '..' entries are skipped. On POSIX systems an entry that
+disappears between readdir() and stat() is skipped as well.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+	const char*	dirname,/*!< in: directory name or path */
+	os_file_dir_t	dir,	/*!< in: directory stream */
+	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
+{
+#ifdef __WIN__
+	/* The find buffer lives on the stack: no allocation is needed, and
+	nothing runs between FindNextFile() and GetLastError() that could
+	overwrite the thread's last-error code. */
+	WIN32_FIND_DATA	find_data;
+
+	do {
+		if (!FindNextFile(dir, &find_data)) {
+
+			if (GetLastError() == ERROR_NO_MORE_FILES) {
+
+				return(1);
+			}
+
+			os_file_handle_error_no_exit(
+				NULL, "readdir_next_file", FALSE);
+
+			return(-1);
+		}
+
+		ut_a(strlen((char*) find_data.cFileName)
+		     < OS_FILE_MAX_PATH);
+
+	} while (strcmp((char*) find_data.cFileName, ".") == 0
+		 || strcmp((char*) find_data.cFileName, "..") == 0);
+
+	strcpy(info->name, (char*) find_data.cFileName);
+
+	info->size = (ib_int64_t)(find_data.nFileSizeLow)
+		+ (((ib_int64_t)(find_data.nFileSizeHigh)) << 32);
+
+	if (find_data.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) {
+		/* TODO: test Windows symlinks */
+		/* TODO: MySQL has apparently its own symlink
+		implementation in Windows, dbname.sym can
+		redirect a database directory:
+		REFMAN "windows-symbolic-links.html" */
+		info->type = OS_FILE_TYPE_LINK;
+	} else if (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+		info->type = OS_FILE_TYPE_DIR;
+	} else {
+		/* It is probably safest to assume that all other
+		file types are normal. Better to check them rather
+		than blindly skip them. */
+
+		info->type = OS_FILE_TYPE_FILE;
+	}
+
+	return(0);
+#else
+	struct dirent*	ent;
+	char*		full_path;
+	struct stat	statinfo;
+
+	for (;;) {
+		/* readdir() returns NULL both at the end of the directory
+		and on error; only errno tells the two apart, so it has to
+		be cleared before every call. */
+		errno = 0;
+
+		ent = readdir(dir);
+
+		if (ent == NULL) {
+
+			if (errno != 0) {
+				os_file_handle_error_no_exit(
+					dirname, "readdir", FALSE);
+
+				return(-1);
+			}
+
+			return(1);
+		}
+
+		ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
+
+		if (strcmp(ent->d_name, ".") == 0
+		    || strcmp(ent->d_name, "..") == 0) {
+
+			continue;
+		}
+
+		strcpy(info->name, ent->d_name);
+
+		/* 10 spare bytes cover the '/' separator and the
+		terminating NUL. */
+		full_path = static_cast<char*>(
+			ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
+
+		sprintf(full_path, "%s/%s", dirname, ent->d_name);
+
+		if (stat(full_path, &statinfo) == 0) {
+
+			break;
+		}
+
+		if (errno != ENOENT) {
+
+			os_file_handle_error_no_exit(full_path, "stat", FALSE);
+
+			ut_free(full_path);
+
+			return(-1);
+		}
+
+		/* readdir() returned a file that does not exist,
+		it must have been deleted in the meantime. Do what
+		would have happened if the file was deleted before
+		readdir() - ignore and go to the next entry.
+		If this is the last entry then info->name will still
+		contain the name of the deleted file when this
+		function returns, but this is not an issue since the
+		caller shouldn't be looking at info when end of
+		directory is returned. */
+
+		ut_free(full_path);
+	}
+
+	info->size = (ib_int64_t) statinfo.st_size;
+
+	/* NOTE(review): stat() follows symbolic links, so the S_ISLNK
+	branch is not expected to match here; lstat() would be needed to
+	detect links - confirm the intent before changing it. */
+	if (S_ISDIR(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_FILE;
+	} else {
+		info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	ut_free(full_path);
+
+	return(0);
+#endif
+}
+
+/*****************************************************************//**
+Attempts to create a directory named pathname. The new directory gets
+default permissions; on Unix the permissions are (0770 & ~umask).
+A directory that already exists counts as success unless the
+fail_if_exists argument is true. Any other failure, such as a permission
+error, does not crash: the error is reported and FALSE is returned.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+	const char*	pathname,	/*!< in: directory name as
+					null-terminated string */
+	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
+					is treated as an error. */
+{
+#ifdef __WIN__
+	if (CreateDirectory((LPCTSTR) pathname, NULL) != 0) {
+
+		return(TRUE);
+	}
+
+	/* The call failed: tolerate only "already exists", and only when
+	the caller allows it. */
+	if (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error_no_exit(pathname, "CreateDirectory", FALSE);
+
+	return(FALSE);
+#else
+	WAIT_ALLOW_WRITES();
+
+	if (mkdir(pathname, 0770) == 0) {
+
+		return(TRUE);
+	}
+
+	/* The call failed: tolerate only EEXIST, and only when the caller
+	allows it. */
+	if (errno == EEXIST && !fail_if_exists) {
+
+		return(TRUE);
+	}
+
+	/* failure */
+	os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
+
+	return(FALSE);
+#endif /* __WIN__ */
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file. A failed open is passed to
+os_file_handle_error() and is repeated for as long as that handler asks
+for a retry.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_func(
+/*=======================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ os_file_t file;
+ ibool retry;
+
+ /* Pessimistic default; set to TRUE only after a successful open. */
+ *success = FALSE;
+#ifdef __WIN__
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+
+ /* The error-handling modifier bits are not accepted by this function. */
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ /* Translate create_mode into a CreateFile() creation disposition.
+ In read-only mode every request is turned into opening an existing file. */
+ if (create_mode == OS_FILE_OPEN) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ ut_a(!srv_read_only_mode);
+
+ /* Create subdirs along the path if needed */
+ *success = os_file_create_subdirs_if_needed(name);
+
+ if (!*success) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create subdirectories '%s'",
+ name);
+
+ return((os_file_t) -1);
+ }
+
+ /* With the directories in place, continue as a plain
+ OS_FILE_CREATE. */
+ create_flag = CREATE_NEW;
+ create_mode = OS_FILE_CREATE;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ /* Translate access_type into the desired access rights; read-only
+ mode downgrades a read-write request to read access. */
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (srv_read_only_mode) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "read only mode set. Unable to "
+ "open file '%s' in RW mode, trying RO mode", name);
+
+ access = GENERIC_READ;
+
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file access type (%lu) for file '%s'",
+ access_type, name);
+
+ return((os_file_t) -1);
+ }
+
+ do {
+ /* Use default security attributes and no template file. */
+
+ file = CreateFile(
+ (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
+ create_flag, attributes, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+
+ *success = FALSE;
+
+ /* The handler decides whether the open is attempted again. */
+ retry = os_file_handle_error(
+ name, create_mode == OS_FILE_OPEN ?
+ "open" : "create");
+
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+#else /* __WIN__ */
+ int create_flag;
+ /* WAIT_ALLOW_WRITES() is invoked for every mode other than the two
+ pure open modes. */
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
+
+ /* The error-handling modifier bits are not accepted by this function. */
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ /* Translate create_mode and access_type into open(2) flags. In
+ read-only mode every request ends up as O_RDONLY. */
+ if (create_mode == OS_FILE_OPEN) {
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ create_flag = O_RDONLY;
+ } else if (srv_read_only_mode) {
+ create_flag = O_RDONLY;
+ } else {
+ create_flag = O_RDWR;
+ }
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ /* Create subdirs along the path if needed */
+
+ *success = os_file_create_subdirs_if_needed(name);
+
+ if (!*success) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create subdirectories '%s'",
+ name);
+
+ return((os_file_t) -1);
+ }
+
+ /* With the directories in place, continue as a plain
+ OS_FILE_CREATE. */
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ create_mode = OS_FILE_CREATE;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ do {
+ /* O_CLOEXEC asks for the descriptor to be closed automatically
+ when the process calls exec(). */
- file = ::open(name, create_flag, os_innodb_umask);
++ file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file == -1) {
+ *success = FALSE;
+
+ /* The handler decides whether the open is attempted again. */
+ retry = os_file_handle_error(
+ name,
+ create_mode == OS_FILE_OPEN
+ ? "open" : "create");
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+#ifdef USE_FILE_LOCK
+ /* A successful read-write open must also obtain the file lock; a
+ non-zero result from os_file_lock() is treated as failure and the
+ descriptor is closed again. */
+ if (!srv_read_only_mode
+ && *success
+ && access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) {
+
+ *success = FALSE;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/** Requests unbuffered I/O for a file when the file type and the configured
+flush method call for it. Nothing is done in read-only mode.
+@param file		handle to the file
+@param name		name of the file, for diagnostics
+@param mode_str	operation on the file, for diagnostics
+@param type		OS_LOG_FILE or OS_DATA_FILE
+@param access_type	if OS_FILE_READ_WRITE_CACHED, the function returns
+without touching the file, so OS caching stays as it is; ignored otherwise */
+static
+void
+os_file_set_nocache_if_needed(os_file_t file, const char* name,
+			      const char *mode_str, ulint type,
+			      ulint access_type)
+{
+	if (srv_read_only_mode) {
+		return;
+	}
+
+	if (access_type == OS_FILE_READ_WRITE_CACHED) {
+		return;
+	}
+
+	/* ALL_O_DIRECT applies to every file type; the two other O_DIRECT
+	methods apply to everything except log files. */
+	bool	wanted = (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT);
+
+	if (!wanted && type != OS_LOG_FILE) {
+		wanted = srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
+			|| srv_unix_file_flush_method
+			   == SRV_UNIX_O_DIRECT_NO_FSYNC;
+	}
+
+	if (!wanted) {
+		return;
+	}
+
+	if (os_file_set_nocache(file, name, mode_str)) {
+		return;
+	}
+
+	/* Do fsync() on log files when setting O_DIRECT fails.
+	See log_io_complete() */
+	if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+		srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
+	}
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file. The open is attempted once;
+a failure is only reflected in *success and in the returned handle, no
+error handler is invoked for it.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+/*=========================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE,
+ OS_FILE_READ_ALLOW_DELETE (used by a backup
+ program reading the file), or
+ OS_FILE_READ_WRITE_CACHED (disable O_DIRECT
+ if it would be enabled otherwise) */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ pfs_os_file_t file;
+
+ /* Pessimistic default; overwritten after the open attempt. */
+ *success = FALSE;
+#ifdef __WIN__
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+ DWORD share_mode = FILE_SHARE_READ;
+ ut_a(name);
+
+ /* The error-handling modifier bits are not accepted by this function. */
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ /* Translate create_mode into a CreateFile() creation disposition.
+ In read-only mode every request is turned into opening an existing file. */
+ if (create_mode == OS_FILE_OPEN) {
+ create_flag = OPEN_EXISTING;
+ } else if (srv_read_only_mode) {
+ create_flag = OPEN_EXISTING;
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+ /* Translate access_type into access rights and sharing mode. */
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (srv_read_only_mode) {
+ access = GENERIC_READ;
+ } else if (access_type == OS_FILE_READ_WRITE
+ || access_type == OS_FILE_READ_WRITE_CACHED) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+
+ ut_a(!srv_read_only_mode);
+
+ access = GENERIC_READ;
+
+ /*!< A backup program has to give mysqld the maximum
+ freedom to do what it likes with the file */
+
+ share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file access type (%lu) for file '%s'",
+ access_type, name);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+ file.m_file = CreateFile((LPCTSTR) name,
+ access,
+ share_mode,
+ NULL, // Security attributes
+ create_flag,
+ attributes,
+ NULL); // No template file
+
+ *success = (file.m_file != INVALID_HANDLE_VALUE);
+#else /* __WIN__ */
+ int create_flag;
+ const char* mode_str = NULL;
+ ut_a(name);
+ /* WAIT_ALLOW_WRITES() is invoked for every mode other than the two
+ pure open modes. */
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
+
+ /* The error-handling modifier bits are not accepted by this function. */
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ /* Translate create_mode and access_type into open(2) flags; mode_str
+ is the operation name handed to the O_DIRECT diagnostics below. */
+ if (create_mode == OS_FILE_OPEN) {
+
+ mode_str = "OPEN";
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ create_flag = O_RDONLY;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else {
+
+ ut_a(access_type == OS_FILE_READ_WRITE
+ || access_type == OS_FILE_READ_ALLOW_DELETE
+ || access_type == OS_FILE_READ_WRITE_CACHED);
+
+ create_flag = O_RDWR;
+ }
+
+ } else if (srv_read_only_mode) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+ file.m_file = -1;
+ return(file);
+ }
+
+ /* O_CLOEXEC asks for the descriptor to be closed automatically when
+ the process calls exec(). */
- file.m_file = ::open(name, create_flag, os_innodb_umask);
++ file.m_file = ::open(name, create_flag | O_CLOEXEC , os_innodb_umask);
+
+ *success = file.m_file == -1 ? FALSE : TRUE;
+
+ /* This function is always called for data files, we should disable
+ OS caching (O_DIRECT) here as we do in os_file_create_func(), so
+ we open the same file in the same mode, see man page of open(2). */
+ if (*success) {
+ os_file_set_nocache_if_needed(file.m_file, name, mode_str,
+ OS_DATA_FILE, access_type);
+ }
+
+#ifdef USE_FILE_LOCK
+ /* A successful read-write open must also obtain the file lock; a
+ non-zero result from os_file_lock() is treated as failure and the
+ descriptor is closed again. */
+ if (!srv_read_only_mode
+ && *success
+ && (access_type == OS_FILE_READ_WRITE
+ || access_type == OS_FILE_READ_WRITE_CACHED)
+ && os_file_lock(file.m_file, name)) {
+
+ *success = FALSE;
+ close(file.m_file);
+ file.m_file = -1;
+
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. A failure is
+logged (the EINVAL warning at most once per process) and otherwise
+tolerated. On platforms offering neither directio() nor O_DIRECT nothing
+is done and true is returned.
+@return TRUE if operation is success and FALSE otherwise */
+UNIV_INTERN
+bool
+os_file_set_nocache(
+/*================*/
+	os_file_t	fd		/*!< in: file descriptor to alter */
+		MY_ATTRIBUTE((unused)),
+	const char*	file_name	/*!< in: used in the diagnostic
+					message */
+		MY_ATTRIBUTE((unused)),
+	const char*	operation_name MY_ATTRIBUTE((unused)))
+					/*!< in: "open" or "create"; used
+					in the diagnostic message */
+{
+	/* some versions of Solaris may not have DIRECTIO_ON */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+	if (directio(fd, DIRECTIO_ON) == -1) {
+		int	errno_save = errno;
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
+			"continuing anyway.",
+			file_name, operation_name, strerror(errno_save));
+		return false;
+	}
+#elif defined(O_DIRECT)
+	/* F_SETFL replaces the whole set of file status flags, so O_DIRECT
+	is OR-ed into the flags currently in effect instead of being
+	passed alone (which would clear e.g. O_APPEND or O_NONBLOCK). */
+	const int	cur_flags = fcntl(fd, F_GETFL);
+
+	if (cur_flags == -1
+	    || fcntl(fd, F_SETFL, cur_flags | O_DIRECT) == -1) {
+		int		errno_save = errno;
+		static bool	warning_message_printed = false;
+		if (errno_save == EINVAL) {
+			/* EINVAL is reported only once; later occurrences
+			are silent. */
+			if (!warning_message_printed) {
+				warning_message_printed = true;
+# ifdef UNIV_LINUX
+				ib_logf(IB_LOG_LEVEL_WARN,
+					"Failed to set O_DIRECT on file "
+					"%s: %s: %s, continuing anyway. "
+					"O_DIRECT is known to result "
+					"in 'Invalid argument' on Linux on "
+					"tmpfs, see MySQL Bug#26662.",
+					file_name, operation_name,
+					strerror(errno_save));
+# else /* UNIV_LINUX */
+				goto short_warning;
+# endif /* UNIV_LINUX */
+			}
+		} else {
+# ifndef UNIV_LINUX
+short_warning:
+# endif
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Failed to set O_DIRECT on file %s: %s: %s, "
+				"continuing anyway.",
+				file_name, operation_name, strerror(errno_save));
+		}
+		return false;
+	}
+#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
+	return true;
+}
+
+
+/****************************************************************//**
+Tries to switch on the atomic write feature for the given file handle.
+Where DFS_IOCTL_ATOMIC_WRITE_SET is not available at build time the call
+always fails and logs an error.
+@return TRUE if success */
+static MY_ATTRIBUTE((warn_unused_result))
+ibool
+os_file_set_atomic_writes(
+/*======================*/
+	const char*	name	/*!< in: name of the file */
+		MY_ATTRIBUTE((unused)),
+	os_file_t	file	/*!< in: handle to the file */
+		MY_ATTRIBUTE((unused)))
+
+{
+#ifdef DFS_IOCTL_ATOMIC_WRITE_SET
+	int	enable = 1;
+
+	if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &enable) == 0) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error_no_exit(name, "ioctl", FALSE);
+
+	return(FALSE);
+#else
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"trying to enable atomic writes on non-supported platform! "
+		"Please restart with innodb_use_atomic_writes disabled.");
+
+	return(FALSE);
+#endif
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new. create_mode may carry the
+OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT modifier bits; they
+select the error handler and are stripped before the mode is interpreted.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+pfs_os_file_t
+os_file_create_func(
+/*================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ pfs_os_file_t file;
+ ibool retry;
+ ibool on_error_no_exit;
+ ibool on_error_silent;
+ /* Debug hook: simulate a "disk full" failure of the create call. */
+#ifdef __WIN__
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = FALSE;
+ SetLastError(ERROR_DISK_FULL);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ );
+#else /* __WIN__ */
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = FALSE;
+ errno = ENOSPC;
+ file.m_file = -1;
+ return(file);
+ );
+#endif /* __WIN__ */
+
+#ifdef __WIN__
+ DWORD create_flag;
+ DWORD share_mode = FILE_SHARE_READ;
+
+ /* Remember the error-handling modifiers, then strip them so that
+ create_mode can be compared against the plain mode values. */
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? TRUE : FALSE;
+
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? TRUE : FALSE;
+
+ create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+ create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
+ /* Translate create_mode into a CreateFile() creation disposition. */
+ if (create_mode == OS_FILE_OPEN_RAW) {
+
+ ut_a(!srv_read_only_mode);
+
+ create_flag = OPEN_EXISTING;
+
+ /* On Windows Physical devices require admin privileges and
+ have to have the write-share mode set. See the remarks
+ section for the CreateFile() function documentation in MSDN. */
+
+ share_mode |= FILE_SHARE_WRITE;
+
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ create_flag = CREATE_ALWAYS;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+ DWORD attributes = 0;
+
+#ifdef UNIV_HOTBACKUP
+ attributes |= FILE_FLAG_NO_BUFFERING;
+#else
+ if (purpose == OS_FILE_AIO) {
+
+#ifdef WIN_ASYNC_IO
+ /* If specified, use asynchronous (overlapped) io and no
+ buffering of writes in the OS */
+
+ if (srv_use_native_aio) {
+ attributes |= FILE_FLAG_OVERLAPPED;
+ }
+#endif /* WIN_ASYNC_IO */
+
+ } else if (purpose == OS_FILE_NORMAL) {
+ /* Use default setting. */
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown purpose flag (%lu) while opening file '%s'",
+ purpose, name);
+ file.m_file = (os_file_t)-1;
+ return(file);
+ }
+
+#ifdef UNIV_NON_BUFFERED_IO
+ // TODO: Create a bug, this looks wrong. The flush log
+ // parameter is dynamic.
+ if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
+
+ /* Do not use unbuffered i/o for the log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+
+ } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
+
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+#endif /* UNIV_NON_BUFFERED_IO */
+
+#endif /* UNIV_HOTBACKUP */
+ DWORD access = GENERIC_READ;
+
+ if (!srv_read_only_mode) {
+ access |= GENERIC_WRITE;
+ }
+
+ if (type == OS_LOG_FILE) {
+ if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+ /* Map O_DSYNC to WRITE_THROUGH */
+ attributes |= FILE_FLAG_WRITE_THROUGH;
+ } else if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+ /* Open log file without buffering */
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+ }
+
+ do {
+ /* Use default security attributes and no template file. */
+ file.m_file = CreateFile(
+ (LPCTSTR) name, access, share_mode, NULL,
+ create_flag, attributes, NULL);
+
+ if (file.m_file == INVALID_HANDLE_VALUE) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !srv_read_only_mode)
+ ? "create" : "open";
+
+ *success = FALSE;
+
+ /* The selected handler decides whether the open is
+ attempted again. */
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = TRUE;
+ retry = FALSE;
+ /* A handle opened for overlapped I/O is attached to the
+ completion port. */
+ if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) {
+ ut_a(CreateIoCompletionPort(file.m_file, completion_port, 0, 0));
+ }
+ }
+
+ } while (retry);
+
+ /* NOTE(review): unlike the POSIX branch below, this step is not
+ guarded by a successful open, so os_file_set_atomic_writes() and
+ CloseHandle() can be reached with INVALID_HANDLE_VALUE - confirm. */
+ if (srv_use_atomic_writes && type == OS_DATA_FILE &&
+ !os_file_set_atomic_writes(name, file.m_file)) {
+ CloseHandle(file.m_file);
+ *success = FALSE;
+ file.m_file = INVALID_HANDLE_VALUE;
+ }
+
+#else /* __WIN__ */
+ int create_flag;
+ const char* mode_str = NULL;
+ /* WAIT_ALLOW_WRITES() is invoked for every mode other than the two
+ pure open modes; note that the modifier bits are still part of
+ create_mode at this point. */
+ if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
+ WAIT_ALLOW_WRITES();
+
+ /* Remember the error-handling modifiers, then strip them so that
+ create_mode can be compared against the plain mode values. */
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? TRUE : FALSE;
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? TRUE : FALSE;
+
+ create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+ create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
+ /* Translate create_mode into open(2) flags; mode_str is the
+ operation name handed to the O_DIRECT diagnostics below. */
+ if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ mode_str = "OPEN";
+
+ create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
+
+ } else if (srv_read_only_mode) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ mode_str = "OVERWRITE";
+ create_flag = O_RDWR | O_CREAT | O_TRUNC;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ file.m_file = -1;
+ return(file);
+ }
+
+ ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
+ ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
+
+#ifdef O_SYNC
+ /* We let O_SYNC only affect log files; note that we map O_DSYNC to
+ O_SYNC because the datasync options seemed to corrupt files in 2001
+ in both Linux and Solaris */
+
+ if (!srv_read_only_mode
+ && type == OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+
+ create_flag |= O_SYNC;
+ }
+#endif /* O_SYNC */
+
+ do {
+ /* O_CLOEXEC asks for the descriptor to be closed automatically
+ when the process calls exec(). */
- file.m_file = ::open(name, create_flag, os_innodb_umask);
++ file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
+
+ if (file.m_file == -1) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !srv_read_only_mode)
+ ? "create" : "open";
+
+ *success = FALSE;
+
+ /* The selected handler decides whether the open is
+ attempted again. */
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+ if (*success) {
+
+ /* access_type 0 is passed here, so the decision depends on
+ the file type and the configured flush method. */
+ os_file_set_nocache_if_needed(file.m_file, name, mode_str,
+ type, 0);
+ }
+
+#ifdef USE_FILE_LOCK
+ /* Every successfully opened file except a raw device must also be
+ locked; a non-zero result from os_file_lock() means the lock was
+ not obtained. */
+ if (!srv_read_only_mode
+ && *success
+ && create_mode != OS_FILE_OPEN_RAW
+ && os_file_lock(file.m_file, name)) {
+
+ if (create_mode == OS_FILE_OPEN_RETRY) {
+
+ ut_a(!srv_read_only_mode);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Retrying to lock the first data file");
+
+ /* Up to 100 further attempts, one second apart. */
+ for (int i = 0; i < 100; i++) {
+ os_thread_sleep(1000000);
+
+ if (!os_file_lock(file.m_file, name)) {
+ *success = TRUE;
+ /* NOTE(review): this early return bypasses
+ the atomic-writes setup further down -
+ confirm that this is intended. */
+ return(file);
+ }
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Unable to open the first data file");
+ }
+
+ *success = FALSE;
+ close(file.m_file);
+ file.m_file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+ /* Enabling atomic writes is mandatory for data files when it is
+ configured: if it cannot be enabled, the open is reported as failed. */
+ if (srv_use_atomic_writes && type == OS_DATA_FILE
+ && file.m_file != -1
+ && !os_file_set_atomic_writes(name, file.m_file)) {
+
+ *success = FALSE;
+ close(file.m_file);
+ file.m_file = -1;
+ }
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/***********************************************************************//**
+Deletes a file if it exists; a missing file counts as success. The file
+has to be closed before calling this. On Windows the deletion is retried
+every half second and given up after more than 2000 failed attempts.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_if_exists_func(
+/*==========================*/
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
+{
+#ifdef __WIN__
+	for (ulint n_failed = 1;; n_failed++) {
+		/* In Windows, deleting an .ibd file may fail if mysqlbackup
+		is copying it */
+
+		if (DeleteFile((LPCTSTR) name)) {
+
+			return(true);
+		}
+
+		const DWORD	err = GetLastError();
+
+		if (err == ERROR_FILE_NOT_FOUND
+		    || err == ERROR_PATH_NOT_FOUND) {
+			/* the file does not exist, this is not an error */
+
+			return(true);
+		}
+
+		/* Start warning after 100 failures, then on every tenth. */
+		if (n_failed > 100 && n_failed % 10 == 0) {
+			os_file_get_last_error(true); /* print error information */
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Delete of file %s failed.", name);
+		}
+
+		os_thread_sleep(500000);	/* sleep for 0.5 second */
+
+		if (n_failed > 2000) {
+
+			return(false);
+		}
+	}
+#else
+	WAIT_ALLOW_WRITES();
+
+	/* errno is only inspected when unlink() has failed. */
+	if (unlink(name) == 0 || errno == ENOENT) {
+
+		return(true);
+	}
+
+	os_file_handle_error_no_exit(name, "delete", FALSE);
+
+	return(false);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Deletes a file; a missing file is a failure. The file has to be closed
+before calling this. On Windows the deletion is retried once per second
+and given up after more than 2000 failed attempts.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_func(
+/*================*/
+	const char*	name)	/*!< in: file path as a null-terminated
+				string */
+{
+#ifdef __WIN__
+	for (ulint n_failed = 1;; n_failed++) {
+		/* In Windows, deleting an .ibd file may fail if mysqlbackup
+		is copying it */
+
+		if (DeleteFile((LPCTSTR) name)) {
+
+			return(true);
+		}
+
+		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+			/* If the file does not exist, we classify this as
+			a 'mild' error and return */
+
+			return(false);
+		}
+
+		/* Start warning after 100 failures, then on every tenth. */
+		if (n_failed > 100 && n_failed % 10 == 0) {
+			os_file_get_last_error(true); /* print error information */
+
+			fprintf(stderr,
+				"InnoDB: Warning: cannot delete file %s\n"
+				"InnoDB: Are you running mysqlbackup"
+				" to back up the file?\n", name);
+		}
+
+		os_thread_sleep(1000000);	/* sleep for a second */
+
+		if (n_failed > 2000) {
+
+			return(false);
+		}
+	}
+#else
+	WAIT_ALLOW_WRITES();
+
+	if (unlink(name) == 0) {
+
+		return(true);
+	}
+
+	os_file_handle_error_no_exit(name, "delete", FALSE);
+
+	return(false);
+#endif
+}
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_rename(), not directly this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function. Debug builds assert that the
+old path exists and that the new path does not.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename_func(
+/*================*/
+	const char*	oldpath,/*!< in: old file path as a null-terminated
+				string */
+	const char*	newpath)/*!< in: new file path */
+{
+#ifdef UNIV_DEBUG
+	os_file_type_t	path_type;
+	ibool		path_exists;
+
+	/* New path must not exist. */
+	ut_ad(os_file_status(newpath, &path_exists, &path_type));
+	ut_ad(!path_exists);
+
+	/* Old path must exist. */
+	ut_ad(os_file_status(oldpath, &path_exists, &path_type));
+	ut_ad(path_exists);
+#endif /* UNIV_DEBUG */
+
+#ifdef __WIN__
+	const bool	renamed = MoveFileEx((LPCTSTR)oldpath,
+					     (LPCTSTR)newpath,
+					     MOVEFILE_REPLACE_EXISTING) != 0;
+#else
+	WAIT_ALLOW_WRITES();
+
+	const bool	renamed = (rename(oldpath, newpath) == 0);
+#endif /* __WIN__ */
+
+	if (renamed) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+
+	return(FALSE);
+}
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_close(), not directly this function!
+Closes a file handle. A failure is passed to os_file_handle_error(); the
+error number can be retrieved with os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_func(
+/*===============*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	const bool	closed = (CloseHandle(file) != 0);
+#else
+	const bool	closed = (close(file) != -1);
+#endif /* __WIN__ */
+
+	if (closed) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error(NULL, "close");
+
+	return(FALSE);
+}
+
+/***********************************************************************//**
+Closes a file handle without invoking any error handler on failure.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_close_no_error_handling_func(
+/*============================*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	/* CloseHandle() returns non-zero on success. */
+	return(CloseHandle(file) ? true : false);
+#else
+	/* close() returns -1 on failure. */
+	return(close(file) != -1);
+#endif /* __WIN__ */
+}
+
+#ifdef HAVE_POSIX_FALLOCATE
+/***********************************************************************//**
+Makes sure that disk space is allocated for a region of the file, using
+posix_fallocate().
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_allocate_func(
+	os_file_t	file,	/*!< in, own: handle to a file */
+	os_offset_t	offset,	/*!< in: file region offset */
+	os_offset_t	len)	/*!< in: file region length */
+{
+	/* posix_fallocate() returns 0 on success and an error number
+	otherwise. */
+	const int	err = posix_fallocate(file, offset, len);
+
+	return(err == 0);
+}
+#endif
+
+/***********************************************************************//**
+Tells whether the handle carries the os_file_invalid marker value.
+@return TRUE if invalid */
+UNIV_INTERN
+bool
+os_file_is_invalid(
+	pfs_os_file_t	file)	/*!< in, own: handle to a file */
+{
+	const bool	invalid = (os_file_invalid == file.m_file);
+
+	return(invalid);
+}
+
+/***********************************************************************//**
+Stores the os_file_invalid marker value in the handle. */
+UNIV_INTERN
+void
+os_file_mark_invalid(
+	pfs_os_file_t*	file)	/*!< out: pointer to a handle to a file */
+{
+	(*file).m_file = os_file_invalid;
+}
+
+/***********************************************************************//**
+Announces an intention to access file data in a specific pattern in the
+future. advice is a set of OS_FILE_ADVISE_* bits; on Linux each requested
+bit is passed to posix_fadvise() in a call of its own. On other platforms
+nothing is done.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_advise(
+	pfs_os_file_t	file,	/*!< in, own: handle to a file */
+	os_offset_t	offset,	/*!< in: file region offset */
+	os_offset_t	len,	/*!< in: file region length */
+	ulint		advice)/*!< in: advice for access pattern */
+{
+#ifdef __WIN__
+	return(true);
+#else
+#ifdef UNIV_LINUX
+	/* The POSIX_FADV_* constants are enumerated values, not bit masks:
+	OR-ing two of them produces a different, unrelated advice. Hence
+	every requested hint gets a posix_fadvise() call of its own. */
+	static const struct {
+		ulint	flag;	/* OS_FILE_ADVISE_* bit */
+		int	native;	/* matching POSIX_FADV_* value */
+	} hints[] = {
+		{OS_FILE_ADVISE_NORMAL,		POSIX_FADV_NORMAL},
+		{OS_FILE_ADVISE_RANDOM,		POSIX_FADV_RANDOM},
+		{OS_FILE_ADVISE_SEQUENTIAL,	POSIX_FADV_SEQUENTIAL},
+		{OS_FILE_ADVISE_WILLNEED,	POSIX_FADV_WILLNEED},
+		{OS_FILE_ADVISE_DONTNEED,	POSIX_FADV_DONTNEED},
+		{OS_FILE_ADVISE_NOREUSE,	POSIX_FADV_NOREUSE}
+	};
+
+	bool	requested = false;
+	bool	ok = true;
+
+	for (size_t i = 0; i < sizeof(hints) / sizeof(hints[0]); i++) {
+
+		if ((advice & hints[i].flag) == 0) {
+			continue;
+		}
+
+		requested = true;
+
+		/* posix_fadvise() returns 0 on success. */
+		if (posix_fadvise(file.m_file, offset, len,
+				  hints[i].native) != 0) {
+			ok = false;
+		}
+	}
+
+	if (!requested) {
+		/* No known bit set: fall back to the default advice, as
+		the single combined call did before. */
+		ok = (posix_fadvise(file.m_file, offset, len,
+				    POSIX_FADV_NORMAL) == 0);
+	}
+
+	return(ok);
+#else
+	return(true);
+#endif
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Determines the size of a file. On non-Windows platforms this is done with
+lseek(SEEK_END), which moves the file offset of the descriptor to the end
+of the file.
+@return file size, or (os_offset_t) -1 on failure */
+UNIV_INTERN
+os_offset_t
+os_file_get_size(
+/*=============*/
+	pfs_os_file_t	file)	/*!< in: handle to a file */
+{
+#ifdef __WIN__
+	DWORD		size_high;
+	const DWORD	size_low = GetFileSize(file.m_file, &size_high);
+
+	/* 0xFFFFFFFF can be a legitimate low word, so it only signals a
+	failure when the last-error code is set as well. */
+	if (size_low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+		return((os_offset_t) -1);
+	}
+
+	/* Combine the two 32-bit halves into one 64-bit size. */
+	return((os_offset_t) size_low | ((os_offset_t) size_high << 32));
+#else
+	const off_t	end_pos = lseek(file.m_file, 0, SEEK_END);
+
+	return((os_offset_t) end_pos);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Extends a newly created file to the given size. When posix_fallocate() is
+available and enabled, the space is preallocated with it. Otherwise zero
+bytes are written: on Windows a single zero-filled page at the desired end
+of the file, elsewhere the whole file in chunks of up to 64 pages. After
+the writes the file is flushed.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	pfs_os_file_t	file,	/*!< in: handle to a file */
+	os_offset_t	size)	/*!< in: file size */
+{
+	ibool	ret = TRUE;
+	byte*	buf;
+	byte*	buf2;
+	ulint	buf_size;
+
+#ifdef HAVE_POSIX_FALLOCATE
+	if (srv_use_posix_fallocate) {
+		int	err;
+
+		/* Retry interrupted calls unless the server is shutting
+		down. */
+		do {
+			err = posix_fallocate(file.m_file, 0, size);
+		} while (err == EINTR
+			 && srv_shutdown_state == SRV_SHUTDOWN_NONE);
+
+		if (err) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"preallocating " INT64PF " bytes for "
+				"file %s failed with error %d",
+				size, name, err);
+		}
+		return(!err);
+	}
+#endif
+
+#ifdef _WIN32
+	/* Write 1 page of zeroes at the desired end. For a size below one
+	page start at offset 0 instead of letting the unsigned subtraction
+	wrap around. */
+	buf_size = UNIV_PAGE_SIZE;
+	os_offset_t	current_size = size > buf_size ? size - buf_size : 0;
+#else
+	/* Write up to 1 megabyte at a time. */
+	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
+		* UNIV_PAGE_SIZE;
+
+	if (buf_size == 0) {
+		/* size is below one page: a zero-length buffer would make
+		the write loop below spin without ever advancing. */
+		buf_size = UNIV_PAGE_SIZE;
+	}
+
+	os_offset_t	current_size = 0;
+#endif
+	buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE));
+
+	if (!buf2) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot allocate " ULINTPF " bytes to extend file\n",
+			buf_size + UNIV_PAGE_SIZE);
+		return(FALSE);
+	}
+
+	/* Align the buffer for possible raw i/o */
+	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
+
+	while (current_size < size) {
+		ulint	n_bytes;
+
+		/* The last chunk may be shorter than the buffer. */
+		if (size - current_size < (os_offset_t) buf_size) {
+			n_bytes = (ulint) (size - current_size);
+		} else {
+			n_bytes = buf_size;
+		}
+
+		ret = os_file_write(name, file, buf, current_size, n_bytes);
+		if (!ret) {
+			break;
+		}
+
+		current_size += n_bytes;
+	}
+
+	free(buf2);
+
+	return(ret && os_file_flush(file));
+}
+
+/***********************************************************************//**
+Truncates a file at its current position.
+The stdio stream is converted to the native handle (Windows) or file
+descriptor (elsewhere) and the file is cut at the position reported for
+the stream.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+	FILE*	file)	/*!< in: file to be truncated */
+{
+#ifdef __WIN__
+	const HANDLE	native_handle
+		= (HANDLE) _get_osfhandle(fileno(file));
+
+	return(SetEndOfFile(native_handle));
+#else /* __WIN__ */
+	WAIT_ALLOW_WRITES();
+
+	const int	fd = fileno(file);
+	const long	position = ftell(file);
+
+	return(!ftruncate(fd, position));
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Truncates a file at the specified position.
+On Windows the file pointer is first moved to new_len and the end of file
+is then set there; elsewhere ftruncate() is called with new_len.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_set_eof_at_func(
+	os_file_t	file,	/*!< in: handle to a file */
+	ib_uint64_t	new_len)/*!< in: new file length */
+{
+#ifdef __WIN__
+	LARGE_INTEGER	target;
+	LARGE_INTEGER	reached;
+
+	target.QuadPart = new_len;
+
+	if (!SetFilePointerEx(file, target, &reached, FILE_BEGIN)) {
+		return(false);
+	}
+
+	return(SetEndOfFile(file) != 0);
+#else
+	WAIT_ALLOW_WRITES();
+	/* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */
+	const int	rc = ftruncate(file, new_len);
+
+	return(rc == 0);
+#endif
+}
+
+
+#ifndef __WIN__
+/***********************************************************************//**
+Wrapper to fsync(2) that retries the call on some errors.
+The call is repeated for as long as it fails with ENOLCK (after a 0.2 second
+sleep, with a message printed on every 100th such failure, starting with the
+first) or with EINTR (immediately). Every attempt increments os_n_fsyncs.
+@return 0 if success, -1 otherwise; errno is the one set by fsync() */
+
+static
+int
+os_file_fsync(
+/*==========*/
+	os_file_t	file)	/*!< in: handle to a file */
+{
+	int	n_lock_failures = 0;
+
+	for (;;) {
+		const int	rc = fsync(file);
+
+		os_n_fsyncs++;
+
+		if (rc != -1) {
+			/* Not a failure: nothing to retry. */
+			return(rc);
+		}
+
+		if (errno == ENOLCK) {
+
+			if (n_lock_failures % 100 == 0) {
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: fsync(): "
+					"No locks available; retrying\n");
+			}
+
+			os_thread_sleep(200000 /* 0.2 sec */);
+
+			n_lock_failures++;
+		} else if (errno != EINTR) {
+			/* Any other error is final. */
+			return(rc);
+		}
+
+		/* ENOLCK or EINTR (signal interruption): try again. */
+	}
+}
+#endif /* !__WIN__ */
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_flush(), not directly this function!
+Flushes the write buffers of a given file to the disk.
+
+On Windows FlushFileBuffers() is used. Elsewhere os_file_fsync() is used,
+except that with HAVE_DARWIN_THREADS and srv_have_fullfsync set,
+fcntl(F_FULLFSYNC) is tried first and os_file_fsync() is the fallback.
+A failure is tolerated only when srv_start_raw_disk_in_use is set and the
+error is the one the OS reports for raw devices; any other failure is
+reported through os_file_handle_error() and then treated as fatal
+(ut_error).
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush_func(
+/*===============*/
+ os_file_t file) /*!< in: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ /* Count the flush attempt */
+ os_n_fsyncs++;
+
+ ret = FlushFileBuffers(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+ actually a raw device, we choose to ignore that error if we are using
+ raw disks */
+
+ if (srv_start_raw_disk_in_use && GetLastError()
+ == ERROR_INVALID_FUNCTION) {
+ return(TRUE);
+ }
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ /* Follows ut_error; kept so that every path has a return value */
+ return(FALSE);
+#else
+ int ret;
+ WAIT_ALLOW_WRITES();
+
+#if defined(HAVE_DARWIN_THREADS)
+# ifndef F_FULLFSYNC
+ /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
+# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
+# elif F_FULLFSYNC != 51
+# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
+# endif
+ /* Apple has disabled fsync() for internal disk drives in OS X. That
+ caused corruption for a user when he tested a power outage. Let us in
+ OS X use a nonstandard flush method recommended by an Apple
+ engineer. */
+
+ if (!srv_have_fullfsync) {
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+ ret = os_file_fsync(file);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+ ret = os_file_fsync(file);
+ }
+ }
+#else
+ /* os_file_fsync() itself retries on ENOLCK and EINTR */
+ ret = os_file_fsync(file);
+#endif
+
+ if (ret == 0) {
+ return(TRUE);
+ }
+
+ /* Since Linux returns EINVAL if the 'file' is actually a raw device,
+ we choose to ignore that error if we are using raw disks */
+
+ if (srv_start_raw_disk_in_use && errno == EINVAL) {
+
+ return(TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ /* Follows ut_error; kept so that every path has a return value */
+ return(FALSE);
+#endif
+}
+
+#ifndef __WIN__
+/*******************************************************************//**
+Does a synchronous read operation in Posix.
+
+Short reads and EINTR are retried until n bytes have been read, a read call
+returns 0 or a read call fails with another error. When trx is not NULL and
+trx->take_stats is set, the read is added to the I/O counters of trx and the
+elapsed time to trx->io_reads_wait_timer.
+@return number of bytes read (possibly fewer than n if a read call returned
+0 or failed); -1 if the offset does not fit in off_t or, in the lseek()
+fallback, if the seek failed */
+static MY_ATTRIBUTE((nonnull(2), warn_unused_result))
+ssize_t
+os_file_pread(
+/*==========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	ulint		n,	/*!< in: number of bytes to read */
+	os_offset_t	offset,	/*!< in: file offset from where to read */
+	trx_t*		trx)	/*!< in: transaction whose I/O statistics
+				are updated, or NULL */
+{
+	off_t		offs;
+	ulint		sec;
+	ulint		ms;
+	ib_uint64_t	start_time;
+	ib_uint64_t	finish_time;
+
+	ut_ad(n);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit address */
+	offs = (off_t) offset;
+
+	if (sizeof(off_t) <= 4) {
+		if (offset != (os_offset_t) offs) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"File read at offset > 4 GB");
+			/* The offset was truncated by the cast. Reading
+			at the truncated offset would return data from
+			the wrong place, so fail instead. */
+			errno = EOVERFLOW;
+			return(-1);
+		}
+	}
+
+	os_n_file_reads++;
+
+	if (UNIV_UNLIKELY(trx && trx->take_stats))
+	{
+		trx->io_reads++;
+		trx->io_read += n;
+		ut_usectime(&sec, &ms);
+		start_time = (ib_uint64_t)sec * 1000000 + ms;
+	} else {
+		/* 0 doubles as "statistics are off" further below */
+		start_time = 0;
+	}
+
+	const bool	monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+#ifdef HAVE_PREAD
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	ssize_t	n_bytes;
+
+	/* Handle partial reads and signal interruptions correctly */
+	for (n_bytes = 0; n_bytes < (ssize_t) n; ) {
+		ssize_t	n_read = pread(file, buf, (ssize_t)n - n_bytes, offs);
+		if (n_read > 0) {
+			n_bytes += n_read;
+			offs += n_read;
+			buf = (char *)buf + n_read;
+		} else if (n_read == -1 && errno == EINTR) {
+			continue;
+		} else {
+			break;
+		}
+	}
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	if (UNIV_UNLIKELY(start_time != 0))
+	{
+		ut_usectime(&sec, &ms);
+		finish_time = (ib_uint64_t)sec * 1000000 + ms;
+		trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+	}
+
+	return(n_bytes);
+#else
+	{
+	off_t		ret_offset;
+	ssize_t		ret;
+	ssize_t		n_read;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / read operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret_offset = lseek(file, offs, SEEK_SET);
+
+	if (ret_offset < 0) {
+		ret = -1;
+	} else {
+		/* Handle partial reads and signal interruptions
+		correctly: continue behind the bytes already read and
+		ask only for what is still missing, so that the buffer
+		is never written past its n bytes. */
+		for (ret = 0; ret < (ssize_t) n; ) {
+			n_read = read(file, (char *)buf + ret,
+				      (ssize_t)n - ret);
+			if (n_read > 0) {
+				ret += n_read;
+			} else if (n_read == -1 && errno == EINTR) {
+				continue;
+			} else {
+				break;
+			}
+		}
+	}
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	if (UNIV_UNLIKELY(start_time != 0))
+	{
+		ut_usectime(&sec, &ms);
+		finish_time = (ib_uint64_t)sec * 1000000 + ms;
+		trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+	}
+
+	return(ret);
+	}
+#endif
+}
+
+/*******************************************************************//**
+Does a synchronous write operation in Posix.
+
+Short writes and EINTR are retried until n bytes have been written or a
+write call makes no progress or fails with another error.
+@return number of bytes written (possibly fewer than n if a write call
+wrote nothing or failed); -1 if the offset does not fit in off_t or, in the
+lseek() fallback, if the seek failed */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+ssize_t
+os_file_pwrite(
+/*===========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from where to write */
+	ulint		n,	/*!< in: number of bytes to write */
+	os_offset_t	offset)	/*!< in: file offset where to write */
+{
+	ssize_t		ret;
+	ssize_t		n_written;
+	off_t		offs;
+
+	ut_ad(n);
+	ut_ad(!srv_read_only_mode);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit address */
+	offs = (off_t) offset;
+
+	if (sizeof(off_t) <= 4) {
+		if (offset != (os_offset_t) offs) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"File write at offset > 4 GB.");
+			/* The offset was truncated by the cast. Writing
+			at the truncated offset would overwrite unrelated
+			data, so fail instead. */
+			errno = EOVERFLOW;
+			return(-1);
+		}
+	}
+
+	os_n_file_writes++;
+
+	const bool	monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
+#ifdef HAVE_PWRITE
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+	/* Handle partial writes and signal interruptions correctly */
+	for (ret = 0; ret < (ssize_t) n; ) {
+		n_written = pwrite(file, buf, (ssize_t)n - ret, offs);
+		DBUG_EXECUTE_IF("xb_simulate_all_o_direct_write_failure",
+				n_written = -1;
+				errno = EINVAL;);
+		if (n_written > 0) {
+			ret += n_written;
+			offs += n_written;
+			buf = (char *)buf + n_written;
+		} else if (n_written == -1 && errno == EINTR) {
+			continue;
+		} else {
+			/* Error, or a call that wrote nothing. Stop here:
+			repeating a zero-length write could loop forever.
+			The caller sees the short count. */
+			break;
+		}
+	}
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+	return(ret);
+#else
+	{
+	off_t		ret_offset;
+# ifndef UNIV_HOTBACKUP
+	ulint		i;
+# endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+# ifndef UNIV_HOTBACKUP
+	/* Protect the seek / write operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+	ret_offset = lseek(file, offs, SEEK_SET);
+
+	if (ret_offset < 0) {
+		ret = -1;
+	} else {
+		/* Handle partial writes and signal interruptions
+		correctly: continue behind the bytes already written and
+		send only what is still missing, so that nothing beyond
+		the n bytes of the buffer is read or written. */
+		for (ret = 0; ret < (ssize_t) n; ) {
+			n_written = write(file, (const char *)buf + ret,
+					  (ssize_t)n - ret);
+			if (n_written > 0) {
+				ret += n_written;
+			} else if (n_written == -1 && errno == EINTR) {
+				continue;
+			} else {
+				break;
+			}
+		}
+	}
+
+# ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+	return(ret);
+	}
+#endif /* HAVE_PWRITE */
+}
+#endif
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous positioned read operation.
+
+The read counts as successful only when exactly n bytes were read. On
+failure os_file_handle_error() decides whether the read is retried; if it
+is not, a message is printed to stderr and ut_error is raised.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_func(
+/*==============*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n, /*!< in: number of bytes to read */
+ trx_t* trx) /*!< in: passed on to os_file_pread() in the
+ non-Windows branch; not referenced in the
+ Windows branch */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ ibool retry;
+ OVERLAPPED overlapped;
+
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n);
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+ const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+try_again:
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+ /* The file position is passed in the OVERLAPPED structure, split
+ into its low and high 32 bits */
+ memset (&overlapped, 0, sizeof (overlapped));
+ overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+ overlapped.OffsetHigh = (DWORD)(offset >> 32);
+ overlapped.hEvent = win_get_syncio_event();
+ ret = ReadFile(file, buf, n, NULL, &overlapped);
+ if (ret) {
+ /* Completed at once: only fetch the byte count */
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
+ }
+ else if(GetLastError() == ERROR_IO_PENDING) {
+ /* Still in progress: wait for completion (last argument TRUE) */
+ ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
+ }
+ MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+ /* len is only examined when ret is true */
+ if (ret && len == n) {
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset, trx);
+
+ /* Debug-build hook that simulates a failed read */
+ DBUG_EXECUTE_IF("xb_simulate_all_o_direct_read_failure",
+ ret = -1;
+ errno = EINVAL;);
+
+ if ((ulint) ret == n) {
+ return(TRUE);
+ } else if (ret == -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in system call pread(). The operating"
+ " system error number is %lu.",(ulint) errno);
+ } else {
+ /* Partial read occurred */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tried to read " ULINTPF " bytes at offset "
+ UINT64PF ". Was only able to read %ld.",
+ n, offset, (lint) ret);
+ }
+#endif /* __WIN__ */
+ /* Common failure path of both branches */
+ retry = os_file_handle_error(NULL, "read");
+
+ if (retry) {
+ goto try_again;
+ }
+
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot read from file."
+ " OS error number %lu.\n",
+#ifdef __WIN__
+ (ulong) GetLastError()
+#else
+ (ulong) errno
+#endif /* __WIN__ */
+ );
+ fflush(stderr);
+
+ ut_error;
+
+ /* Follows ut_error; kept so that every path has a return value */
+ return(FALSE);
+}
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+Requests a synchronous positioned read operation. Unlike
+os_file_read_func(), a failed read never ends in ut_error here: the failure
+is passed to os_file_handle_error_no_exit(), the read is repeated if that
+function asks for a retry, and FALSE is returned otherwise.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling_func(
+/*================================*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	os_offset_t	offset,	/*!< in: file offset where to read */
+	ulint		n)	/*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	ibool		retry;
+	OVERLAPPED	overlapped;
+
+	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+	no more than 32 bits. */
+	ut_a((n & 0xFFFFFFFFUL) == n);
+
+	os_n_file_reads++;
+	os_bytes_read_since_printout += n;
+	const bool	monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+try_again:
+	ut_ad(buf);
+	ut_ad(n > 0);
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	/* The structure is cleared and filled in on every attempt; this is
+	the only place where the offset has to be stored. */
+	memset (&overlapped, 0, sizeof (overlapped));
+	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+	overlapped.OffsetHigh = (DWORD)(offset >> 32);
+	overlapped.hEvent = win_get_syncio_event();
+	ret = ReadFile(file, buf, n, NULL, &overlapped);
+	if (ret) {
+		/* Completed at once: only fetch the byte count */
+		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
+	}
+	else if(GetLastError() == ERROR_IO_PENDING) {
+		/* Still in progress: wait for completion */
+		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
+	}
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	/* len is only examined when ret is true */
+	if (ret && len == n) {
+		return(TRUE);
+	}
+#else /* __WIN__ */
+	ibool		retry;
+	ssize_t		ret;
+
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ret = os_file_pread(file, buf, n, offset, NULL);
+
+	if ((ulint) ret == n) {
+		return(TRUE);
+	} else if (ret == -1) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error in system call pread(). The operating"
+			" system error number is %lu.",(ulint) errno);
+	} else {
+		/* Partial read occurred */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tried to read " ULINTPF " bytes at offset "
+			UINT64PF ". Was only able to read %ld.",
+			n, offset, (lint) ret);
+	}
+#endif /* __WIN__ */
+	/* Common failure path of both branches */
+	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
+
+	if (retry) {
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Rewinds the file to its start, reads at most size - 1 bytes from it into
+str and NUL-terminates what was read. Nothing is done, and str is left
+untouched, when size is 0. The result of the read is not checked for
+errors. This function is mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+	FILE*	file,	/*!< in: file to read from */
+	char*	str,	/*!< in: buffer where to read */
+	ulint	size)	/*!< in: size of buffer */
+{
+	if (size != 0) {
+		rewind(file);
+
+		/* Leave room for the terminating NUL */
+		const size_t	n_read = fread(str, 1, size - 1, file);
+
+		str[n_read] = '\0';
+	}
+}
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly
+this function!
+Requests a synchronous write operation.
+
+The write counts as successful only when exactly n bytes were written. The
+first failure (while os_has_said_disk_full is not yet set) is reported on
+stderr together with the operating system error, and os_has_said_disk_full
+is then set; later failures return FALSE without a message. On Windows a
+write failing with ERROR_LOCK_VIOLATION is retried up to 100 times with one
+second sleeps.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write_func(
+/*===============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from which to write */
+	os_offset_t	offset,	/*!< in: file offset where to write */
+	ulint		n)	/*!< in: number of bytes to write */
+{
+	ut_ad(!srv_read_only_mode);
+
+#ifdef __WIN__
+	BOOL		ret;
+	/* Initialized because the error message below prints len even when
+	neither GetOverlappedResult() call was reached. */
+	DWORD		len = 0;
+	ulint		n_retries	= 0;
+	ulint		err;
+	OVERLAPPED	overlapped;
+	DWORD		saved_error = 0;
+
+	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+	no more than 32 bits. */
+	ut_a((n & 0xFFFFFFFFUL) == n);
+
+	os_n_file_writes++;
+
+	ut_ad(buf);
+	ut_ad(n > 0);
+	const bool	monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
+retry:
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+	/* The file position is passed in the OVERLAPPED structure, split
+	into its low and high 32 bits */
+	memset (&overlapped, 0, sizeof (overlapped));
+	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
+	overlapped.OffsetHigh = (DWORD)(offset >> 32);
+
+	overlapped.hEvent = win_get_syncio_event();
+	ret = WriteFile(file, buf, n, NULL, &overlapped);
+	if (ret) {
+		/* Completed at once: only fetch the byte count */
+		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
+	}
+	else if ( GetLastError() == ERROR_IO_PENDING) {
+		/* Still in progress: wait for completion */
+		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
+	}
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+	if (ret && len == n) {
+
+		return(TRUE);
+	}
+
+	/* If some background file system backup tool is running, then, at
+	least in Windows 2000, we may get here a specific error. Let us
+	retry the operation 100 times, with 1 second waits. */
+
+	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
+
+		os_thread_sleep(1000000);
+
+		n_retries++;
+
+		goto retry;
+	}
+
+	if (!os_has_said_disk_full) {
+		char *winmsg = NULL;
+
+		/* Capture the error code before any further call can
+		replace it */
+		saved_error = GetLastError();
+		err = (ulint) saved_error;
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			" InnoDB: Error: Write to file %s failed"
+			" at offset %llu.\n"
+			"InnoDB: %lu bytes should have been written,"
+			" only %lu were written.\n"
+			"InnoDB: Operating system error number %lu.\n"
+			"InnoDB: Check that your OS and file system"
+			" support files of this size.\n"
+			"InnoDB: Check also that the disk is not full"
+			" or a disk quota exceeded.\n",
+			name, offset,
+			(ulong) n, (ulong) len, (ulong) err);
+
+		/* Ask Windows to prepare a standard message for a
+		GetLastError() */
+
+		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+			      FORMAT_MESSAGE_FROM_SYSTEM |
+			      FORMAT_MESSAGE_IGNORE_INSERTS,
+			      NULL, saved_error,
+			      MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+			      (LPSTR)&winmsg, 0, NULL);
+
+		if (winmsg) {
+			fprintf(stderr,
+				"InnoDB: FormatMessage: Error number %lu means '%s'.\n",
+				(ulong) saved_error, winmsg);
+			LocalFree(winmsg);
+		}
+
+		if (strerror((int) err) != NULL) {
+			fprintf(stderr,
+				"InnoDB: Error number %lu means '%s'.\n",
+				(ulong) err, strerror((int) err));
+		}
+
+		fprintf(stderr,
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n");
+
+		os_has_said_disk_full = TRUE;
+	}
+
+	return(FALSE);
+#else
+	ssize_t	ret;
+	int	saved_errno;
+	WAIT_ALLOW_WRITES();
+
+	ret = os_file_pwrite(file, buf, n, offset);
+
+	if ((ulint) ret == n) {
+
+		return(TRUE);
+	}
+
+	/* Capture errno right away: the reporting calls below may overwrite
+	it before it is printed. */
+	saved_errno = errno;
+
+	if (!os_has_said_disk_full) {
+
+		ut_print_timestamp(stderr);
+
+		if(ret == -1) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Failure of system call pwrite(). Operating"
+				" system error number is %lu.",
+				(ulint) saved_errno);
+		} else {
+			fprintf(stderr,
+				" InnoDB: Error: Write to file %s failed"
+				" at offset " UINT64PF ".\n"
+				"InnoDB: %lu bytes should have been written,"
+				" only %ld were written.\n"
+				"InnoDB: Operating system error number %lu.\n"
+				"InnoDB: Check that your OS and file system"
+				" support files of this size.\n"
+				"InnoDB: Check also that the disk is not full"
+				" or a disk quota exceeded.\n",
+				name, offset, n, (lint) ret,
+				(ulint) saved_errno);
+		}
+
+		if (strerror(saved_errno) != NULL) {
+			fprintf(stderr,
+				"InnoDB: Error number %d means '%s'.\n",
+				saved_errno, strerror(saved_errno));
+		}
+
+		fprintf(stderr,
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n");
+
+		os_diagnose_all_o_direct_einval(saved_errno);
+
+		os_has_said_disk_full = TRUE;
+	}
+
+	return(FALSE);
+#endif
+}
+
+/*******************************************************************//**
+Check the existence and type of the given file.
+
+A stat failure with ENOENT, ENOTDIR or ENAMETOOLONG is reported as "does
+not exist" (*exists = FALSE, *type untouched) and still counts as success.
+Any other stat failure is passed to os_file_handle_error_no_exit() and
+makes the call fail without writing to *exists or *type.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+	const char*	path,	/*!< in: pathname of the file */
+	ibool*		exists,	/*!< out: TRUE if file exists */
+	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
+{
+#ifdef __WIN__
+	struct _stat64	file_info;
+	const int	rc = _stat64(path, &file_info);
+#else
+	struct stat	file_info;
+	const int	rc = stat(path, &file_info);
+#endif
+
+	if (rc) {
+		if (errno == ENOENT || errno == ENOTDIR
+		    || errno == ENAMETOOLONG) {
+			/* file does not exist */
+			*exists = FALSE;
+			return(TRUE);
+		}
+
+		/* file exists, but stat call failed */
+
+		os_file_handle_error_no_exit(path, "stat", FALSE);
+
+		return(FALSE);
+	}
+
+	/* Classify the file by its mode bits */
+#ifdef __WIN__
+	if (_S_IFDIR & file_info.st_mode) {
+		*type = OS_FILE_TYPE_DIR;
+	} else if (_S_IFREG & file_info.st_mode) {
+		*type = OS_FILE_TYPE_FILE;
+	} else {
+		*type = OS_FILE_TYPE_UNKNOWN;
+	}
+#else
+	if (S_ISDIR(file_info.st_mode)) {
+		*type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(file_info.st_mode)) {
+		*type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(file_info.st_mode)) {
+		*type = OS_FILE_TYPE_FILE;
+	} else {
+		*type = OS_FILE_TYPE_UNKNOWN;
+	}
+#endif
+
+	*exists = TRUE;
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+This function returns information about the specified file
+
+stat_info->type, ctime, atime, mtime and size are filled in on success.
+stat_info->rw_perm is written only when check_rw_perm is set and the file
+was classified as OS_FILE_TYPE_FILE; it is left untouched otherwise.
+@return DB_SUCCESS if all OK, DB_NOT_FOUND if stat fails with ENOENT or
+ENOTDIR, DB_FAIL if stat fails with any other error */
+UNIV_INTERN
+dberr_t
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info, /*!< information of a file in a
+ directory */
+ bool check_rw_perm) /*!< in: for testing whether the
+ file can be opened in RW mode */
+{
+ int ret;
+
+#ifdef __WIN__
+ struct _stat64 statinfo;
+
+ ret = _stat64(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(DB_FAIL);
+
+ } else if (_S_IFDIR & statinfo.st_mode) {
+ stat_info->type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+
+ /* Read access always; write access too unless the server runs in
+ read-only mode */
+ DWORD access = GENERIC_READ;
+
+ if (!srv_read_only_mode) {
+ access |= GENERIC_WRITE;
+ }
+
+ stat_info->type = OS_FILE_TYPE_FILE;
+
+ /* Check if we can open it with the access mode chosen above. */
+
+ if (check_rw_perm) {
+ HANDLE fh;
+
+ fh = CreateFile(
+ (LPCTSTR) path, // File to open
+ access,
+ 0, // No sharing
+ NULL, // Default security
+ OPEN_EXISTING, // Existing file only
+ FILE_ATTRIBUTE_NORMAL, // Normal file
+ NULL); // No attr. template
+
+ if (fh == INVALID_HANDLE_VALUE) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ CloseHandle(fh);
+ }
+ }
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+#else
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(DB_FAIL);
+
+ }
+
+ /* Classify the file by the file-type bits of its mode */
+ switch (statinfo.st_mode & S_IFMT) {
+ case S_IFDIR:
+ stat_info->type = OS_FILE_TYPE_DIR;
+ break;
+ case S_IFLNK:
+ stat_info->type = OS_FILE_TYPE_LINK;
+ break;
+ case S_IFBLK:
+ /* Handle block device as regular file. */
+ case S_IFCHR:
+ /* Handle character device as regular file. */
+ case S_IFREG:
+ stat_info->type = OS_FILE_TYPE_FILE;
+ break;
+ default:
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+
+ if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
+
+ int fh;
+ int access;
+
+ /* Read-write unless the server runs in read-only mode */
+ access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
+
- fh = ::open(path, access, os_innodb_umask);
++ fh = ::open(path, access | O_CLOEXEC, os_innodb_umask);
+
+ /* The file is opened only as a permission probe and is closed
+ again at once */
+ if (fh == -1) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ close(fh);
+ }
+ }
+
+#endif /* _WIN_ */
+
+ /* Copy the timestamps and the size from the stat result */
+ stat_info->ctime = statinfo.st_ctime;
+ stat_info->atime = statinfo.st_atime;
+ stat_info->mtime = statinfo.st_mtime;
+ stat_info->size = statinfo.st_size;
+
+ return(DB_SUCCESS);
+}
+
+/* Path name separator character: a backslash when __WIN__ is defined,
+a forward slash otherwise. */
+#ifdef __WIN__
+# define OS_FILE_PATH_SEPARATOR '\\'
+#else
+# define OS_FILE_PATH_SEPARATOR '/'
+#endif
+
+/****************************************************************//**
+Builds a new path name in which the basename of old_path is replaced by a
+new one. The directory part of old_path is everything before its last
+OS_FILE_PATH_SEPARATOR (all of old_path if it contains none). The new base
+name is the part of tablename after its last '/' (all of tablename if it
+contains none). The result is "<directory><separator><base name>.ibd".
+Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the callers
+responsibility to free the return value after it is no longer needed.
+
+@return own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+	const char*	old_path,	/*!< in: pathname */
+	const char*	tablename)	/*!< in: contains new base name */
+{
+	/* The table name follows the last '/' of "databasename/tablename" */
+	const char*	name_slash = strrchr(tablename, '/');
+	const char*	new_base = name_slash != NULL
+		? name_slash + 1 : tablename;
+
+	/* Keep only the directory part of the old path; the old basename
+	after the last separator is dropped. */
+	const char*	dir_end = strrchr(old_path, OS_FILE_PATH_SEPARATOR);
+	const ulint	dir_len = dir_end != NULL
+		? (ulint) (dir_end - old_path) : strlen(old_path);
+
+	/* sizeof "/.ibd" covers the separator, the extension and the
+	terminating NUL */
+	const ulint	new_path_len
+		= dir_len + strlen(new_base) + sizeof "/.ibd";
+	char*		new_path
+		= static_cast<char*>(mem_alloc(new_path_len));
+
+	memcpy(new_path, old_path, dir_len);
+
+	ut_snprintf(new_path + dir_len,
+		    new_path_len - dir_len,
+		    "%c%s.ibd",
+		    OS_FILE_PATH_SEPARATOR,
+		    new_base);
+
+	return(new_path);
+}
+
+/****************************************************************//**
+Builds a remote path name from a data directory path and a table name.
+Everything from the last OS_FILE_PATH_SEPARATOR of data_dir_path onwards is
+dropped (nothing is dropped if there is no separator); then a separator,
+the tablename as given, a '.' and the extension are appended. The result is
+finally passed through srv_normalize_path_for_win(). All input and output
+strings are null-terminated.
+
+This function allocates memory to be returned. It is the callers
+responsibility to free the return value after it is no longer needed.
+
+@return own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+	const char*	data_dir_path,	/*!< in: pathname */
+	const char*	tablename,	/*!< in: tablename */
+	const char*	extention)	/*!< in: file extention; ibd,cfg */
+{
+	ut_ad(extention && strlen(extention) == 3);
+
+	/* Strip the old basename or tablename that starts after the last
+	separator. */
+	const char*	dir_end
+		= strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
+	const ulint	data_dir_len = dir_end != NULL
+		? (ulint) (dir_end - data_dir_path) : strlen(data_dir_path);
+
+	/* sizeof "/." covers the separator, the dot and the terminating
+	NUL */
+	const ulint	new_path_len = data_dir_len + strlen(tablename)
+		+ sizeof "/." + strlen(extention);
+	char*		new_path
+		= static_cast<char*>(mem_alloc(new_path_len));
+
+	memcpy(new_path, data_dir_path, data_dir_len);
+
+	ut_snprintf(new_path + data_dir_len,
+		    new_path_len - data_dir_len,
+		    "%c%s.%s",
+		    OS_FILE_PATH_SEPARATOR,
+		    tablename,
+		    extention);
+
+	srv_normalize_path_for_win(new_path);
+
+	return(new_path);
+}
+
+/****************************************************************//**
+Reduces a null-terminated full remote path name, in place, to the path that
+is sent by MySQL for a DATA DIRECTORY clause: the
+'databasename/tablename.ibd' at the end of the path is replaced with just
+'tablename'.
+
+The result is never longer than the input, so no memory is allocated; the
+caller owns the buffer.
+
+If the path has no '.', or fewer than two separators before the extension,
+the function returns early. NUL bytes that were already written at the
+extension dot and at the last separator stay in place in that case. The
+result is used to inform a SHOW CREATE TABLE command. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
+{
+	/* Cut off the extension at its period. */
+	char*	dot = strrchr(data_dir_path, '.');
+
+	if (dot == NULL) {
+		return;
+	}
+
+	*dot = '\0';
+
+	/* The tablename starts after the last slash. */
+	char*	name_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
+
+	if (name_sep == NULL) {
+		return;
+	}
+
+	*name_sep = '\0';
+
+	const char*	tablename = name_sep + 1;
+
+	/* The databasename starts after the next to last slash. */
+	char*	db_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
+
+	if (db_sep == NULL) {
+		return;
+	}
+
+	const ulint	tablename_len = ut_strlen(tablename);
+	char*		dest = db_sep + 1;
+
+	/* Slide the tablename over the databasename; the two regions may
+	overlap. */
+	ut_memmove(dest, tablename, tablename_len);
+
+	dest[tablename_len] = '\0';
+}
+
+/****************************************************************//**
+Returns the directory component of a null-terminated pathname string:
+everything before the last OS_FILE_PATH_SEPARATOR in path.
+
+If path does not contain a separator, "." is returned. If the only
+separator is the first character of path, "/" is returned. A trailing
+separator is not stripped before the search, so it is itself the "last
+separator".
+
+The return value is a copy allocated from heap. It is the caller
+responsibility to free it after it is no longer needed.
+
+Results of this implementation where the separator is '/':
+
+	path		returned
+	"/usr/lib"	"/usr"
+	"/usr/"		"/usr"
+	"usr"		"."
+	"/"		"/"
+	"."		"."
+	".."		"."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+	const char*	path)	/*!< in: pathname */
+{
+	const char*	separator = strrchr(path, OS_FILE_PATH_SEPARATOR);
+
+	if (separator != NULL && separator != path) {
+		/* Non-trivial directory component: copy what precedes the
+		last separator */
+		return(mem_strdupl(path, separator - path));
+	}
+
+	/* Either there is no separator at all, or the only one is the
+	first character of the path */
+	return(mem_strdup(separator == NULL ? "." : "/"));
+}
+
+/****************************************************************//**
+Creates every directory that is missing on the way to the given path.
+Only the directories leading up to the last component are created, not
+the last component itself. In read-only mode nothing is created and an
+error is logged.
+@return TRUE if call succeeded FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+	const char*	path)	/*!< in: path name */
+{
+	if (srv_read_only_mode) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"read only mode set. Can't create subdirectories '%s'",
+			path);
+
+		return(FALSE);
+	}
+
+	char*	parent = os_file_dirname(path);
+	ibool	ok = TRUE;
+
+	/* A parent that is just the separator (root) or "." (cwd)
+	needs no work. */
+	const bool	trivial = strlen(parent) == 1
+		&& (parent[0] == OS_FILE_PATH_SEPARATOR || parent[0] == '.');
+
+	if (!trivial) {
+		os_file_type_t	kind;
+		ibool		present;
+
+		ok = os_file_status(parent, &present, &kind);
+
+		if (ok && !present) {
+			/* Make sure the ancestors exist first, then
+			create the parent itself. */
+			ok = os_file_create_subdirs_if_needed(parent);
+
+			if (ok) {
+				ok = os_file_create_directory(parent, FALSE);
+			}
+		}
+	}
+
+	mem_free(parent);
+
+	return(ok);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Looks up a slot of the aio array by position. The index is checked
+against the slot count of the array with ut_a().
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_get_nth_slot(
+/*======================*/
+	os_aio_array_t*	array,	/*!< in: aio array */
+	ulint		index)	/*!< in: index of the slot */
+{
+	ut_a(index < array->n_slots);
+
+	return(array->slots + index);
+}
+
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+Creates an io_context for native linux AIO. When io_setup() fails with
+EAGAIN the call is repeated, with a sleep in between, for up to
+OS_AIO_IO_SETUP_RETRY_ATTEMPTS further attempts; any other failure is
+reported right away.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_create_io_ctx(
+/*=======================*/
+	ulint		max_events,	/*!< in: number of events. */
+	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
+{
+	ulint	n_retries = 0;
+
+	for (;;) {
+		memset(io_ctx, 0x0, sizeof(*io_ctx));
+
+		/* Set up the context, telling the kernel how many
+		pending IO requests it will have to handle. */
+		const int	rc = io_setup(max_events, io_ctx);
+
+		if (rc == 0) {
+#if defined(UNIV_AIO_DEBUG)
+			fprintf(stderr,
+				"InnoDB: Linux native AIO:"
+				" initialized io_ctx for segment\n");
+#endif
+			return(TRUE);
+		}
+
+		if (rc == -EAGAIN) {
+			if (n_retries == 0) {
+				/* Announce the retry policy once. */
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: Warning: io_setup() failed"
+					" with EAGAIN. Will make %d attempts"
+					" before giving up.\n",
+					OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+			}
+
+			if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
+				++n_retries;
+				fprintf(stderr,
+					"InnoDB: Warning: io_setup() attempt"
+					" %lu failed.\n",
+					n_retries);
+				os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
+				continue;
+			}
+
+			/* Out of attempts. */
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error: io_setup() failed"
+				" with EAGAIN after %d attempts.\n",
+				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+		} else if (rc == -ENOSYS) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error: Linux Native AIO interface"
+				" is not supported on this platform. Please"
+				" check your OS documentation and install"
+				" appropriate binary of InnoDB.\n");
+		} else {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Error: Linux Native AIO setup"
+				" returned following error[%d]\n", -rc);
+		}
+
+		break;
+	}
+
+	/* Every failure ends with the same hint. */
+	fprintf(stderr,
+		"InnoDB: You can disable Linux Native AIO by"
+		" setting innodb_use_native_aio = 0 in my.cnf\n");
+
+	return(FALSE);
+}
+
+/******************************************************************//**
+Checks if the system supports native linux aio. On some kernel
+versions where native aio is supported it won't work on tmpfs. In such
+cases we can't use native aio as it is not possible to mix simulated
+and native aio.
+
+The check creates a throw-away io_context and pushes one request through
+it: a page-sized write to a temporary file or, in read-only mode, a
+512-byte read of ib_logfile0 in srv_log_group_home_dir. Once created,
+the io_context is destroyed again on every path out of the function.
+@return: TRUE if supported, FALSE otherwise. */
+static
+ibool
+os_aio_native_aio_supported(void)
+/*=============================*/
+{
+ int fd;
+ io_context_t io_ctx;
+ char name[1000];
+
+ if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
+ /* The platform does not support native aio. */
+ return(FALSE);
+ } else if (!srv_read_only_mode) {
+ /* Now check if tmpdir supports native aio ops. */
+ fd = innobase_mysql_tmpfile(NULL);
+
+ if (fd < 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to create temp file to check "
+ "native AIO support.");
+
+ /* Release the probe context before giving up. */
+ io_destroy(io_ctx);
+
+ return(FALSE);
+ }
+ } else {
+
+ srv_normalize_path_for_win(srv_log_group_home_dir);
+
+ ulint dirnamelen = strlen(srv_log_group_home_dir);
+ ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
+ memcpy(name, srv_log_group_home_dir, dirnamelen);
+
+ /* Add a path separator if needed. */
+ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ name[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ strcpy(name + dirnamelen, "ib_logfile0");
+
- fd = ::open(name, O_RDONLY);
++ fd = ::open(name, O_RDONLY | O_CLOEXEC);
+
+ if (fd == -1) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to open \"%s\" to check "
+ "native AIO read support.", name);
+
+ /* Release the probe context before giving up. */
+ io_destroy(io_ctx);
+
+ return(FALSE);
+ }
+ }
+
+ struct io_event io_event;
+
+ memset(&io_event, 0x0, sizeof(io_event));
+
+ byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
+ byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+
+ struct iocb iocb;
+
+ /* Suppress valgrind warning. */
+ memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
+ memset(&iocb, 0x0, sizeof(iocb));
+
+ struct iocb* p_iocb = &iocb;
+
+ if (!srv_read_only_mode) {
+ io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
+ } else {
+ ut_a(UNIV_PAGE_SIZE >= 512);
+ io_prep_pread(p_iocb, fd, ptr, 512, 0);
+ }
+
+ int err = io_submit(io_ctx, 1, &p_iocb);
+
+ if (err >= 1) {
+ /* Now collect the submitted IO request. */
+ err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
+ }
+
+ ut_free(buf);
+ close(fd);
+
+ /* The context was only needed for this probe; without this call it
+ would stay allocated for the life of the process. */
+ io_destroy(io_ctx);
+
+ switch (err) {
+ case 1:
+ return(TRUE);
+
+ case -EINVAL:
+ case -ENOSYS:
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Linux Native AIO not supported. You can either "
+ "move %s to a file system that supports native "
+ "AIO or you can set innodb_use_native_aio to "
+ "FALSE to avoid this message.",
+ srv_read_only_mode ? name : "tmpdir");
+
+ /* fall through. */
+ default:
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Linux Native AIO check on %s returned error[%d]",
+ srv_read_only_mode ? name : "tmpdir", -err);
+ }
+
+ return(FALSE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/******************************************************************//**
+Creates an aio wait array. Note that we return NULL in case of failure.
+We don't care about freeing memory here because we assume that a
+failure will result in server refusing to start up.
+
+All n slots are zero-filled and then marked as not reserved. When Linux
+native AIO is in use, one io_context per segment, one io_event per slot
+and the buffers for pending requests are allocated too; if creating an
+io_context fails, native AIO is switched off and that part is skipped.
+@return own: aio array, NULL on failure */
+static
+os_aio_array_t*
+os_aio_array_create(
+/*================*/
+ ulint n, /*!< in: maximum number of pending aio
+ operations allowed; n must be
+ divisible by n_segments */
+ ulint n_segments) /*!< in: number of segments in the aio array */
+{
+ os_aio_array_t* array;
+#ifdef LINUX_NATIVE_AIO
+ struct io_event* io_event = NULL;
+#endif
+ ut_a(n > 0);
+ ut_a(n_segments > 0);
+
+ array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
+ memset(array, 0x0, sizeof(*array));
+
+ array->mutex = os_mutex_create();
+ array->not_full = os_event_create();
+ array->is_empty = os_event_create();
+
+ os_event_set(array->is_empty);
+
+ array->n_slots = n;
+ array->n_segments = n_segments;
+
+ array->slots = static_cast<os_aio_slot_t*>(
+ ut_malloc(n * sizeof(*array->slots)));
+
+ /* Clear the whole slot array. The length must be the byte count
+ itself, not sizeof() of it: the latter is only the size of a
+ size_t and would leave almost all of the array uninitialized. */
+ memset(array->slots, 0x0, n * sizeof(*array->slots));
+
+#if defined(LINUX_NATIVE_AIO)
+ array->aio_ctx = NULL;
+ array->aio_events = NULL;
+
+ /* If we are not using native aio interface then skip this
+ part of initialization. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Initialize the io_context array. One io_context
+ per segment in the array. */
+
+ array->aio_ctx = static_cast<io_context**>(
+ ut_malloc(n_segments * sizeof(*array->aio_ctx)));
+
+ for (ulint i = 0; i < n_segments; ++i) {
+ if (!os_aio_linux_create_io_ctx(n/n_segments,
+ &array->aio_ctx[i])) {
+ /* If something bad happened during aio setup
+ we disable linux native aio.
+ The disadvantage will be a small memory leak
+ at shutdown but that's ok compared to a crash
+ or a not working server.
+ This frequently happens when running the test suite
+ with many threads on a system with low fs.aio-max-nr!
+ */
+
+ fprintf(stderr,
+ " InnoDB: Warning: Linux Native AIO disabled "
+ "because os_aio_linux_create_io_ctx() "
+ "failed. To get rid of this warning you can "
+ "try increasing system "
+ "fs.aio-max-nr to 1048576 or larger or "
+ "setting innodb_use_native_aio = 0 in my.cnf\n");
+ srv_use_native_aio = FALSE;
+ goto skip_native_aio;
+ }
+ }
+
+ /* Initialize the event array. One event per slot. */
+ io_event = static_cast<struct io_event*>(
+ ut_malloc(n * sizeof(*io_event)));
+
+ memset(io_event, 0x0, sizeof(*io_event) * n);
+ array->aio_events = io_event;
+
+ /* Buffer for requests that are queued but not yet submitted: one
+ pointer per slot, plus a per-segment count of buffered requests. */
+ array->pending = static_cast<struct iocb**>(
+ ut_malloc(n * sizeof(struct iocb*)));
+ memset(array->pending, 0x0, sizeof(struct iocb*) * n);
+ array->count = static_cast<ulint*>(
+ ut_malloc(n_segments * sizeof(ulint)));
+ memset(array->count, 0x0, sizeof(ulint) * n_segments);
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+ slot->pos = i;
+ slot->reserved = FALSE;
+#ifdef LINUX_NATIVE_AIO
+ memset(&slot->control, 0x0, sizeof(slot->control));
+ slot->n_bytes = 0;
+ slot->ret = 0;
+#endif /* LINUX_NATIVE_AIO */
+ }
+
+ return(array);
+}
+
+/************************************************************************//**
+Releases an aio wait array together with the mutex, events and buffers
+it holds, then sets the caller's pointer to 0. */
+static
+void
+os_aio_array_free(
+/*==============*/
+	os_aio_array_t*& array)	/*!< in, own: array to free */
+{
+	os_mutex_free(array->mutex);
+	os_event_free(array->not_full);
+	os_event_free(array->is_empty);
+
+#if defined(LINUX_NATIVE_AIO)
+	if (srv_use_native_aio) {
+		ut_free(array->aio_events);
+		ut_free(array->aio_ctx);
+
+#ifdef UNIV_DEBUG
+		/* No request may still be buffered for submission. */
+		for (size_t slot_no = 0; slot_no < array->n_slots; ++slot_no) {
+			ut_ad(array->pending[slot_no] == NULL);
+		}
+
+		for (size_t seg_no = 0; seg_no < array->n_segments; ++seg_no) {
+			ut_ad(array->count[seg_no] == 0);
+		}
+#endif
+
+		ut_free(array->pending);
+		ut_free(array->count);
+	}
+#endif /* LINUX_NATIVE_AIO */
+
+	ut_free(array->slots);
+	ut_free(array);
+
+	/* The parameter is a reference, so this clears the caller's
+	pointer as well. */
+	array = 0;
+}
+
+/***********************************************************************
+Initializes the asynchronous io system. Always creates the read array
+and the sync array; unless the server is in read-only mode it also
+creates one array each for ibuf and log i/o and the write array. The
+read and write arrays are divided logically into n_read_segs and
+n_write_segs segments respectively. The caller must create an i/o
+handler thread for each segment in these arrays. No i/o handler thread
+needs to be created for the sync array.
+
+If native AIO is requested on Linux but the support check fails, native
+AIO is switched off here. When native AIO is not in use, one wait event
+per segment is created for the simulated aio handler threads.
+@return TRUE on success, FALSE if one of the arrays could not be created */
+UNIV_INTERN
+ibool
+os_aio_init(
+/*========*/
+ ulint n_per_seg, /*!< in: maximum number of pending aio
+ operations allowed per segment */
+ ulint n_read_segs, /*!< in: number of reader threads */
+ ulint n_write_segs, /*!< in: number of writer threads */
+ ulint n_slots_sync) /*!< in: number of slots in the sync aio
+ array */
+{
+ os_io_init_simple();
+
+#if defined(LINUX_NATIVE_AIO)
+ /* Check if native aio is supported on this system and tmpfs */
+ if (srv_use_native_aio && !os_aio_native_aio_supported()) {
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
+
+ srv_use_native_aio = FALSE;
+ }
+#endif /* LINUX_NATIVE_AIO */
+
+ srv_reset_io_thread_op_info();
+
+ os_aio_read_array = os_aio_array_create(
+ n_read_segs * n_per_seg, n_read_segs);
+
+ if (os_aio_read_array == NULL) {
+ return(FALSE);
+ }
+
+ ulint start = (srv_read_only_mode) ? 0 : 2;
+ ulint n_segs = n_read_segs + start;
+
+ /* Outside read-only mode segment 0 is the insert buffer (ibuf)
+ segment and segment 1 is the log segment, so the read segments
+ start at 2. In read-only mode they start at 0. */
+ for (ulint i = start; i < n_segs; ++i) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "read thread";
+ }
+
+ ulint n_segments = n_read_segs;
+
+ if (!srv_read_only_mode) {
+
+ os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+
+ if (os_aio_log_array == NULL) {
+ return(FALSE);
+ }
+
+ ++n_segments;
+
+ srv_io_thread_function[1] = "log thread";
+
+ os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+ if (os_aio_ibuf_array == NULL) {
+ return(FALSE);
+ }
+
+ ++n_segments;
+
+ srv_io_thread_function[0] = "insert buffer thread";
+
+ os_aio_write_array = os_aio_array_create(
+ n_write_segs * n_per_seg, n_write_segs);
+
+ if (os_aio_write_array == NULL) {
+ return(FALSE);
+ }
+
+ n_segments += n_write_segs;
+
+ for (ulint i = start + n_read_segs; i < n_segments; ++i) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "write thread";
+ }
+
+ ut_ad(n_segments >= 4);
+ } else {
+ ut_ad(n_segments > 0);
+ }
+
+ os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+
+ if (os_aio_sync_array == NULL) {
+ return(FALSE);
+ }
+
+ os_aio_n_segments = n_segments;
+
+ os_aio_validate();
+
+ os_last_printout = ut_time();
+
+#ifdef _WIN32
+ ut_a(completion_port == 0 && read_completion_port == 0);
+ completion_port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+ read_completion_port = srv_read_only_mode? completion_port : CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+ ut_a(completion_port && read_completion_port);
+#endif
+
+ if (srv_use_native_aio) {
+ return(TRUE);
+ }
+
+ os_aio_segment_wait_events = static_cast<os_event_t*>(
+ ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
+
+ for (ulint i = 0; i < n_segments; ++i) {
+ os_aio_segment_wait_events[i] = os_event_create();
+ }
+
+ return(TRUE);
+}
+
+/***********************************************************************
+Frees the asynchronous io system: the aio arrays, the per-segment wait
+events used by simulated aio, and the file seek mutexes. */
+UNIV_INTERN
+void
+os_aio_free(void)
+/*=============*/
+{
+	/* Arrays that may not have been created, in release order. */
+	os_aio_array_t** const	optional_arrays[] = {
+		&os_aio_ibuf_array,
+		&os_aio_log_array,
+		&os_aio_write_array,
+		&os_aio_sync_array
+	};
+	const ulint	n_optional = sizeof(optional_arrays)
+		/ sizeof(optional_arrays[0]);
+
+	for (ulint k = 0; k < n_optional; ++k) {
+		if (*optional_arrays[k] != 0) {
+			os_aio_array_free(*optional_arrays[k]);
+		}
+	}
+
+	/* The read array is released without a check. */
+	os_aio_array_free(os_aio_read_array);
+
+	if (!srv_use_native_aio) {
+		for (ulint seg = 0; seg < os_aio_n_segments; ++seg) {
+			os_event_free(os_aio_segment_wait_events[seg]);
+		}
+	}
+
+	for (ulint m = 0; m < OS_FILE_N_SEEK_MUTEXES; ++m) {
+		os_mutex_free(os_file_seek_mutexes[m]);
+	}
+
+	ut_free(os_aio_segment_wait_events);
+	os_aio_segment_wait_events = 0;
+	os_aio_n_segments = 0;
+#ifdef _WIN32
+	completion_port = 0;
+	read_completion_port = 0;
+#endif
+}
+
+#ifdef WIN_ASYNC_IO
+/************************************************************************//**
+Posts a shutdown notification to both i/o completion ports so that the
+async i/o threads in Windows async i/o wake up at shutdown. Does nothing
+when no completion port has been created. The array argument is not
+consulted. */
+static
+void
+os_aio_array_wake_win_aio_at_shutdown(
+/*==================================*/
+	os_aio_array_t*	array)	/*!< in: aio array */
+{
+	if (!completion_port) {
+		return;
+	}
+
+	PostQueuedCompletionStatus(completion_port, 0, IOCP_SHUTDOWN_KEY, NULL);
+	PostQueuedCompletionStatus(read_completion_port, 0, IOCP_SHUTDOWN_KEY, NULL);
+}
+#endif
+
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void)
+/*=====================================*/
+{
+#ifdef WIN_ASYNC_IO
+	/* Windows native aio: wake once for the read array, then once
+	for each of the other arrays that exists. */
+	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
+
+	os_aio_array_t* const	others[] = {
+		os_aio_write_array,
+		os_aio_ibuf_array,
+		os_aio_log_array
+	};
+
+	for (ulint k = 0; k < sizeof(others) / sizeof(others[0]); ++k) {
+		if (others[k] != 0) {
+			os_aio_array_wake_win_aio_at_shutdown(others[k]);
+		}
+	}
+#elif defined(LINUX_NATIVE_AIO)
+	/* With the native AIO interface the io helper threads wait on
+	io_getevents with a timeout value of 500ms and check the server
+	status at each wake up, so there is nothing to do for them. */
+#endif /* !WIN_ASYNC_AIO */
+
+	if (!srv_use_native_aio) {
+		/* Simulated aio: signal the wait event of every segment */
+
+		for (ulint seg = 0; seg < os_aio_n_segments; ++seg) {
+			os_event_set(os_aio_segment_wait_events[seg]);
+		}
+	}
+}
+
+/************************************************************************//**
+Blocks until the is_empty event of os_aio_write_array is signaled, that
+is, until that array has no pending writes. There can be other,
+synchronous, pending writes. Must not be used in read-only mode. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void)
+/*=====================================*/
+{
+	ut_ad(!srv_read_only_mode);
+
+	os_event_t	writes_drained = os_aio_write_array->is_empty;
+
+	os_event_wait(writes_drained);
+}
+
+/**********************************************************************//**
+Works out the global segment number a slot belongs to. The ibuf and log
+arrays map to their fixed segments; for the read and write arrays the
+number is derived from the position of the slot within the array.
+@return segment number (which is the number used by, for example,
+i/o-handler threads) */
+static
+ulint
+os_aio_get_segment_no_from_slot(
+/*============================*/
+	os_aio_array_t*	array,	/*!< in: aio wait array */
+	os_aio_slot_t*	slot)	/*!< in: slot in this array */
+{
+	if (array == os_aio_ibuf_array) {
+		ut_ad(!srv_read_only_mode);
+
+		return(IO_IBUF_SEGMENT);
+	}
+
+	if (array == os_aio_log_array) {
+		ut_ad(!srv_read_only_mode);
+
+		return(IO_LOG_SEGMENT);
+	}
+
+	if (array == os_aio_read_array) {
+		const ulint	per_seg = os_aio_read_array->n_slots
+			/ os_aio_read_array->n_segments;
+
+		/* Read segments start at 0 in read-only mode and
+		at 2 otherwise. */
+		const ulint	first_read_seg = srv_read_only_mode ? 0 : 2;
+
+		return(first_read_seg + slot->pos / per_seg);
+	}
+
+	/* Only the write array is left. */
+	ut_ad(!srv_read_only_mode);
+	ut_a(array == os_aio_write_array);
+
+	const ulint	per_seg = os_aio_write_array->n_slots
+		/ os_aio_write_array->n_segments;
+
+	return(os_aio_read_array->n_segments + 2 + slot->pos / per_seg);
+}
+
+/**********************************************************************//**
+Maps a global segment number to the aio array that owns it and to the
+number of the segment within that array. In read-only mode every
+segment belongs to the read array.
+@return local segment number within the aio array */
+static
+ulint
+os_aio_get_array_and_local_segment(
+/*===============================*/
+	os_aio_array_t** array,		/*!< out: aio wait array */
+	ulint		 global_segment)/*!< in: global segment number */
+{
+	ut_a(global_segment < os_aio_n_segments);
+
+	if (srv_read_only_mode) {
+		*array = os_aio_read_array;
+
+		return(global_segment);
+	}
+
+	if (global_segment == IO_IBUF_SEGMENT) {
+		*array = os_aio_ibuf_array;
+
+		return(0);
+	}
+
+	if (global_segment == IO_LOG_SEGMENT) {
+		*array = os_aio_log_array;
+
+		return(0);
+	}
+
+	/* The read segments follow the two single-segment arrays and
+	the write segments follow the read segments. */
+	const ulint	first_write_seg = os_aio_read_array->n_segments + 2;
+
+	if (global_segment < first_write_seg) {
+		*array = os_aio_read_array;
+
+		return(global_segment - 2);
+	}
+
+	*array = os_aio_write_array;
+
+	return(global_segment - first_write_seg);
+}
+
+/*******************************************************************//**
+Requests for a slot in the aio array. If no slot is available, waits until
+not_full-event becomes signaled. The slot is marked reserved and filled
+in with the request parameters while the array mutex is held; with
+Windows async i/o or Linux native AIO the platform control block in the
+slot is prepared as well.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_reserve_slot(
+/*======================*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ os_aio_array_t* array, /*!< in: aio array */
+ fil_node_t* message1,/*!< in: message to be passed along with
+ the aio operation */
+ void* message2,/*!< in: message to be passed along with
+ the aio operation */
+ pfs_os_file_t file, /*!< in: file handle */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset */
+ ulint len, /*!< in: length of the block to read or write */
+ ulint space_id) /*!< in: value stored in the space_id field of the slot */
+{
+ os_aio_slot_t* slot = NULL;
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED* control;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ struct iocb* iocb;
+ off_t aio_offset;
+
+#endif /* WIN_ASYNC_IO */
+ ulint i;
+ ulint counter;
+ ulint slots_per_seg;
+ ulint local_seg;
+
+#ifdef WIN_ASYNC_IO
+ ut_a((len & 0xFFFFFFFFUL) == len);
+#endif /* WIN_ASYNC_IO */
+
+ /* No need of a mutex. Only reading constant fields */
+ slots_per_seg = array->n_slots / array->n_segments;
+
+ /* We attempt to keep adjacent blocks in the same local
+ segment. This can help in merging IO requests when we are
+ doing simulated AIO */
+ local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
+ % array->n_segments;
+
+loop:
+ os_mutex_enter(array->mutex);
+
+ if (array->n_reserved == array->n_slots) {
+ /* Every slot is taken: release the mutex and wait for one to
+ be freed, then start over. */
+ os_mutex_exit(array->mutex);
+
+ if (!srv_use_native_aio) {
+ /* If the handler threads are suspended, wake them
+ so that we get more slots */
+
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ os_event_wait(array->not_full);
+
+ goto loop;
+ }
+
+ /* We start our search for an available slot from our preferred
+ local segment and do a full scan of the array. We are
+ guaranteed to find a slot in full scan. */
+ for (i = local_seg * slots_per_seg, counter = 0;
+ counter < array->n_slots;
+ i++, counter++) {
+
+ i %= array->n_slots;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved == FALSE) {
+ goto found;
+ }
+ }
+
+ /* We MUST always be able to get hold of a free slot: the check
+ above, made under the same mutex, found n_reserved < n_slots. */
+ ut_error;
+
+found:
+ ut_a(slot->reserved == FALSE);
+ array->n_reserved++;
+
+ if (array->n_reserved == 1) {
+ /* The array just stopped being empty. */
+ os_event_reset(array->is_empty);
+ }
+
+ if (array->n_reserved == array->n_slots) {
+ /* The array just became full. */
+ os_event_reset(array->not_full);
+ }
+
+ slot->reserved = TRUE;
+ slot->reservation_time = ut_time();
+ slot->message1 = message1;
+ slot->message2 = message2;
+ slot->file = file;
+ slot->name = name;
+ slot->len = len;
+ slot->type = type;
+ slot->buf = static_cast<byte*>(buf);
+ slot->offset = offset;
+ slot->io_already_done = FALSE;
+ slot->space_id = space_id;
+
+#ifdef WIN_ASYNC_IO
+ /* Split the 64-bit file offset into the two 32-bit halves of the
+ OVERLAPPED structure. */
+ control = &slot->control;
+ control->Offset = (DWORD) offset & 0xFFFFFFFF;
+ control->OffsetHigh = (DWORD) (offset >> 32);
+ control->hEvent = 0;
+ slot->arr = array;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ /* If we are not using native AIO skip this part. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Check if we are dealing with 64 bit arch.
+ If not then make sure that offset fits in 32 bits. */
+ aio_offset = (off_t) offset;
+
+ ut_a(sizeof(aio_offset) >= sizeof(offset)
+ || ((os_offset_t) aio_offset) == offset);
+
+ iocb = &slot->control;
+
+ if (type == OS_FILE_READ) {
+ io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
+ } else {
+ ut_a(type == OS_FILE_WRITE);
+ io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
+ }
+
+ /* Point the iocb back at its slot. */
+ iocb->data = (void*) slot;
+ slot->n_bytes = 0;
+ slot->ret = 0;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+ os_mutex_exit(array->mutex);
+
+ return(slot);
+}
+
+/*******************************************************************//**
+Returns a slot of the aio array to the pool of free slots. Signals the
+not_full event when the array stops being full and the is_empty event
+when no reserved slot is left. All of this is done under the array
+mutex. */
+static
+void
+os_aio_array_free_slot(
+/*===================*/
+	os_aio_array_t*	array,	/*!< in: aio array */
+	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
+{
+	os_mutex_enter(array->mutex);
+
+	ut_ad(slot->reserved);
+	slot->reserved = FALSE;
+
+	const ulint	still_reserved = --array->n_reserved;
+
+	if (still_reserved == array->n_slots - 1) {
+		os_event_set(array->not_full);
+	}
+
+	if (still_reserved == 0) {
+		os_event_set(array->is_empty);
+	}
+
+#ifdef LINUX_NATIVE_AIO
+
+	if (!srv_use_native_aio) {
+		/* Without native AIO these fields are expected to be
+		untouched. */
+		ut_ad(slot->n_bytes == 0);
+		ut_ad(slot->ret == 0);
+	} else {
+		/* Wipe the native AIO state of the slot. */
+		memset(&slot->control, 0x0, sizeof(slot->control));
+		slot->n_bytes = 0;
+		slot->ret = 0;
+	}
+
+#endif
+	os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Signals the wait event of one simulated aio segment if at least one slot
+of that segment is reserved, i.e. if its i/o-handler thread has
+something to do. */
+static
+void
+os_aio_simulated_wake_handler_thread(
+/*=================================*/
+	ulint	global_segment)	/*!< in: the number of the segment in the aio
+				arrays */
+{
+	os_aio_array_t*	array;
+
+	ut_ad(!srv_use_native_aio);
+
+	const ulint	local_seg = os_aio_get_array_and_local_segment(
+		&array, global_segment);
+	const ulint	per_seg = array->n_slots / array->n_segments;
+	const ulint	first = local_seg * per_seg;
+
+	/* Scan the slots of this segment, stopping at the first
+	reserved one. */
+	bool	has_request = false;
+
+	os_mutex_enter(array->mutex);
+
+	for (ulint k = 0; k < per_seg && !has_request; ++k) {
+		has_request = os_aio_array_get_nth_slot(
+			array, first + k)->reserved;
+	}
+
+	os_mutex_exit(array->mutex);
+
+	/* The event is signaled only after the mutex is released. */
+	if (has_request) {
+		os_event_set(os_aio_segment_wait_events[global_segment]);
+	}
+}
+
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do,
+and withdraws the recommendation that read threads sleep. Does nothing
+when native aio is in use. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void)
+/*=======================================*/
+{
+	if (!srv_use_native_aio) {
+		os_aio_recommend_sleep_for_read_threads = FALSE;
+
+		for (ulint seg = 0; seg < os_aio_n_segments; ++seg) {
+			os_aio_simulated_wake_handler_thread(seg);
+		}
+	}
+}
+
+#ifdef _WIN32
+/**********************************************************************//**
+Resets the wait events of the read segments so that a batch of reads can
+be posted first and handled by an i/o-handler thread all at once later.
+os_aio_simulated_wake_handler_threads must be called afterwards, or the
+threads are left sleeping! Does nothing when native aio is in use.
+
+Putting background IO threads to sleep is only meant for Windows with
+simulated AIO: Windows XP seems to schedule background threads too
+eagerly to allow for coalescing during readahead requests. */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep()
+{
+	if (!srv_use_native_aio) {
+		os_aio_recommend_sleep_for_read_threads = TRUE;
+
+		for (ulint seg = 0; seg < os_aio_n_segments; ++seg) {
+			os_aio_array_t*	owner;
+
+			os_aio_get_array_and_local_segment(&owner, seg);
+
+			/* Only segments of the read array are affected. */
+			if (owner == os_aio_read_array) {
+				os_event_reset(os_aio_segment_wait_events[seg]);
+			}
+		}
+	}
+}
+#endif /* _WIN32 */
+
+/** Submit the AIO requests buffered on every segment of the read array
+to the kernel (low level function). Does nothing unless native AIO is in
+use. Once all segments have been submitted the buffer of pending
+requests and the per-segment counts are cleared, and the number of
+submitted requests is added to srv_stats.n_aio_submitted.
+@param acquire_mutex specifies whether to lock array mutex
+*/
+static
+void
+os_aio_dispatch_read_array_submit_low(bool acquire_mutex MY_ATTRIBUTE((unused)))
+{
+ if (!srv_use_native_aio) {
+ return;
+ }
+#if defined(LINUX_NATIVE_AIO)
+ os_aio_array_t* array = os_aio_read_array;
+ ulint total_submitted = 0;
+ if (acquire_mutex)
+ os_mutex_enter(array->mutex);
+ /* Submit aio requests buffered on all segments. */
+ for (ulint i = 0; i < array->n_segments; i++) {
+ const int count = array->count[i];
+ int offset = 0;
+ /* io_submit() may accept only part of a batch, so keep
+ resubmitting the remainder until all of it is queued. */
+ while (offset != count) {
+ struct iocb** const iocb_array = array->pending
+ + i * array->n_slots / array->n_segments
+ + offset;
+ const int partial_count = count - offset;
+ /* io_submit() returns number of successfully queued
+ requests or (-errno).
+ It returns 0 only if the number of iocb blocks passed
+ is also 0. */
+ const int submitted = io_submit(array->aio_ctx[i],
+ partial_count, iocb_array);
+
+ /* This assertion prevents infinite loop in both
+ debug and release modes. */
+ ut_a(submitted != 0);
+
+ if (submitted < 0) {
+ /* Terminating with fatal error */
+ const char* errmsg =
+ strerror(-submitted);
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Trying to submit %d aio requests, "
+ "io_submit() set errno to %d: %s",
+ partial_count, -submitted,
+ errmsg ? errmsg : "<unknown>");
+ }
+ ut_ad(submitted <= partial_count);
+ if (submitted < partial_count)
+ {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Trying to submit %d aio requests, "
+ "io_submit() submitted only %d",
+ partial_count, submitted);
+ }
+ offset += submitted;
+ }
+ total_submitted += count;
+ }
+ /* Reset the aio request buffer. */
+ memset(array->pending, 0x0, sizeof(struct iocb*) * array->n_slots);
+ memset(array->count, 0x0, sizeof(ulint) * array->n_segments);
+
+ if (acquire_mutex)
+ os_mutex_exit(array->mutex);
+
+ srv_stats.n_aio_submitted.add(total_submitted);
+#endif
+}
+
+/** Submit the buffered AIO read requests to the kernel. This is the
+public entry point: it asks the low level function to take the array
+mutex itself. */
+UNIV_INTERN
+void
+os_aio_dispatch_read_array_submit()
+{
+	const bool	lock_array_mutex = true;
+
+	os_aio_dispatch_read_array_submit_low(lock_array_mutex);
+}
+
+#if defined(LINUX_NATIVE_AIO)
+/*******************************************************************//**
+Dispatch an AIO request to the kernel, or, when should_buffer is set,
+queue it in the pending buffer of its segment instead. A buffered
+request is reported as success; the buffer is submitted from here once
+the count for the segment reaches the number of slots per segment. When
+a direct submission fails, errno is set from the io_submit() result.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_dispatch(
+/*==================*/
+	os_aio_array_t*	array,	/*!< in: io request array. */
+	os_aio_slot_t*	slot,	/*!< in: an already reserved slot. */
+	bool		should_buffer)	/*!< in: should buffer the request
+					rather than submit. */
+{
+	ut_ad(slot != NULL);
+	ut_ad(array);
+
+	ut_a(slot->reserved);
+
+	/* The iocb struct lives directly in the slot and there is one
+	io_context per segment. */
+	const ulint	per_seg = array->n_slots / array->n_segments;
+	struct iocb*	request = &slot->control;
+	const ulint	seg_no = slot->pos / per_seg;
+
+	if (!should_buffer) {
+		/* Submit the given request right away. */
+		const int	rc = io_submit(array->aio_ctx[seg_no], 1,
+					       &request);
+
+#if defined(UNIV_AIO_DEBUG)
+		fprintf(stderr,
+			"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
+			(slot->type == OS_FILE_WRITE) ? 'w' : 'r', rc, slot,
+			array->aio_ctx[seg_no], (ulong) seg_no);
+#endif
+
+		/* io_submit returns number of successfully
+		queued requests or -errno. */
+		if (UNIV_UNLIKELY(rc != 1)) {
+			errno = -rc;
+			return(FALSE);
+		}
+
+		return(TRUE);
+	}
+
+	ut_ad(array == os_aio_read_array);
+
+	os_mutex_enter(array->mutex);
+
+	/* array->pending has array->n_slots elements, split into
+	array->n_segments areas of equal size. The iocbs of a segment
+	are stored consecutively in its area in arrival order, and
+	array->count[i] is the number buffered for the ith segment. */
+	ulint&	n_buffered = array->count[seg_no];
+
+	ut_ad(n_buffered != per_seg);
+
+	array->pending[seg_no * per_seg + n_buffered] = request;
+	++n_buffered;
+
+	if (n_buffered == per_seg) {
+		/* The mutex is already held here, so the low level
+		function must not take it again. */
+		os_aio_dispatch_read_array_submit_low(false);
+	}
+
+	os_mutex_exit(array->mutex);
+
+	return(TRUE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not directly this function!
+Requests an asynchronous i/o operation.
+OS_AIO_SYNC requests are executed immediately with os_file_read_func() or
+os_file_write() and their result is returned. For every other mode a slot
+is reserved in the aio array selected by the mode; the request is then
+either submitted through the native aio interface (when srv_use_native_aio
+is set) or left in the slot for a simulated aio handler thread, which is
+woken up unless OS_AIO_SIMULATED_WAKE_LATER was given. If a native
+submission fails, the slot is freed and os_file_handle_error() decides
+whether the whole request is retried.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio_func(
+/*========*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ pfs_os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset where to read or write */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ ulint space_id, /*!< in: passed on unchanged to
+ os_aio_array_reserve_slot(); not used
+ if mode is OS_AIO_SYNC */
+ trx_t* trx, /*!< in: may be NULL; for reads, its
+ io_reads and io_read counters are
+ updated, and it is passed to
+ os_file_read_func() for sync reads */
+ bool should_buffer)
+ /*!< in: Whether to buffer an aio request.
+ AIO read ahead uses this. If you plan to
+ use this parameter, make sure you remember
+ to call os_aio_dispatch_read_array_submit()
+ when you're ready to commit all your requests.*/
+{
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+#ifdef WIN_ASYNC_IO
+ DWORD len = (DWORD) n;
+ BOOL ret;
+#endif
+ ulint wake_later;
+ ut_ad(buf);
+ ut_ad(n > 0);
+ ut_ad(n % OS_MIN_LOG_BLOCK_SIZE == 0);
+ ut_ad(offset % OS_MIN_LOG_BLOCK_SIZE == 0);
+ ut_ad(os_aio_validate_skip());
+#ifdef WIN_ASYNC_IO
+ ut_ad((n & 0xFFFFFFFFUL) == n);
+#endif
+
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; /* split the flag
+ from the mode proper */
+ mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
+
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
+
+ if (mode == OS_AIO_SYNC) {
+ ibool ret;
+ /* This is actually an ordinary synchronous read or write:
+ no need to use an i/o-handler thread */
+
+ if (type == OS_FILE_READ) {
+ ret = os_file_read_func(file.m_file, buf, offset, n, trx);
+ } else {
+ ut_ad(!srv_read_only_mode);
+ ut_a(type == OS_FILE_WRITE);
+
+ ret = os_file_write(name, file, buf, offset, n);
+
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
+
+ if (!ret) {
+ os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE);
+ }
+ }
+
+ if (!ret) {
+ fprintf(stderr, "FAIL"); /* NOTE(review): bare diagnostic
+ without newline, file name or error
+ code - consider a fuller message */
+ }
+
+ return ret;
+ }
+
+try_again:
+ switch (mode) {
+ case OS_AIO_NORMAL:
+ if (type == OS_FILE_READ) {
+ array = os_aio_read_array;
+ } else {
+ ut_ad(!srv_read_only_mode);
+ array = os_aio_write_array;
+ }
+ break;
+ case OS_AIO_IBUF:
+ ut_ad(type == OS_FILE_READ);
+ /* Reduce probability of deadlock bugs in connection with ibuf:
+ do not let the ibuf i/o handler sleep */
+
+ wake_later = FALSE;
+
+ if (srv_read_only_mode) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_ibuf_array;
+ }
+ break;
+ case OS_AIO_LOG:
+ if (srv_read_only_mode) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_log_array;
+ }
+ break;
+ case OS_AIO_SYNC:
+ array = os_aio_sync_array;
+#if defined(LINUX_NATIVE_AIO)
+ /* In Linux native AIO we don't use sync IO array. */
+ ut_a(!srv_use_native_aio);
+#endif /* LINUX_NATIVE_AIO */
+ break;
+ default:
+ ut_error;
+ array = NULL; /* Eliminate compiler warning */
+ }
+
+ if (trx && type == OS_FILE_READ)
+ { /* charge the read to the requesting transaction */
+ trx->io_reads++;
+ trx->io_read += n;
+ }
+ slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
+ name, buf, offset, n, space_id);
+ if (type == OS_FILE_READ) {
+ if (srv_use_native_aio) {
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+#ifdef WIN_ASYNC_IO
+ ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
+ &(slot->control));
+ if(!ret && GetLastError() != ERROR_IO_PENDING)
+ goto err_exit;
+
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot,
+ should_buffer)) {
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else if (type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ if (srv_use_native_aio) {
+ os_n_file_writes++;
+#ifdef WIN_ASYNC_IO
+ ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
+ &(slot->control));
+
+ if(!ret && GetLastError() != ERROR_IO_PENDING)
+ goto err_exit;
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot, false)) { /* writes
+ are never buffered */
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else {
+ ut_error;
+ }
+
+ /* aio was queued successfully! */
+ return(TRUE);
+
+#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
+err_exit:
+#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
+ os_aio_array_free_slot(array, slot);
+
+ if (os_file_handle_error(
+ name,type == OS_FILE_READ ? "aio read" : "aio write")) {
+
+ goto try_again; /* retry with a freshly reserved slot.
+ NOTE(review): the trx counters and the
+ os_n_file_reads/os_n_file_writes statistics
+ are incremented again on every retry */
+ }
+
+ return(FALSE);
+}
+
+#ifdef WIN_ASYNC_IO
+/* Classify an i/o handler segment number: READ_SEGMENT(x) is true when x is
+below srv_n_read_io_threads, WRITE_SEGMENT(x) is its negation. The argument
+and the expansions are parenthesized so that the macros evaluate correctly
+with any argument expression and inside any surrounding expression. */
+#define READ_SEGMENT(x) ((x) < srv_n_read_io_threads)
+#define WRITE_SEGMENT(x) (!READ_SEGMENT(x))
+
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+If the completed request failed and os_file_handle_error() asks for a retry,
+the request is repeated synchronously with os_file_write() or os_file_read()
+before the slot is freed.
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+	ulint	segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads; if
+				this is ULINT_UNDEFINED, then it means that
+				sync aio is used, and this parameter is
+				ignored */
+	ulint	pos,		/*!< this parameter is used only in sync aio:
+				wait for the aio slot at this position;
+				the body below never reads it */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,	/*!< out: second message of the request */
+	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
+	ulint*	space_id)	/*!< out: space_id stored in the slot */
+{
+	os_aio_slot_t*	slot;
+	ibool		ret_val;
+	BOOL		ret;
+	DWORD		len;
+	BOOL		retry		= FALSE;
+	ULONG_PTR	key;
+	HANDLE		port = READ_SEGMENT(segment)? read_completion_port : completion_port;
+
+	for(;;) {
+		ret = GetQueuedCompletionStatus(port, &len, &key,
+			(OVERLAPPED **)&slot, INFINITE);
+
+		/* If shutdown key was received, repost the shutdown message and exit */
+		if (ret && (key == IOCP_SHUTDOWN_KEY)) {
+			PostQueuedCompletionStatus(port, 0, key, NULL);
+			os_thread_exit(NULL);
+		}
+
+		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+			os_thread_exit(NULL);
+		}
+
+		if(WRITE_SEGMENT(segment)&& slot->type == OS_FILE_READ) {
+			/*
+			Redirect read completions to the dedicated completion port
+			and thread. We need to split read and write threads. If we do not
+			do that, and just allow all io threads process all IO, it is possible
+			to get stuck in a deadlock in buffer pool code,
+
+			Currently, the problem is solved this way - "write io" threads
+			always get all completion notifications, from both async reads and
+			writes. Write completion is handled in the same thread that gets it.
+			Read completion is forwarded (via PostQueuedCompletionStatus())
+			to the second completion port dedicated solely to reads. One of the
+			"read io" threads waiting on this port will finally handle the IO.
+
+			Forwarding IO completion this way costs a context switch, and this
+			seems tolerable since asynchronous reads are by far less frequent.
+			*/
+			ut_a(PostQueuedCompletionStatus(read_completion_port, len, key,
+				&slot->control));
+		}
+		else {
+			break;
+		}
+	}
+	*message1 = slot->message1;
+	*message2 = slot->message2;
+
+	*type = slot->type;
+	*space_id = slot->space_id;
+
+	if (ret && len == slot->len) {
+
+		ret_val = TRUE;
+	} else if (os_file_handle_error(slot->name, "Windows aio")) {
+
+		retry = TRUE;
+	} else {
+
+		ret_val = FALSE;
+	}
+
+	if (retry) {
+		/* Repeat the request synchronously at the offset recorded
+		in the OVERLAPPED structure of the slot. */
+		LARGE_INTEGER li;
+		li.LowPart = slot->control.Offset;
+		li.HighPart = slot->control.OffsetHigh;
+
+		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
+
+		switch (slot->type) {
+		case OS_FILE_WRITE:
+			ret_val = os_file_write(slot->name, slot->file, slot->buf,
+				li.QuadPart, slot->len);
+			break;
+		case OS_FILE_READ:
+			ret_val = os_file_read(slot->file, slot->buf,
+				li.QuadPart, slot->len);
+			break;
+		default:
+			ut_error;
+		}
+
+	}
+
+	os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot);
+
+	return(ret_val);
+}
+#endif
+
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+Linux native aio only. Called by an io-thread that found no completed
+request in its segment: reaps completion events for the segment from the
+kernel with io_getevents() and marks the corresponding slots as done.
+io_getevents() is called with a timeout so that the thread regularly gets
+a chance to notice a server shutdown; on a timeout, -EAGAIN or -EINTR the
+call is simply repeated. Any other error code is reported and trapped.
+The function returns once at least one event has been collected, or when
+no event arrived and the server is shutting down. */
+static
+void
+os_aio_linux_collect(
+/*=================*/
+	os_aio_array_t*	array,		/*!< in/out: slot array. */
+	ulint		segment,	/*!< in: local segment no. */
+	ulint		seg_size)	/*!< in: segment size. */
+{
+	int	n_done;
+
+	/* sanity checks. */
+	ut_ad(array != NULL);
+	ut_ad(seg_size > 0);
+	ut_ad(segment < array->n_segments);
+
+	/* Slot positions [first_pos, limit_pos) belong to this segment;
+	the segment's part of the event array starts at the same index. */
+	const ulint		first_pos = segment * seg_size;
+	const ulint		limit_pos = first_pos + seg_size;
+	struct io_event*	seg_events = &array->aio_events[first_pos];
+	struct io_context*	ctx = array->aio_ctx[segment];
+
+	for (;;) {
+		struct timespec	wait_limit;
+
+		/* Start every attempt from a clean event buffer. The
+		timeout value is arbitrary. */
+		memset(seg_events, 0, sizeof(*seg_events) * seg_size);
+		wait_limit.tv_sec = 0;
+		wait_limit.tv_nsec = OS_AIO_REAP_TIMEOUT;
+
+		n_done = io_getevents(ctx, 1, seg_size, seg_events,
+				      &wait_limit);
+
+		if (n_done > 0) {
+			for (int k = 0; k < n_done; k++) {
+				struct iocb*	cb;
+				os_aio_slot_t*	done_slot;
+
+				cb = (struct iocb*) seg_events[k].obj;
+				ut_a(cb != NULL);
+
+				done_slot = (os_aio_slot_t*) cb->data;
+
+				/* Some sanity checks. */
+				ut_a(done_slot != NULL);
+				ut_a(done_slot->reserved);
+
+#if defined(UNIV_AIO_DEBUG)
+				fprintf(stderr,
+					"io_getevents[%c]: slot[%p] ctx[%p]"
+					" seg[%lu]\n",
+					(done_slot->type == OS_FILE_WRITE) ? 'w' : 'r',
+					done_slot, ctx, segment);
+#endif
+
+				/* The slot must lie inside our segment. */
+				ut_a(done_slot->pos >= first_pos);
+				ut_a(done_slot->pos < limit_pos);
+
+				/* Record the outcome; the caller does the
+				error handling. */
+				os_mutex_enter(array->mutex);
+				done_slot->n_bytes = seg_events[k].res;
+				done_slot->ret = seg_events[k].res2;
+				done_slot->io_already_done = TRUE;
+				os_mutex_exit(array->mutex);
+			}
+
+			return;
+		}
+
+		if (UNIV_UNLIKELY(srv_shutdown_state
+				  == SRV_SHUTDOWN_EXIT_THREADS)) {
+			return;
+		}
+
+		/* Only errors of the collection itself are handled here;
+		errors of individual requests are passed on to the caller.
+		-EAGAIN: not enough resources, -EINTR: interrupted with no
+		completed IO available, 0: nothing completed within the
+		timeout. All three mean "try again". */
+		if (n_done == 0 || n_done == -EAGAIN || n_done == -EINTR) {
+			continue;
+		}
+
+		break;
+	}
+
+	/* All other errors should cause a trap for now. */
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
+		n_done);
+	ut_error;
+}
+
+/**********************************************************************//**
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+A request that completed only partially is resubmitted for the remaining
+bytes and the wait continues.
+@return TRUE if the IO was successful; also TRUE, with *message1 and
+*message2 set to NULL, when the server is shutting down and the segment
+has no reserved slot */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+	ulint	global_seg,	/*!< in: segment number in the aio array
+				to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 is log i/o thread,
+				then follow the non-ibuf read threads,
+				and the last are the non-ibuf write
+				threads. */
+	fil_node_t**message1,	/*!< out: the messages passed with the */
+	void**	message2,	/*!< aio request; note that in case the
+				aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation. */
+	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
+	ulint*	space_id)	/*!< out: space_id stored in the slot */
+{
+	ulint		segment;
+	os_aio_array_t*	array;
+	os_aio_slot_t*	slot;
+	ulint		n;
+	ulint		i;
+	ibool		ret = FALSE;
+
+	/* Should never be doing Sync IO here. */
+	ut_a(global_seg != ULINT_UNDEFINED);
+
+	/* Find the array and the local segment. */
+	segment = os_aio_get_array_and_local_segment(&array, global_seg);
+	n = array->n_slots / array->n_segments;
+
+wait_for_event:
+	/* Loop until we have found a completed request. */
+	for (;;) {
+		ibool	any_reserved = FALSE;
+		os_mutex_enter(array->mutex);
+		for (i = 0; i < n; ++i) {
+			slot = os_aio_array_get_nth_slot(
+				array, i + segment * n);
+			if (!slot->reserved) {
+				continue;
+			} else if (slot->io_already_done) {
+				/* Something for us to work on. The array
+				mutex stays locked until the slot has been
+				examined below. */
+				goto found;
+			} else {
+				any_reserved = TRUE;
+			}
+		}
+
+		os_mutex_exit(array->mutex);
+
+		/* There is no completed request.
+		If there is no pending request at all,
+		and the system is being shut down, exit. */
+		if (UNIV_UNLIKELY
+		    (!any_reserved
+		     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+			*message1 = NULL;
+			*message2 = NULL;
+			return(TRUE);
+		}
+
+		/* Wait for some request. Note that we return
+		from wait iff we have found a request. */
+
+		srv_set_io_thread_op_info(global_seg,
+			"waiting for completed aio requests");
+		os_aio_linux_collect(array, segment, n);
+	}
+
+found:
+	/* Note that it may be that there are more then one completed
+	IO requests. We process them one at a time. We may have a case
+	here to improve the performance slightly by dealing with all
+	requests in one sweep. */
+	srv_set_io_thread_op_info(global_seg,
+		"processing completed aio requests");
+
+	/* Ensure that we are scribbling only our segment. */
+	ut_a(i < n);
+
+	ut_ad(slot != NULL);
+	ut_ad(slot->reserved);
+	ut_ad(slot->io_already_done);
+
+	*message1 = slot->message1;
+	*message2 = slot->message2;
+
+	*type = slot->type;
+	*space_id = slot->space_id;
+
+	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
+
+		ret = TRUE;
+	} else if ((slot->ret == 0) && (slot->n_bytes > 0)
+		   && (slot->n_bytes < (long) slot->len)) {
+		/* Partial read or write scenario */
+		int		submit_ret;
+		struct iocb*	iocb;
+		slot->buf = (byte*)slot->buf + slot->n_bytes;
+		slot->offset = slot->offset + slot->n_bytes;
+		slot->len = slot->len - slot->n_bytes;
+		/* Resetting the bytes read/written */
+		slot->n_bytes = 0;
+		slot->io_already_done = FALSE;
+		iocb = &(slot->control);
+
+		if (slot->type == OS_FILE_READ) {
+			io_prep_pread(&slot->control, slot->file.m_file,
+				      slot->buf, slot->len,
+				      (off_t) slot->offset);
+		} else {
+			ut_a(slot->type == OS_FILE_WRITE);
+			io_prep_pwrite(&slot->control, slot->file.m_file,
+				       slot->buf, slot->len,
+				       (off_t) slot->offset);
+		}
+		/* Resubmit an I/O request */
+		submit_ret = io_submit(array->aio_ctx[segment], 1, &iocb);
+		if (submit_ret < 0 ) {
+			/* Aborting in case of submit failure */
+			ib_logf(IB_LOG_LEVEL_FATAL,
+				"Native Linux AIO interface. io_submit()"
+				" call failed when resubmitting a partial"
+				" I/O request on the file %s.",
+				slot->name);
+		} else {
+			ret = FALSE;
+			os_mutex_exit(array->mutex);
+			goto wait_for_event;
+		}
+	} else {
+		/* A failed request is reported by the kernel as a negated
+		errno in io_event.res, which os_aio_linux_collect() stores
+		in slot->n_bytes, while res2 (slot->ret) stays 0. Take the
+		error code from whichever field carries it, so that
+		os_file_handle_error() does not see errno == 0 for a real
+		failure. (Assumes slot->n_bytes is a signed type, as the
+		comparisons above suggest.) */
+		if (slot->ret != 0) {
+			errno = -slot->ret;
+		} else if (slot->n_bytes < 0) {
+			errno = (int) -slot->n_bytes;
+		} else {
+			errno = 0;
+		}
+
+		/* os_file_handle_error does tell us if we should retry
+		this IO. As it stands now, we don't do this retry when
+		reaping requests from a different context than
+		the dispatcher. This non-retry logic is the same for
+		windows and linux native AIO.
+		We should probably look into this to transparently
+		re-submit the IO. */
+		os_file_handle_error(slot->name, "Linux aio");
+
+		ret = FALSE;
+	}
+
+	os_mutex_exit(array->mutex);
+
+	os_aio_array_free_slot(array, slot);
+
+	return(ret);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+The handler first returns any slot of its segment whose i/o has already
+been done. Otherwise it picks a pending request (the oldest one if some
+request is at least 2 seconds old, else the one at the lowest offset),
+collects up to OS_AIO_MERGE_N_CONSECUTIVE requests of the same type that
+are adjacent in the same file, performs them with a single synchronous
+os_file_write() or os_file_read() call, marks all of them done and returns
+the messages of the first one. When nothing is pending it waits on
+os_aio_segment_wait_events[global_segment] and starts over.
+@return TRUE if the aio operation succeeded; also TRUE, with *message1 and
+*message2 set to NULL, when the server is shutting down and the segment
+has no reserved slot */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint global_segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2, /*!< out: second message of the request */
+ ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */
+ ulint* space_id) /*!< out: space_id stored in the slot */
+{
+ os_aio_array_t* array;
+ ulint segment;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ os_offset_t lowest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+ byte* combined_buf2;
+ ibool ret;
+ ibool any_reserved;
+ ulint n;
+ os_aio_slot_t* aio_slot;
+
+ /* Fix compiler warning */
+ *consecutive_ios = NULL;
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+restart:
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (a)");
+ ut_ad(os_aio_validate_skip());
+ ut_ad(segment < array->n_segments);
+
+ n = array->n_slots / array->n_segments;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ if (array == os_aio_read_array
+ && os_aio_recommend_sleep_for_read_threads) {
+
+ /* Give other threads chance to add several i/os to the array
+ at once. */
+
+ goto recommended_sleep;
+ }
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (b)");
+
+ /* Check if there is a slot for which the i/o has already been
+ done */
+ any_reserved = FALSE;
+
+ os_mutex_enter(array->mutex);
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+ "InnoDB: i/o for slot %lu"
+ " already done, returning\n",
+ (ulong) i);
+ }
+
+ aio_slot = slot;
+ ret = TRUE;
+ goto slot_io_done; /* the array mutex is still held and is
+ released at slot_io_done */
+ } else {
+ any_reserved = TRUE;
+ }
+ }
+
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_mutex_exit(array->mutex);
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
+ n_consecutive = 0;
+
+ /* If there are at least 2 seconds old requests, then pick the oldest
+ one to prevent starvation. If several requests have the same age,
+ then pick the one at the lowest offset. */
+
+ biggest_age = 0;
+ lowest_offset = IB_UINT64_MAX;
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (slot->reserved) {
+
+ age = (ulint) difftime(
+ ut_time(), slot->reservation_time);
+
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+ && slot->offset < lowest_offset)) {
+
+ /* Found an i/o request */
+ consecutive_ios[0] = slot;
+
+ n_consecutive = 1;
+
+ biggest_age = age;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (n_consecutive == 0) {
+ /* There were no old requests. Look for an i/o request at the
+ lowest offset in the array. NOTE(review): the comparison below
+ uses the whole slot->offset value; no bits of the offset are
+ masked off here */
+
+ lowest_offset = IB_UINT64_MAX;
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(
+ array, i + segment * n);
+
+ if (slot->reserved && slot->offset < lowest_offset) {
+
+ /* Found an i/o request */
+ consecutive_ios[0] = slot;
+
+ n_consecutive = 1;
+
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (n_consecutive == 0) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+ /* if n_consecutive != 0, then we have assigned
+ something valid to consecutive_ios[0] */
+ ut_ad(n_consecutive != 0);
+ ut_ad(consecutive_ios[0] != NULL);
+
+ aio_slot = consecutive_ios[0];
+
+ /* Check if there are several consecutive blocks to read or write.
+ aio_slot always points to the last request collected so far; each
+ time a request starting exactly where it ends is found, the scan is
+ restarted from the first slot of the segment */
+
+consecutive_loop:
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+ if (slot->reserved
+ && slot != aio_slot
+ && slot->offset == aio_slot->offset + aio_slot->len
+ && slot->type == aio_slot->type
+ && slot->file.m_file == aio_slot->file.m_file) {
+
+ /* Found a consecutive i/o request */
+
+ consecutive_ios[n_consecutive] = slot;
+ n_consecutive++;
+
+ aio_slot = slot;
+
+ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+
+ goto consecutive_loop;
+ } else {
+ break;
+ }
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
+
+ /* We have now collected n_consecutive i/o requests in the array;
+ allocate a single buffer which can hold all data, and perform the
+ i/o */
+
+ total_len = 0;
+ aio_slot = consecutive_ios[0];
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
+ }
+
+ if (n_consecutive == 1) {
+ /* We can use the buffer of the i/o request */
+ combined_buf = aio_slot->buf;
+ combined_buf2 = NULL;
+ } else {
+ combined_buf2 = static_cast<byte*>(
+ ut_malloc(total_len + UNIV_PAGE_SIZE)); /* one extra page
+ leaves room for the alignment below */
+
+ ut_a(combined_buf2);
+
+ combined_buf = static_cast<byte*>(
+ ut_align(combined_buf2, UNIV_PAGE_SIZE));
+ }
+
+ /* We release the array mutex for the time of the i/o: NOTE that
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
+ os_mutex_exit(array->mutex);
+
+ if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+ /* Copy the buffers to the combined buffer */
+ offs = 0;
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
+ consecutive_ios[i]->len);
+
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "doing file i/o");
+
+ /* Do the i/o with ordinary, synchronous i/o functions: */
+ if (aio_slot->type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ ret = os_file_write(
+ aio_slot->name, aio_slot->file, combined_buf,
+ aio_slot->offset, total_len);
+
+ DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
+ os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
+
+ if (!ret) {
+ os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE);
+ }
+
+ } else {
+ ret = os_file_read(
+ aio_slot->file, combined_buf,
+ aio_slot->offset, total_len);
+ }
+
+ srv_set_io_thread_op_info(global_segment, "file i/o done");
+
+ if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
+ /* Copy the combined buffer to individual buffers */
+ offs = 0;
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
+ consecutive_ios[i]->len);
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ if (combined_buf2) {
+ ut_free(combined_buf2);
+ }
+
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+ consecutive_ios[i]->io_already_done = TRUE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+ several slots, the messages will be returned with subsequent calls
+ of this function */
+
+slot_io_done:
+
+ ut_a(aio_slot->reserved);
+
+ *message1 = aio_slot->message1;
+ *message2 = aio_slot->message2;
+
+ *type = aio_slot->type;
+ *space_id = aio_slot->space_id;
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, aio_slot);
+
+ return(ret);
+
+wait_for_io:
+ srv_set_io_thread_op_info(global_segment, "resetting wait event");
+
+ /* We wait here until there again can be i/os in the segment
+ of this thread. The event is reset while the array mutex is still
+ held. */
+
+ os_event_reset(os_aio_segment_wait_events[global_segment]);
+
+ os_mutex_exit(array->mutex);
+
+recommended_sleep:
+ srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
+
+ os_event_wait(os_aio_segment_wait_events[global_segment]);
+
+ goto restart;
+}
+
+/**********************************************************************//**
+Checks an aio array for internal consistency while holding its mutex:
+the array must have slots and segments, every reserved slot must have a
+positive length, and the number of reserved slots must agree with
+array->n_reserved. Any violation trips an assertion.
+@return true if ok */
+static
+bool
+os_aio_array_validate(
+/*==================*/
+	os_aio_array_t*	array)	/*!< in: aio wait array */
+{
+	os_mutex_enter(array->mutex);
+
+	ut_a(array->n_slots > 0);
+	ut_a(array->n_segments > 0);
+
+	ulint	slots_in_use = 0;
+
+	for (ulint pos = 0; pos < array->n_slots; ++pos) {
+		os_aio_slot_t*	candidate
+			= os_aio_array_get_nth_slot(array, pos);
+
+		if (!candidate->reserved) {
+			continue;
+		}
+
+		++slots_in_use;
+		ut_a(candidate->len > 0);
+	}
+
+	ut_a(array->n_reserved == slots_in_use);
+
+	os_mutex_exit(array->mutex);
+
+	return(true);
+}
+
+/**********************************************************************//**
+Validates the consistency of the aio system by validating the read array
+and then every other aio array (write, ibuf, log, sync) that exists.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void)
+/*=================*/
+{
+	/* The read array is validated unconditionally. */
+	os_aio_array_validate(os_aio_read_array);
+
+	/* The remaining arrays are validated only when they are set. */
+	if (os_aio_array_t* write_arr = os_aio_write_array) {
+		os_aio_array_validate(write_arr);
+	}
+
+	if (os_aio_array_t* ibuf_arr = os_aio_ibuf_array) {
+		os_aio_array_validate(ibuf_arr);
+	}
+
+	if (os_aio_array_t* log_arr = os_aio_log_array) {
+		os_aio_array_validate(log_arr);
+	}
+
+	if (os_aio_array_t* sync_arr = os_aio_sync_array) {
+		os_aio_array_validate(sync_arr);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Prints pending IO requests per segment of an aio array, as a
+comma-separated list in square brackets. Nothing is printed for an array
+that has a single segment.
+We probably don't need per segment statistics but they can help us
+during development phase to see if the IO requests are being
+distributed as expected. */
+static
+void
+os_aio_print_segment_info(
+/*======================*/
+	FILE*		file,	/*!< in: file where to print */
+	ulint*		n_seg,	/*!< in: pending IO array, one counter
+				per segment of array */
+	os_aio_array_t*	array)	/*!< in: array to process */
+{
+	ulint	i;
+
+	ut_ad(array);
+	ut_ad(n_seg);
+	ut_ad(array->n_segments > 0);
+
+	if (array->n_segments == 1) {
+		return;
+	}
+
+	fprintf(file, " [");
+	for (i = 0; i < array->n_segments; i++) {
+		if (i != 0) {
+			fprintf(file, ", ");
+		}
+
+		/* ulint is not unsigned long on every platform, so cast
+		to match the %lu conversion, as os_aio_print_array()
+		does for its own counter. */
+		fprintf(file, "%lu", (ulong) n_seg[i]);
+	}
+	fprintf(file, "] ");
+}
+
+/**********************************************************************//**
+Prints the number of reserved slots of an aio array, followed by the
+per-segment breakdown written by os_aio_print_segment_info(). The array
+mutex is held for the whole operation, and the count is cross-checked
+against array->n_reserved. */
+UNIV_INTERN
+void
+os_aio_print_array(
+/*==============*/
+	FILE*		file,	/*!< in: file where to print */
+	os_aio_array_t*	array)	/*!< in: aio array to print */
+{
+	ulint	total_in_use = 0;
+	ulint	in_use_by_segment[SRV_MAX_N_IO_THREADS];
+
+	os_mutex_enter(array->mutex);
+
+	ut_a(array->n_slots > 0);
+	ut_a(array->n_segments > 0);
+
+	memset(in_use_by_segment, 0x0, sizeof(in_use_by_segment));
+
+	for (ulint pos = 0; pos < array->n_slots; ++pos) {
+		os_aio_slot_t*	candidate
+			= os_aio_array_get_nth_slot(array, pos);
+
+		if (!candidate->reserved) {
+			continue;
+		}
+
+		/* Segment that slot number pos belongs to. */
+		const ulint	owner
+			= (pos * array->n_segments) / array->n_slots;
+
+		++total_in_use;
+		++in_use_by_segment[owner];
+
+		ut_a(candidate->len > 0);
+	}
+
+	ut_a(array->n_reserved == total_in_use);
+
+	fprintf(file, " %lu", (ulong) total_in_use);
+
+	os_aio_print_segment_info(file, in_use_by_segment, array);
+
+	os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Prints info of the aio arrays: the state of every i/o thread, the number
+of pending requests in each aio array, the pending flush and i/o counters,
+and the per-second rates since the previous printout. The counters used
+for the rates are reset at the end. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+	FILE*	file)	/*!< in: file where to print */
+{
+	/* One line per i/o handler thread. */
+	for (ulint thr = 0; thr < srv_n_file_io_threads; ++thr) {
+		fprintf(file, "I/O thread %lu state: %s (%s)",
+			(ulong) thr,
+			srv_io_thread_op_info[thr],
+			srv_io_thread_function[thr]);
+
+#ifndef _WIN32
+		if (!srv_use_native_aio
+		    && os_aio_segment_wait_events[thr]->is_set()) {
+			fprintf(file, " ev set");
+		}
+#endif /* _WIN32 */
+
+		fprintf(file, "\n");
+	}
+
+	/* Pending requests per array; only the read array is printed
+	unconditionally. */
+	fputs("Pending normal aio reads:", file);
+	os_aio_print_array(file, os_aio_read_array);
+
+	if (os_aio_write_array != 0) {
+		fputs(", aio writes:", file);
+		os_aio_print_array(file, os_aio_write_array);
+	}
+
+	if (os_aio_ibuf_array != 0) {
+		fputs(",\n ibuf aio reads:", file);
+		os_aio_print_array(file, os_aio_ibuf_array);
+	}
+
+	if (os_aio_log_array != 0) {
+		fputs(", log i/o's:", file);
+		os_aio_print_array(file, os_aio_log_array);
+	}
+
+	if (os_aio_sync_array != 0) {
+		fputs(", sync i/o's:", file);
+		os_aio_print_array(file, os_aio_sync_array);
+	}
+
+	putc('\n', file);
+
+	/* The small constant keeps the divisor of the rates non-zero. */
+	const time_t	now = ut_time();
+	const double	seconds = 0.001 + difftime(now, os_last_printout);
+
+	fprintf(file,
+		"Pending flushes (fsync) log: " ULINTPF
+		"; buffer pool: " ULINTPF "\n"
+		ULINTPF " OS file reads, "
+		ULINTPF " OS file writes, "
+		ULINTPF " OS fsyncs\n",
+		fil_n_pending_log_flushes,
+		fil_n_pending_tablespace_flushes,
+		os_n_file_reads,
+		os_n_file_writes,
+		os_n_fsyncs);
+
+	const ulint	pending_reads
+		= ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
+	const ulint	pending_writes
+		= ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
+
+	if (pending_reads != 0 || pending_writes != 0) {
+		fprintf(file,
+			ULINTPF " pending reads, " ULINTPF " pending writes\n",
+			pending_reads, pending_writes);
+	}
+
+	/* Average read size since the last printout; 0 if nothing was
+	read in the meantime. */
+	const double	mean_read_size
+		= (os_n_file_reads == os_n_file_reads_old)
+		? 0.0
+		: (double) os_bytes_read_since_printout
+		  / (os_n_file_reads - os_n_file_reads_old);
+
+	fprintf(file,
+		"%.2f reads/s, %lu avg bytes/read,"
+		" %.2f writes/s, %.2f fsyncs/s\n",
+		(os_n_file_reads - os_n_file_reads_old)
+		/ seconds,
+		(ulong) mean_read_size,
+		(os_n_file_writes - os_n_file_writes_old)
+		/ seconds,
+		(os_n_fsyncs - os_n_fsyncs_old)
+		/ seconds);
+
+	/* Start a new measuring interval. */
+	os_n_file_reads_old = os_n_file_reads;
+	os_n_file_writes_old = os_n_file_writes;
+	os_n_fsyncs_old = os_n_fsyncs;
+	os_bytes_read_since_printout = 0;
+
+	os_last_printout = now;
+}
+
+/**********************************************************************//**
+Starts a new measuring interval for the per-second averages: remembers
+the current values of the read, write and fsync counters, clears the
+count of bytes read, and stamps the interval start with the current
+time. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void)
+/*======================*/
+{
+	/* Snapshot of the cumulative counters. */
+	os_n_fsyncs_old = os_n_fsyncs;
+	os_n_file_writes_old = os_n_file_writes;
+	os_n_file_reads_old = os_n_file_reads;
+
+	os_bytes_read_since_printout = 0;
+
+	os_last_printout = time(NULL);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. The reserved-slot counters of all existing aio
+arrays (read, write, ibuf, log, sync) are summed, each one read under the
+mutex of its array. Outside read-only mode the write, ibuf and log arrays
+are required to exist.
+@return TRUE if all free */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void)
+/*=======================*/
+{
+	if (!srv_read_only_mode) {
+		/* These arrays are dereferenced by the i/o code in
+		read-write mode, so they must have been created. (The
+		assertions used to test for == 0 right before the pointers
+		were dereferenced.) */
+		ut_a(os_aio_write_array != 0);
+		ut_a(os_aio_ibuf_array != 0);
+		ut_a(os_aio_log_array != 0);
+	}
+
+	os_aio_array_t* const	arrays[] = {
+		os_aio_read_array,
+		os_aio_write_array,
+		os_aio_ibuf_array,
+		os_aio_log_array,
+		os_aio_sync_array
+	};
+
+	ulint	n_res = 0;
+
+	for (ulint i = 0; i < sizeof(arrays) / sizeof(arrays[0]); ++i) {
+		os_aio_array_t*	array = arrays[i];
+
+		/* An array that does not exist (for example the write,
+		ibuf and log arrays in read-only mode) has no slots. */
+		if (array == 0) {
+			continue;
+		}
+
+		os_mutex_enter(array->mutex);
+
+		n_res += array->n_reserved;
+
+		os_mutex_exit(array->mutex);
+	}
+
+	if (n_res == 0) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --cc support-files/mysql.server.sh
index 9c4d8e35ec5,28f0c2f041b..34f3ca4af34
--- a/support-files/mysql.server.sh
+++ b/support-files/mysql.server.sh
@@@ -25,7 -25,14 +25,6 @@@
# Description: MariaDB is a very fast and reliable SQL database engine.
### END INIT INFO
- # If you install MariaDB on some other places than @prefix@, then you
-# Prevent OpenSUSE's init scripts from calling systemd, so that
-# both 'bootstrap' and 'start' are handled entirely within this
-# script
-SYSTEMD_NO_WRAP=1
-
-# Prevent Debian's init scripts from calling systemctl
-_SYSTEMCTL_SKIP_REDIRECT=true
-
# have to do one of the following things for this script to work:
#
# - Run this script from within the MariaDB installation directory
@@@ -438,7 -452,7 +437,6 @@@ case "$mode" i
*)
# usage
basename=`basename "$0"`
- echo "Usage: $basename {start|stop|restart|reload|force-reload|status|configtest} [ MariaDB server options ]"
- echo "Usage: $basename {start|stop|restart|reload|force-reload|status|configtest|bootstrap} [ MySQL server options ]"
exit 1
;;
esac
1
0