[Commits] 648cf7176cc: Merge remote-tracking branch 'origin/5.5-galera' into 10.0-galera

7 May 2018

revision-id: 648cf7176cc95f697abd8b94e860c74768680298 (mariadb-galera-10.0.34-6-g648cf7176cc)
parent(s): 7b115181987fb88b97ef6d3d88bb16bdbc281e40 1ecd68d867ced1d00ebffdcedbf6bc97493f5067
author: Jan Lindström
committer: Jan Lindström
timestamp: 2018-05-07 13:49:14 +0300
message:

Merge remote-tracking branch 'origin/5.5-galera' into 10.0-galera


 .gitignore                                         |   1 +
 extra/yassl/src/handshake.cpp                      |  10 +
 include/heap.h                                     |   1 +
 include/my_valgrind.h                              |   2 +
 include/mysql_com.h                                |   2 +-
 include/sql_common.h                               |   2 +-
 mysql-test/mysql-test-run.pl                       |   2 +-
 mysql-test/r/connect_debug.result                  |   5 +
 mysql-test/r/ctype_ucs.result                      |  31 +++
 mysql-test/r/ctype_utf8mb4.result                  |  23 ++
 mysql-test/r/func_misc.result                      |  11 +
 mysql-test/r/join_outer.result                     |  18 +-
 mysql-test/r/join_outer_jcl6.result                |  18 +-
 mysql-test/r/mysqld--help.result                   |   4 +-
 mysql-test/r/parser.result                         |   7 +
 mysql-test/r/ps_qc_innodb.result                   |  23 ++
 mysql-test/r/subselect4.result                     |  35 ++-
 mysql-test/r/subselect_mat.result                  |  15 +
 mysql-test/r/view.result                           | 305 ++++++++++++--------
 mysql-test/suite/galera/disabled.def               |   2 +
 mysql-test/suite/galera/r/MW-416.result            | 114 ++++++++
 mysql-test/suite/galera/r/galera_defaults.result   |   2 +-
 .../suite/galera/r/galera_var_dirty_reads.result   |   2 -
 mysql-test/suite/galera/t/MW-416.test              | 134 +++++++++
 .../suite/galera/t/galera_concurrent_ctas.test     |   8 +-
 .../suite/galera/t/galera_var_dirty_reads.test     |   9 +-
 .../suite/innodb/r/innodb-replace-debug.result     |   5 +-
 .../suite/innodb/t/innodb-replace-debug.test       |   5 +-
 mysql-test/suite/maria/dynamic.result              |   4 +
 mysql-test/suite/maria/dynamic.test                |   7 +
 .../suite/parts/r/partition_alter_maria.result     |   9 +
 .../suite/parts/t/partition_alter_maria.test       |   7 +
 mysql-test/suite/plugins/t/server_audit.test       |   2 +
 mysql-test/suite/wsrep/r/variables.result          |   7 +
 mysql-test/suite/wsrep/t/variables.test            |  14 +
 mysql-test/t/connect_debug.test                    |  12 +
 mysql-test/t/ctype_ucs.test                        |  22 ++
 mysql-test/t/ctype_utf8mb4.test                    |  19 ++
 mysql-test/t/func_misc.test                        |  12 +
 mysql-test/t/join_outer.test                       |  18 +-
 mysql-test/t/parser.test                           |   9 +
 mysql-test/t/ps_qc_innodb.test                     |  35 +++
 mysql-test/t/subselect4.test                       |  31 +++
 mysql-test/t/subselect_mat.test                    |  13 +
 mysql-test/t/view.test                             | 308 +++++++++++++--------
 mysys/lf_hash.c                                    |   9 +-
 mysys/mf_iocache.c                                 |   2 +-
 mysys/my_addr_resolve.c                            |   2 +-
 mysys/my_symlink.c                                 |   2 +-
 policy/selinux/mariadb-server.fc                   |   2 +-
 policy/selinux/mariadb-server.te                   |   2 +-
 scripts/CMakeLists.txt                             |  16 ++
 scripts/wsrep_sst_xtrabackup-v2.sh                 |   2 +-
 sql-common/client.c                                |  12 +-
 sql/event_data_objects.cc                          |  21 +-
 sql/event_db_repository.cc                         |   5 +-
 sql/events.cc                                      |  15 +
 sql/handler.cc                                     |   6 +
 sql/item_cmpfunc.h                                 |   5 +
 sql/item_func.h                                    |   4 +-
 sql/item_strfunc.h                                 |   2 +
 sql/item_subselect.cc                              |   2 +-
 sql/log.cc                                         |   8 +-
 sql/log_event.cc                                   |  69 ++++-
 sql/log_event_old.cc                               |   3 +-
 sql/mysqld.cc                                      |   5 +-
 sql/mysqld.h                                       |   3 +-
 sql/opt_subselect.cc                               |   9 +-
 sql/slave.cc                                       |   2 +-
 sql/sp.cc                                          |   6 +-
 sql/sql_acl.cc                                     |   3 +-
 sql/sql_admin.cc                                   |   2 +-
 sql/sql_base.h                                     |   2 +
 sql/sql_cache.cc                                   |   1 +
 sql/sql_class.cc                                   |   7 +-
 sql/sql_class.h                                    |   1 +
 sql/sql_insert.cc                                  |  29 ++
 sql/sql_parse.cc                                   |  14 +-
 sql/sql_partition.cc                               |   2 +-
 sql/sql_plugin.cc                                  |  33 ++-
 sql/sql_prepare.cc                                 |   6 +-
 sql/sql_priv.h                                     |   4 +-
 sql/sql_table.cc                                   |   2 +-
 sql/sql_trigger.cc                                 |   6 +
 sql/sql_truncate.cc                                |   2 +-
 sql/sql_update.cc                                  |   2 +
 sql/sql_view.cc                                    |   5 +
 sql/sql_yacc.yy                                    |   5 +
 sql/sys_vars.cc                                    |   6 +-
 sql/table.cc                                       |  16 +-
 sql/table.h                                        |   8 +-
 sql/wsrep_hton.cc                                  |  38 ++-
 sql/wsrep_mysqld.cc                                |  65 ++---
 sql/wsrep_mysqld.h                                 |  13 +
 sql/wsrep_priv.h                                   |   2 +-
 sql/wsrep_sst.cc                                   |   1 -
 sql/wsrep_thd.cc                                   |   2 +-
 sql/wsrep_utils.cc                                 |   1 -
 storage/heap/_check.c                              |   2 +-
 storage/heap/ha_heap.cc                            |  11 +-
 storage/heap/hp_create.c                           |   8 +-
 storage/heap/hp_delete.c                           |   2 +-
 storage/heap/hp_rrnd.c                             |   4 +-
 storage/heap/hp_rsame.c                            |   2 +-
 storage/heap/hp_scan.c                             |   2 +-
 storage/heap/hp_write.c                            |   4 +-
 storage/innobase/handler/ha_innodb.cc              |  73 ++---
 storage/innobase/os/os0file.cc                     |  10 +-
 storage/maria/ma_control_file.c                    |   2 +-
 storage/maria/ma_dynrec.c                          |  10 +-
 storage/maria/ma_loghandler.c                      |   6 +-
 storage/maria/ma_open.c                            |   8 +-
 storage/myisam/mi_open.c                           |   8 +-
 storage/xtradb/handler/ha_innodb.cc                |  20 +-
 storage/xtradb/log/log0online.cc                   |   3 -
 storage/xtradb/os/os0file.cc                       |  12 +-
 support-files/mysql.server.sh                      |   2 -
 support-files/wsrep.cnf.sh                         |   3 +
 118 files changed, 1536 insertions(+), 473 deletions(-)

diff --cc mysql-test/r/ctype_ucs.result
index 6520694a804,1c9e31d3a06..59d88414cab

--- a/mysql-test/r/ctype_ucs.result
+++ b/mysql-test/r/ctype_ucs.result
@@@ -4590,1014 -4397,36 +4590,1045 @@@ Field	Type	Null	Key	Default	Extr
  c1	mediumtext	YES		NULL	
  DROP TABLE t1;
  #
+ # MDEV-15624 Changing the default character set to utf8mb4 changes query evaluation in a very surprising way
+ #
+ SET NAMES utf8;
+ CREATE TABLE t1 (id INT);
+ INSERT INTO t1 VALUES (1),(2),(3);
+ SELECT COUNT(DISTINCT c) FROM (SELECT id, REPLACE(uuid_short(), '0', CAST('o' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1;
+ COUNT(DISTINCT c)
+ 3
+ SELECT DISTINCT REPLACE(uuid_short(), '0', CAST('o' AS CHAR CHARACTER SET ucs2)) AS c FROM t1;
+ c
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ SELECT COUNT(DISTINCT c) FROM (SELECT id, INSERT(uuid_short(), 1, 1, CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1;
+ COUNT(DISTINCT c)
+ 3
+ SELECT DISTINCT INSERT(uuid_short(), 1, 1, CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1;
+ c
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ SELECT COUNT(DISTINCT c) FROM (SELECT id, CONCAT(uuid_short(), CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1;
+ COUNT(DISTINCT c)
+ 3
+ SELECT DISTINCT CONCAT(uuid_short(), CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1;
+ c
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ xxxxxxxxxxxxxxxxx
+ DROP TABLE t1;
+ #
  # End of 5.5 tests
  #
 +#
 +# Start of 5.6 tests
 +#
 +#
 +# WL#3664 WEIGHT_STRING
 +#
 +set collation_connection=ucs2_general_ci;
 +select @@collation_connection;
 +@@collation_connection
 +ucs2_general_ci
 +CREATE TABLE t1 AS SELECT 'a' AS a;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `a` varchar(1) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(2) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a)) FROM t1;
 +HEX(WEIGHT_STRING(a))
 +0041
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +0041
 +DROP TABLE t2;
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a',5) AS a;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `a` varchar(5) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(10) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a)) FROM t1;
 +HEX(WEIGHT_STRING(a))
 +00410041004100410041
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +00410041004100410041
 +DROP TABLE t2;
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(3)) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(6) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a AS CHAR(3))) FROM t1;
 +HEX(WEIGHT_STRING(a AS CHAR(3)))
 +004100410041
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +004100410041
 +DROP TABLE t2;
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(10)) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(20) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a AS CHAR(10))) FROM t1;
 +HEX(WEIGHT_STRING(a AS CHAR(10)))
 +0041004100410041004100200020002000200020
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +0041004100410041004100200020002000200020
 +DROP TABLE t2;
 +DROP TABLE t1;
 +select hex(weight_string('a'));
 +hex(weight_string('a'))
 +0041
 +select hex(weight_string('A'));
 +hex(weight_string('A'))
 +0041
 +select hex(weight_string('abc'));
 +hex(weight_string('abc'))
 +004100420043
 +select hex(weight_string('abc' as char(2)));
 +hex(weight_string('abc' as char(2)))
 +00410042
 +select hex(weight_string('abc' as char(3)));
 +hex(weight_string('abc' as char(3)))
 +004100420043
 +select hex(weight_string('abc' as char(5)));
 +hex(weight_string('abc' as char(5)))
 +00410042004300200020
 +select hex(weight_string('abc', 1, 2, 0xC0));
 +hex(weight_string('abc', 1, 2, 0xC0))
 +00
 +select hex(weight_string('abc', 2, 2, 0xC0));
 +hex(weight_string('abc', 2, 2, 0xC0))
 +0041
 +select hex(weight_string('abc', 3, 2, 0xC0));
 +hex(weight_string('abc', 3, 2, 0xC0))
 +004100
 +select hex(weight_string('abc', 4, 2, 0xC0));
 +hex(weight_string('abc', 4, 2, 0xC0))
 +00410042
 +select hex(weight_string('abc', 5, 2, 0xC0));
 +hex(weight_string('abc', 5, 2, 0xC0))
 +0041004200
 +select hex(weight_string('abc',25, 2, 0xC0));
 +hex(weight_string('abc',25, 2, 0xC0))
 +00410042002000200020002000200020002000200020002000
 +select hex(weight_string('abc', 1, 3, 0xC0));
 +hex(weight_string('abc', 1, 3, 0xC0))
 +00
 +select hex(weight_string('abc', 2, 3, 0xC0));
 +hex(weight_string('abc', 2, 3, 0xC0))
 +0041
 +select hex(weight_string('abc', 3, 3, 0xC0));
 +hex(weight_string('abc', 3, 3, 0xC0))
 +004100
 +select hex(weight_string('abc', 4, 3, 0xC0));
 +hex(weight_string('abc', 4, 3, 0xC0))
 +00410042
 +select hex(weight_string('abc', 5, 3, 0xC0));
 +hex(weight_string('abc', 5, 3, 0xC0))
 +0041004200
 +select hex(weight_string('abc',25, 3, 0xC0));
 +hex(weight_string('abc',25, 3, 0xC0))
 +00410042004300200020002000200020002000200020002000
 +select hex(weight_string('abc', 1, 4, 0xC0));
 +hex(weight_string('abc', 1, 4, 0xC0))
 +00
 +select hex(weight_string('abc', 2, 4, 0xC0));
 +hex(weight_string('abc', 2, 4, 0xC0))
 +0041
 +select hex(weight_string('abc', 3, 4, 0xC0));
 +hex(weight_string('abc', 3, 4, 0xC0))
 +004100
 +select hex(weight_string('abc', 4, 4, 0xC0));
 +hex(weight_string('abc', 4, 4, 0xC0))
 +00410042
 +select hex(weight_string('abc', 5, 4, 0xC0));
 +hex(weight_string('abc', 5, 4, 0xC0))
 +0041004200
 +select hex(weight_string('abc',25, 4, 0xC0));
 +hex(weight_string('abc',25, 4, 0xC0))
 +00410042004300200020002000200020002000200020002000
 +select @@collation_connection;
 +@@collation_connection
 +ucs2_general_ci
 +select hex(weight_string(cast(_latin1 0x80 as char)));
 +hex(weight_string(cast(_latin1 0x80 as char)))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char)));
 +hex(weight_string(cast(_latin1 0x808080 as char)))
 +20AC20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char) as char(2)));
 +hex(weight_string(cast(_latin1 0x808080 as char) as char(2)))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char) as char(3)));
 +hex(weight_string(cast(_latin1 0x808080 as char) as char(3)))
 +20AC20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char) as char(5)));
 +hex(weight_string(cast(_latin1 0x808080 as char) as char(5)))
 +20AC20AC20AC00200020
 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0))
 +20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0))
 +20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0))
 +20AC20AC00
 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0))
 +20AC20AC002000200020002000200020002000200020002000
 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0))
 +20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0))
 +20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0))
 +20AC20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0))
 +20AC20AC20AC00200020002000200020002000200020002000
 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0))
 +20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0))
 +20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0))
 +20AC20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0))
 +20AC20AC20AC00200020002000200020002000200020002000
 +select @@collation_connection;
 +@@collation_connection
 +ucs2_general_ci
 +select hex(weight_string('a' LEVEL 1));
 +hex(weight_string('a' LEVEL 1))
 +0041
 +select hex(weight_string('A' LEVEL 1));
 +hex(weight_string('A' LEVEL 1))
 +0041
 +select hex(weight_string('abc' LEVEL 1));
 +hex(weight_string('abc' LEVEL 1))
 +004100420043
 +select hex(weight_string('abc' as char(2) LEVEL 1));
 +hex(weight_string('abc' as char(2) LEVEL 1))
 +00410042
 +select hex(weight_string('abc' as char(3) LEVEL 1));
 +hex(weight_string('abc' as char(3) LEVEL 1))
 +004100420043
 +select hex(weight_string('abc' as char(5) LEVEL 1));
 +hex(weight_string('abc' as char(5) LEVEL 1))
 +00410042004300200020
 +select hex(weight_string('abc' as char(5) LEVEL 1 REVERSE));
 +hex(weight_string('abc' as char(5) LEVEL 1 REVERSE))
 +20002000430042004100
 +select hex(weight_string('abc' as char(5) LEVEL 1 DESC));
 +hex(weight_string('abc' as char(5) LEVEL 1 DESC))
 +FFBEFFBDFFBCFFDFFFDF
 +select hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE));
 +hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE))
 +DFFFDFFFBCFFBDFFBEFF
 +set collation_connection=ucs2_bin;
 +select @@collation_connection;
 +@@collation_connection
 +ucs2_bin
 +CREATE TABLE t1 AS SELECT 'a' AS a;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `a` varchar(1) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(2) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a)) FROM t1;
 +HEX(WEIGHT_STRING(a))
 +0061
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +0061
 +DROP TABLE t2;
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a',5) AS a;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `a` varchar(5) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(10) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a)) FROM t1;
 +HEX(WEIGHT_STRING(a))
 +00610061006100610061
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +00610061006100610061
 +DROP TABLE t2;
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(3)) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(6) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a AS CHAR(3))) FROM t1;
 +HEX(WEIGHT_STRING(a AS CHAR(3)))
 +006100610061
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +006100610061
 +DROP TABLE t2;
 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(10)) AS ws FROM t1;
 +SHOW CREATE TABLE t2;
 +Table	Create Table
 +t2	CREATE TABLE `t2` (
 +  `ws` varbinary(20) DEFAULT NULL
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +SELECT HEX(WEIGHT_STRING(a AS CHAR(10))) FROM t1;
 +HEX(WEIGHT_STRING(a AS CHAR(10)))
 +0061006100610061006100200020002000200020
 +SELECT HEX(ws) FROM t2;
 +HEX(ws)
 +0061006100610061006100200020002000200020
 +DROP TABLE t2;
 +DROP TABLE t1;
 +select hex(weight_string('a'));
 +hex(weight_string('a'))
 +0061
 +select hex(weight_string('A'));
 +hex(weight_string('A'))
 +0041
 +select hex(weight_string('abc'));
 +hex(weight_string('abc'))
 +006100620063
 +select hex(weight_string('abc' as char(2)));
 +hex(weight_string('abc' as char(2)))
 +00610062
 +select hex(weight_string('abc' as char(3)));
 +hex(weight_string('abc' as char(3)))
 +006100620063
 +select hex(weight_string('abc' as char(5)));
 +hex(weight_string('abc' as char(5)))
 +00610062006300200020
 +select hex(weight_string('abc', 1, 2, 0xC0));
 +hex(weight_string('abc', 1, 2, 0xC0))
 +00
 +select hex(weight_string('abc', 2, 2, 0xC0));
 +hex(weight_string('abc', 2, 2, 0xC0))
 +0061
 +select hex(weight_string('abc', 3, 2, 0xC0));
 +hex(weight_string('abc', 3, 2, 0xC0))
 +006100
 +select hex(weight_string('abc', 4, 2, 0xC0));
 +hex(weight_string('abc', 4, 2, 0xC0))
 +00610062
 +select hex(weight_string('abc', 5, 2, 0xC0));
 +hex(weight_string('abc', 5, 2, 0xC0))
 +0061006200
 +select hex(weight_string('abc',25, 2, 0xC0));
 +hex(weight_string('abc',25, 2, 0xC0))
 +00610062002000200020002000200020002000200020002000
 +select hex(weight_string('abc', 1, 3, 0xC0));
 +hex(weight_string('abc', 1, 3, 0xC0))
 +00
 +select hex(weight_string('abc', 2, 3, 0xC0));
 +hex(weight_string('abc', 2, 3, 0xC0))
 +0061
 +select hex(weight_string('abc', 3, 3, 0xC0));
 +hex(weight_string('abc', 3, 3, 0xC0))
 +006100
 +select hex(weight_string('abc', 4, 3, 0xC0));
 +hex(weight_string('abc', 4, 3, 0xC0))
 +00610062
 +select hex(weight_string('abc', 5, 3, 0xC0));
 +hex(weight_string('abc', 5, 3, 0xC0))
 +0061006200
 +select hex(weight_string('abc',25, 3, 0xC0));
 +hex(weight_string('abc',25, 3, 0xC0))
 +00610062006300200020002000200020002000200020002000
 +select hex(weight_string('abc', 1, 4, 0xC0));
 +hex(weight_string('abc', 1, 4, 0xC0))
 +00
 +select hex(weight_string('abc', 2, 4, 0xC0));
 +hex(weight_string('abc', 2, 4, 0xC0))
 +0061
 +select hex(weight_string('abc', 3, 4, 0xC0));
 +hex(weight_string('abc', 3, 4, 0xC0))
 +006100
 +select hex(weight_string('abc', 4, 4, 0xC0));
 +hex(weight_string('abc', 4, 4, 0xC0))
 +00610062
 +select hex(weight_string('abc', 5, 4, 0xC0));
 +hex(weight_string('abc', 5, 4, 0xC0))
 +0061006200
 +select hex(weight_string('abc',25, 4, 0xC0));
 +hex(weight_string('abc',25, 4, 0xC0))
 +00610062006300200020002000200020002000200020002000
 +select @@collation_connection;
 +@@collation_connection
 +ucs2_bin
 +select hex(weight_string(cast(_latin1 0x80 as char)));
 +hex(weight_string(cast(_latin1 0x80 as char)))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char)));
 +hex(weight_string(cast(_latin1 0x808080 as char)))
 +20AC20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char) as char(2)));
 +hex(weight_string(cast(_latin1 0x808080 as char) as char(2)))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char) as char(3)));
 +hex(weight_string(cast(_latin1 0x808080 as char) as char(3)))
 +20AC20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char) as char(5)));
 +hex(weight_string(cast(_latin1 0x808080 as char) as char(5)))
 +20AC20AC20AC00200020
 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0))
 +20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0))
 +20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0))
 +20AC20AC00
 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0))
 +20AC20AC002000200020002000200020002000200020002000
 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0))
 +20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0))
 +20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0))
 +20AC20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0))
 +20AC20AC20AC00200020002000200020002000200020002000
 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0))
 +20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0))
 +20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0))
 +20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0))
 +20AC20AC
 +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0))
 +20AC20AC20
 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0));
 +hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0))
 +20AC20AC20AC00200020002000200020002000200020002000
 +select @@collation_connection;
 +@@collation_connection
 +ucs2_bin
 +select hex(weight_string('a' LEVEL 1));
 +hex(weight_string('a' LEVEL 1))
 +0061
 +select hex(weight_string('A' LEVEL 1));
 +hex(weight_string('A' LEVEL 1))
 +0041
 +select hex(weight_string('abc' LEVEL 1));
 +hex(weight_string('abc' LEVEL 1))
 +006100620063
 +select hex(weight_string('abc' as char(2) LEVEL 1));
 +hex(weight_string('abc' as char(2) LEVEL 1))
 +00610062
 +select hex(weight_string('abc' as char(3) LEVEL 1));
 +hex(weight_string('abc' as char(3) LEVEL 1))
 +006100620063
 +select hex(weight_string('abc' as char(5) LEVEL 1));
 +hex(weight_string('abc' as char(5) LEVEL 1))
 +00610062006300200020
 +select hex(weight_string('abc' as char(5) LEVEL 1 REVERSE));
 +hex(weight_string('abc' as char(5) LEVEL 1 REVERSE))
 +20002000630062006100
 +select hex(weight_string('abc' as char(5) LEVEL 1 DESC));
 +hex(weight_string('abc' as char(5) LEVEL 1 DESC))
 +FF9EFF9DFF9CFFDFFFDF
 +select hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE));
 +hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE))
 +DFFFDFFF9CFF9DFF9EFF
 +#
 +# Bug #36418 Character sets: crash if char(256 using utf32)
 +#
 +select hex(char(0x01 using ucs2));
 +hex(char(0x01 using ucs2))
 +0001
 +select hex(char(0x0102 using ucs2));
 +hex(char(0x0102 using ucs2))
 +0102
 +select hex(char(0x010203 using ucs2));
 +hex(char(0x010203 using ucs2))
 +00010203
 +select hex(char(0x01020304 using ucs2));
 +hex(char(0x01020304 using ucs2))
 +01020304
 +#
 +# Bug#10094 Displays wrong error message for UNIQUE key index on CHAR(255) Unicode datatype
 +#
 +CREATE TABLE t1 (f1 CHAR(255) unicode);
 +INSERT INTO t1 values ('abc'),('bcd'),('abc');
 +ALTER TABLE t1 ADD UNIQUE Index_1 (f1);
 +ERROR 23000: Duplicate entry 'abc' for key 'Index_1'
 +DROP TABLE t1;
 +#
 +# Test how character set works with date/time
 +#
 +SET collation_connection=ucs2_general_ci;
 +#
 +# Bug#32390 Character sets: casting utf32 to/from date doesn't work
 +#
 +CREATE TABLE t1 AS SELECT repeat('a',20) AS s1 LIMIT 0;
 +SET time_zone=_latin1'+03:00';
 +SET timestamp=1216359724;
 +INSERT INTO t1 VALUES (current_date);
 +INSERT INTO t1 VALUES (current_time);
 +INSERT INTO t1 VALUES (current_timestamp);
 +SELECT s1, hex(s1) FROM t1;
 +s1	hex(s1)
 +2008-07-18	0032003000300038002D00300037002D00310038
 +08:42:04	00300038003A00340032003A00300034
 +2008-07-18 08:42:04	0032003000300038002D00300037002D00310038002000300038003A00340032003A00300034
 +DROP TABLE t1;
 +SET timestamp=0;
 +SET time_zone=default;
 +#
 +# MDEV-5298 Illegal mix of collations on timestamp
 +#
 +SELECT CHARSET('2013-11-15 00:41:28' - INTERVAL 7 DAY);
 +CHARSET('2013-11-15 00:41:28' - INTERVAL 7 DAY)
 +ucs2
 +SELECT COERCIBILITY('2013-11-15 00:41:28' - INTERVAL 7 DAY);
 +COERCIBILITY('2013-11-15 00:41:28' - INTERVAL 7 DAY)
 +4
 +SELECT CHARSET(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY);
 +CHARSET(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)
 +binary
 +SELECT COERCIBILITY(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY);
 +COERCIBILITY(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)
 +5
 +SELECT CHARSET(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY));
 +CHARSET(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY))
 +ucs2
 +SELECT COERCIBILITY(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY));
 +COERCIBILITY(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY))
 +4
 +SELECT CHARSET(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY));
 +CHARSET(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY))
 +ucs2
 +SELECT COERCIBILITY(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY));
 +COERCIBILITY(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY))
 +4
 +SELECT CHARSET(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY));
 +CHARSET(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +ucs2
 +SELECT COERCIBILITY(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY));
 +COERCIBILITY(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +4
 +SELECT HEX(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY));
 +HEX(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
 +SELECT CHARSET(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY));
 +CHARSET(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +ucs2
 +SELECT COERCIBILITY(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY));
 +COERCIBILITY(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +4
 +SELECT HEX(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY));
 +HEX(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
 +CREATE TABLE t1 AS SELECT REPEAT('a', 64) AS a LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `a` varchar(64) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('');
 +SELECT CHARSET(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
 +CHARSET(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +ucs2
 +SELECT COERCIBILITY(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
 +COERCIBILITY(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +2
 +SELECT HEX(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
 +HEX(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
 +SELECT CHARSET(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
 +CHARSET(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +ucs2
 +SELECT COERCIBILITY(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
 +COERCIBILITY(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +2
 +SELECT HEX(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1;
 +HEX(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY))
 +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030
 +DROP TABLE t1;
 +CREATE TABLE t1 (t TIMESTAMP NOT NULL);
 +INSERT INTO t1 VALUES ('2001-01-01 00:00:00');
 +SELECT * FROM t1 WHERE t < '2013-11-15 00:41:28' - INTERVAL 7 DAY;
 +t
 +2001-01-01 00:00:00
 +SELECT * FROM t1 WHERE t = '2001-01-08 00:00:00' - INTERVAL 7 DAY;
 +t
 +2001-01-01 00:00:00
 +SELECT * FROM t1 WHERE t < CONCAT('2013-11-15 00:41:28',LEFT(RAND(),0)) - INTERVAL 7 DAY;
 +t
 +2001-01-01 00:00:00
 +SELECT * FROM t1 WHERE t = CONCAT('2001-01-08 00:00:00',LEFT(RAND(),0)) - INTERVAL 7 DAY;
 +t
 +2001-01-01 00:00:00
 +SELECT * FROM t1 WHERE t < TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY;
 +t
 +2001-01-01 00:00:00
 +SELECT * FROM t1 WHERE t = TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY;
 +t
 +2001-01-01 00:00:00
 +DROP TABLE t1;
 +SET NAMES latin1;
 +#
 +# WL#4013 Unicode german2 collation
 +#
 +SET collation_connection=ucs2_german2_ci;
 +"BEGIN ctype_german.inc"
 +drop table if exists t1;
 +create table t1 as select repeat(' ', 64) as s1;
 +select collation(s1) from t1;
 +collation(s1)
 +ucs2_german2_ci
 +delete from t1;
 +INSERT INTO t1 VALUES ('ud'),('uf');
 +INSERT INTO t1 VALUES ('od'),('of');
 +INSERT INTO t1 VALUES ('e');
 +INSERT INTO t1 VALUES ('ad'),('af');
 +insert into t1 values ('a'),('ae'),(_latin1 0xE4);
 +insert into t1 values ('o'),('oe'),(_latin1 0xF6);
 +insert into t1 values ('s'),('ss'),(_latin1 0xDF);
 +insert into t1 values ('u'),('ue'),(_latin1 0xFC);
 +INSERT INTO t1 VALUES (_latin1 0xE6), (_latin1 0xC6);
 +INSERT INTO t1 VALUES (_latin1 0x9C), (_latin1 0x8C);
 +select s1, hex(s1) from t1 order by s1, binary s1;
 +s1	hex(s1)
 +a	0061
 +ad	00610064
 +ae	00610065
 +Æ	00C6
 +ä	00E4
 +æ	00E6
 +af	00610066
 +e	0065
 +o	006F
 +od	006F0064
 +oe	006F0065
 +ö	00F6
 +Œ	0152
 +œ	0153
 +of	006F0066
 +s	0073
 +ss	00730073
 +ß	00DF
 +u	0075
 +ud	00750064
 +ue	00750065
 +ü	00FC
 +uf	00750066
 +select group_concat(s1 order by binary s1) from t1 group by s1;
 +group_concat(s1 order by binary s1)
 +a
 +ad
 +ae,Æ,ä,æ
 +af
 +e
 +o
 +od
 +oe,ö,Œ,œ
 +of
 +s
 +ss,ß
 +u
 +ud
 +ue,ü
 +uf
 +SELECT s1, hex(s1), hex(weight_string(s1)) FROM t1 ORDER BY s1, BINARY(s1);
 +s1	hex(s1)	hex(weight_string(s1))
 +a	0061	0E33
 +ad	00610064	0E330E6D
 +ae	00610065	0E330E8B
 +Æ	00C6	0E330E8B
 +ä	00E4	0E330E8B
 +æ	00E6	0E330E8B
 +af	00610066	0E330EB9
 +e	0065	0E8B
 +o	006F	0F82
 +od	006F0064	0F820E6D
 +oe	006F0065	0F820E8B
 +ö	00F6	0F820E8B
 +Œ	0152	0F820E8B
 +œ	0153	0F820E8B
 +of	006F0066	0F820EB9
 +s	0073	0FEA
 +ss	00730073	0FEA0FEA
 +ß	00DF	0FEA0FEA
 +u	0075	101F
 +ud	00750064	101F0E6D
 +ue	00750065	101F0E8B
 +ü	00FC	101F0E8B
 +uf	00750066	101F0EB9
 +SELECT s1, hex(s1) FROM t1 WHERE s1='ae' ORDER BY s1, BINARY(s1);
 +s1	hex(s1)
 +ae	00610065
 +Æ	00C6
 +ä	00E4
 +æ	00E6
 +drop table t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a, 1 AS b LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `a` varchar(1) CHARACTER SET ucs2 COLLATE ucs2_german2_ci NOT NULL DEFAULT '',
 +  `b` int(1) NOT NULL DEFAULT '0'
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('s',0),(_latin1 0xDF,1);
 +SELECT * FROM t1 ORDER BY a, b;
 +a	b
 +s	0
 +ß	1
 +SELECT * FROM t1 ORDER BY a DESC, b;
 +a	b
 +ß	1
 +s	0
 +SELECT * FROM t1 ORDER BY CONCAT(a), b;
 +a	b
 +s	0
 +ß	1
 +SELECT * FROM t1 ORDER BY CONCAT(a) DESC, b;
 +a	b
 +ß	1
 +s	0
 +DROP TABLE t1;
 +"END ctype_german.inc"
 +#
 +# Bug#59145 valgrind warnings for uninitialized values in my_strtoll10_mb2
 +#
 +SET NAMES latin1;
 +SELECT CONVERT(CHAR(NULL USING ucs2), UNSIGNED);
 +CONVERT(CHAR(NULL USING ucs2), UNSIGNED)
 +0
 +Warnings:
 +Warning	1292	Truncated incorrect INTEGER value: ''
 +DO IFNULL(CHAR(NULL USING ucs2), '');
 +DO CAST(CONVERT('' USING ucs2) AS UNSIGNED);
 +Warnings:
 +Warning	1292	Truncated incorrect INTEGER value: ''
 +#
 +# Test error message for conversion using different charset
 +# 
 +CREATE TABLE t1 (a DECIMAL(2,0));
 +SET sql_mode='strict_all_tables';
 +INSERT INTO t1 VALUES (CONVERT('9e99999999' USING ucs2));
 +ERROR 22007: Incorrect decimal value: '9e99999999' for column 'a' at row 1
 +SET sql_mode=DEFAULT;
 +INSERT INTO t1 VALUES (CONVERT('aaa' USING ucs2));
 +Warnings:
 +Warning	1366	Incorrect decimal value: 'aaa' for column 'a' at row 1
 +DROP TABLE t1;
 +#
 +# End of 5.6 tests
 +#
 +#
 +# Start of 10.0 tests
 +#
 +SET NAMES latin1, collation_connection=ucs2_bin;
 +#
 +# MDEV-7149 Constant condition propagation erroneously applied for LIKE
 +#
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('a'),('a ');
 +SELECT * FROM t1 WHERE CONCAT(c1)='a';
 +c1
 +a
 +a 
 +SELECT * FROM t1 WHERE CONCAT(c1) LIKE 'a ';
 +c1
 +a 
 +SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
 +c1
 +a 
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where ((concat(`test`.`t1`.`c1`) = 'a') and (concat(`test`.`t1`.`c1`) like 'a '))
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('a'),('a ');
 +SELECT * FROM t1 WHERE 'a'=CONCAT(c1);
 +c1
 +a
 +a 
 +SELECT * FROM t1 WHERE 'a ' LIKE CONCAT(c1);
 +c1
 +a 
 +SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
 +c1
 +a 
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('a' = concat(`test`.`t1`.`c1`)) and ('a ' like concat(`test`.`t1`.`c1`)))
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('%'),('% ');
 +SELECT * FROM t1 WHERE '% '=CONCAT(c1);
 +c1
 +%
 +% 
 +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
 +c1
 +%
 +SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +c1
 +%
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('% ' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('%'),('% ');
 +SELECT * FROM t1 WHERE '%'=CONCAT(c1);
 +c1
 +%
 +% 
 +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
 +c1
 +%
 +SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +c1
 +%
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('%' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
 +DROP TABLE t1;
 +SET NAMES latin1, collation_connection=ucs2_general_ci;
 +#
 +# MDEV-7149 Constant condition propagation erroneously applied for LIKE
 +#
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('a'),('a ');
 +SELECT * FROM t1 WHERE CONCAT(c1)='a';
 +c1
 +a
 +a 
 +SELECT * FROM t1 WHERE CONCAT(c1) LIKE 'a ';
 +c1
 +a 
 +SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
 +c1
 +a 
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a ';
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where ((concat(`test`.`t1`.`c1`) = 'a') and (concat(`test`.`t1`.`c1`) like 'a '))
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('a'),('a ');
 +SELECT * FROM t1 WHERE 'a'=CONCAT(c1);
 +c1
 +a
 +a 
 +SELECT * FROM t1 WHERE 'a ' LIKE CONCAT(c1);
 +c1
 +a 
 +SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
 +c1
 +a 
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1);
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('a' = concat(`test`.`t1`.`c1`)) and ('a ' like concat(`test`.`t1`.`c1`)))
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('%'),('% ');
 +SELECT * FROM t1 WHERE '% '=CONCAT(c1);
 +c1
 +%
 +% 
 +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
 +c1
 +%
 +SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +c1
 +%
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('% ' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
 +DROP TABLE t1;
 +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0;
 +SHOW CREATE TABLE t1;
 +Table	Create Table
 +t1	CREATE TABLE `t1` (
 +  `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT ''
 +) ENGINE=MyISAM DEFAULT CHARSET=latin1
 +INSERT INTO t1 VALUES ('%'),('% ');
 +SELECT * FROM t1 WHERE '%'=CONCAT(c1);
 +c1
 +%
 +% 
 +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1);
 +c1
 +%
 +SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +c1
 +%
 +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1);
 +id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 +1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	2	100.00	Using where
 +Warnings:
 +Note	1003	select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('%' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`)))
 +DROP TABLE t1;
 +SET NAMES latin1;
 +#
 +# MDEV-6661 PI() does not work well in UCS2/UTF16/UTF32 context
 +#
 +SELECT CONCAT(CONVERT('pi=' USING ucs2),PI()) AS PI;
 +PI
 +pi=3.141593
 +#
 +# MDEV-6695 Bad column name for UCS2 string literals
 +#
 +SET NAMES utf8, character_set_connection=ucs2;
 +SELECT 'a','aa';
 +a	aa
 +a	aa
 +#
 +# MDEV-10306 Wrong results with combination of CONCAT, SUBSTR and CONVERT in subquery
 +#
 +SET NAMES utf8, character_set_connection=ucs2;
 +SET @save_optimizer_switch=@@optimizer_switch;
 +SET optimizer_switch=_utf8'derived_merge=on';
 +CREATE TABLE t1 (t VARCHAR(10) CHARSET latin1);
 +INSERT INTO t1 VALUES('abcdefghi');
 +SET NAMES utf8, character_set_connection=ucs2;
 +SELECT CONCAT(t2,'-',t2) c2 FROM (SELECT HEX(t) t2 FROM t1) sub;
 +c2
 +616263646566676869-616263646566676869
 +SELECT CONCAT(t2,'-',t2) c2 FROM (SELECT TO_BASE64(t) t2 FROM t1) sub;
 +c2
 +YWJjZGVmZ2hp-YWJjZGVmZ2hp
 +DROP TABLE t1;
 +SET optimizer_switch=@save_optimizer_switch;
 +#
 +# End of 10.0 tests
 +#
diff --cc mysql-test/r/func_misc.result
index d54a70cab45,66e3cfd4ff4..ea3f57c6204
--- a/mysql-test/r/func_misc.result
+++ b/mysql-test/r/func_misc.result
@@@ -571,6 -571,20 +571,17 @@@ AND 57813X540X1723 = 'Test'
  N	AVG
  0	NULL
  drop table t1;
+ #
+ # MDEV-15630 uuid() function evaluates at wrong time in query
+ #
+ CREATE TABLE t1 (id INT);
+ INSERT INTO t1 VALUES (1),(2),(3);
+ SELECT COUNT(1), UUID() as uid FROM t1 GROUP BY uid;
+ COUNT(1)	uid
+ 1	xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ 1	xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ 1	xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ DROP TABLE t1;
 -#
 -# End of 5.5 tests
 -#
  SELECT NAME_CONST('a', -(1 OR 2)) OR 1;
  ERROR HY000: Incorrect arguments to NAME_CONST
  SELECT NAME_CONST('a', -(1 AND 2)) OR 1;
diff --cc mysql-test/r/view.result
index 5a51ea85f55,7fc3c48c3a0..4e3146052e9
--- a/mysql-test/r/view.result
+++ b/mysql-test/r/view.result
@@@ -5644,6 -5535,203 +5536,203 @@@ View	Create View	character_set_client	c
  v1	CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select group_concat(`t1`.`str` separator '\\') AS `GROUP_CONCAT(str  SEPARATOR  '\\')` from `t1`	latin1	latin1_swedish_ci
  drop view v1;
  drop table t1;
+ CREATE TABLE IF NOT EXISTS t0 (f0 INT);
+ CREATE TABLE IF NOT EXISTS t1 (f1 INT);
+ CREATE TABLE IF NOT EXISTS t2 (f2 INT);
+ CREATE TABLE IF NOT EXISTS t3 (f3 INT);
+ CREATE TABLE IF NOT EXISTS t4 (f4 INT);
+ CREATE TABLE IF NOT EXISTS t5 (f5 INT);
+ CREATE TABLE IF NOT EXISTS t6 (f6 INT);
+ CREATE TABLE IF NOT EXISTS t7 (f7 INT);
+ CREATE TABLE IF NOT EXISTS t8 (f8 INT);
+ CREATE TABLE IF NOT EXISTS t9 (f9 INT);
+ CREATE TABLE IF NOT EXISTS t10 (f10 INT);
+ CREATE TABLE IF NOT EXISTS t11 (f11 INT);
+ CREATE TABLE IF NOT EXISTS t12 (f12 INT);
+ CREATE TABLE IF NOT EXISTS t13 (f13 INT);
+ CREATE TABLE IF NOT EXISTS t14 (f14 INT);
+ CREATE TABLE IF NOT EXISTS t15 (f15 INT);
+ CREATE TABLE IF NOT EXISTS t16 (f16 INT);
+ CREATE TABLE IF NOT EXISTS t17 (f17 INT);
+ CREATE TABLE IF NOT EXISTS t18 (f18 INT);
+ CREATE TABLE IF NOT EXISTS t19 (f19 INT);
+ CREATE TABLE IF NOT EXISTS t20 (f20 INT);
+ CREATE TABLE IF NOT EXISTS t21 (f21 INT);
+ CREATE TABLE IF NOT EXISTS t22 (f22 INT);
+ CREATE TABLE IF NOT EXISTS t23 (f23 INT);
+ CREATE TABLE IF NOT EXISTS t24 (f24 INT);
+ CREATE TABLE IF NOT EXISTS t25 (f25 INT);
+ CREATE TABLE IF NOT EXISTS t26 (f26 INT);
+ CREATE TABLE IF NOT EXISTS t27 (f27 INT);
+ CREATE TABLE IF NOT EXISTS t28 (f28 INT);
+ CREATE TABLE IF NOT EXISTS t29 (f29 INT);
+ CREATE TABLE IF NOT EXISTS t30 (f30 INT);
+ CREATE TABLE IF NOT EXISTS t31 (f31 INT);
+ CREATE TABLE IF NOT EXISTS t32 (f32 INT);
+ CREATE TABLE IF NOT EXISTS t33 (f33 INT);
+ CREATE TABLE IF NOT EXISTS t34 (f34 INT);
+ CREATE TABLE IF NOT EXISTS t35 (f35 INT);
+ CREATE TABLE IF NOT EXISTS t36 (f36 INT);
+ CREATE TABLE IF NOT EXISTS t37 (f37 INT);
+ CREATE TABLE IF NOT EXISTS t38 (f38 INT);
+ CREATE TABLE IF NOT EXISTS t39 (f39 INT);
+ CREATE TABLE IF NOT EXISTS t40 (f40 INT);
+ CREATE TABLE IF NOT EXISTS t41 (f41 INT);
+ CREATE TABLE IF NOT EXISTS t42 (f42 INT);
+ CREATE TABLE IF NOT EXISTS t43 (f43 INT);
+ CREATE TABLE IF NOT EXISTS t44 (f44 INT);
+ CREATE TABLE IF NOT EXISTS t45 (f45 INT);
+ CREATE TABLE IF NOT EXISTS t46 (f46 INT);
+ CREATE TABLE IF NOT EXISTS t47 (f47 INT);
+ CREATE TABLE IF NOT EXISTS t48 (f48 INT);
+ CREATE TABLE IF NOT EXISTS t49 (f49 INT);
+ CREATE TABLE IF NOT EXISTS t50 (f50 INT);
+ CREATE TABLE IF NOT EXISTS t51 (f51 INT);
+ CREATE TABLE IF NOT EXISTS t52 (f52 INT);
+ CREATE TABLE IF NOT EXISTS t53 (f53 INT);
+ CREATE TABLE IF NOT EXISTS t54 (f54 INT);
+ CREATE TABLE IF NOT EXISTS t55 (f55 INT);
+ CREATE TABLE IF NOT EXISTS t56 (f56 INT);
+ CREATE TABLE IF NOT EXISTS t57 (f57 INT);
+ CREATE TABLE IF NOT EXISTS t58 (f58 INT);
+ CREATE TABLE IF NOT EXISTS t59 (f59 INT);
+ CREATE TABLE IF NOT EXISTS t60 (f60 INT);
+ CREATE OR REPLACE VIEW v60 AS SELECT * FROM t60;
+ EXPLAIN
+ SELECT t0.*
+ FROM t0
+ JOIN t1
+ ON t1.f1 = t0.f0
+ LEFT JOIN t2
+ ON t0.f0 = t2.f2
+ LEFT JOIN t3
+ ON t0.f0 = t3.f3
+ LEFT JOIN t4
+ ON t0.f0 = t4.f4
+ LEFT JOIN t5
+ ON t4.f4 = t5.f5
+ LEFT JOIN t6
+ ON t0.f0 = t6.f6
+ LEFT JOIN t7
+ ON t0.f0 = t7.f7
+ LEFT JOIN t8
+ ON t0.f0 = t8.f8
+ LEFT JOIN t9
+ ON t0.f0 = t9.f9
+ LEFT JOIN t10
+ ON t0.f0 = t10.f10
+ LEFT JOIN t11
+ ON t0.f0 = t11.f11
+ LEFT JOIN t12
+ ON t0.f0 = t12.f12
+ LEFT JOIN t13
+ ON t0.f0 = t13.f13
+ LEFT JOIN t14
+ ON t0.f0 = t14.f14
+ LEFT JOIN t15
+ ON t0.f0 = t15.f15
+ LEFT JOIN t16
+ ON t0.f0 = t16.f16
+ LEFT JOIN t17
+ ON t0.f0 = t17.f17
+ LEFT JOIN t18
+ ON t0.f0 = t18.f18
+ LEFT JOIN t19
+ ON t18.f18 = t19.f19
+ LEFT JOIN t20
+ ON t20.f20 = t19.f19
+ LEFT JOIN t21
+ ON t20.f20 = t21.f21
+ LEFT JOIN t22
+ ON t19.f19 = t22.f22
+ LEFT JOIN t23
+ ON t23.f23 = t0.f0
+ LEFT JOIN t24
+ ON t24.f24 = t23.f23
+ LEFT JOIN t25
+ ON t0.f0 = t25.f25
+ LEFT JOIN t26
+ ON t26.f26 = t0.f0
+ LEFT JOIN t27
+ ON t27.f27 = t0.f0
+ LEFT JOIN t28
+ ON t0.f0 = t28.f28
+ LEFT JOIN t29
+ ON t0.f0 = t29.f29
+ LEFT JOIN t30
+ ON t30.f30 = t0.f0
+ LEFT JOIN t31
+ ON t0.f0 = t31.f31
+ LEFT JOIN t32
+ ON t32.f32 = t31.f31
+ LEFT JOIN t33
+ ON t33.f33 = t0.f0
+ LEFT JOIN t34
+ ON t33.f33 = t34.f34
+ LEFT JOIN t35
+ ON t33.f33 = t35.f35
+ LEFT JOIN t36
+ ON t36.f36 = t0.f0
+ LEFT JOIN t37
+ ON t32.f32 = t37.f37
+ LEFT JOIN t38
+ ON t31.f31 = t38.f38
+ LEFT JOIN t39
+ ON t39.f39 = t0.f0
+ LEFT JOIN t40
+ ON t40.f40 = t39.f39
+ LEFT JOIN t41
+ ON t41.f41 = t0.f0
+ LEFT JOIN t42
+ ON t42.f42 = t41.f41
+ LEFT JOIN t43
+ ON t43.f43 = t41.f41
+ LEFT JOIN t44
+ ON t44.f44 = t0.f0
+ LEFT JOIN t45
+ ON t45.f45 = t0.f0
+ LEFT JOIN t46
+ ON t46.f46 = t0.f0
+ LEFT JOIN t47
+ ON t47.f47 = t0.f0
+ LEFT JOIN t48
+ ON t48.f48 = t0.f0
+ LEFT JOIN t49
+ ON t0.f0 = t49.f49
+ LEFT JOIN t50
+ ON t0.f0 = t50.f50
+ LEFT JOIN t51
+ ON t0.f0 = t51.f51
+ LEFT JOIN t52
+ ON t52.f52 = t0.f0
+ LEFT JOIN t53
+ ON t53.f53 = t0.f0
+ LEFT JOIN t54
+ ON t54.f54 = t0.f0
+ LEFT JOIN t55
+ ON t55.f55 = t0.f0
+ LEFT JOIN t56
+ ON t56.f56 = t0.f0
+ LEFT JOIN t57
+ ON t57.f57 = t0.f0
+ LEFT JOIN t58
+ ON t58.f58 = t57.f57
+ LEFT JOIN t59
+ ON t36.f36 = t59.f59
+ LEFT JOIN v60
+ ON t36.f36 = v60.f60
+ ;
+ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 -1	SIMPLE	NULL	NULL	NULL	NULL	NULL	NULL	NULL	Impossible WHERE noticed after reading const tables
++1	PRIMARY	NULL	NULL	NULL	NULL	NULL	NULL	NULL	Impossible WHERE noticed after reading const tables
+ 2	SUBQUERY	NULL	NULL	NULL	NULL	NULL	NULL	NULL	no matching row in const table
+ drop table t0, t1, t2, t3, t4, t5, t6, t7, t8, t9,
+ t10, t11, t12, t13, t14, t15, t16, t17, t18,
+ t19, t20, t21, t22, t23, t24, t25, t26, t27,
+ t28, t29, t30, t31, t32, t33, t34, t35, t36,
+ t37, t38, t39, t40, t41, t42, t43, t44, t45,
+ t46, t47, t48, t49, t50, t51, t52, t53, t54,
+ t55, t56, t57, t58, t59,t60;
+ drop view v60;
  # -----------------------------------------------------------------
  # -- End of 5.5 tests.
  # -----------------------------------------------------------------
diff --cc mysql-test/suite/galera/disabled.def
index f9909914089,ad966ebab0d..78c3565b99d
--- a/mysql-test/suite/galera/disabled.def
+++ b/mysql-test/suite/galera/disabled.def
@@@ -1,49 -1,4 +1,51 @@@
 -galera_var_dirty_reads :  MDEV-12539
 -query_cache : MDEV-12539
 -MW-421 : MDEV-12539
 -galera_concurrent_ctas : MDEV-12539
 +##############################################################################
 +#
 +#  List the test cases that are to be disabled temporarily.
 +#
 +#  Separate the test case name and the comment with ':'.
 +#
 +#    <testcasename> : MDEV-<xxxx> <comment>
 +#
 +#  Do not use any TAB characters for whitespace.
 +#
 +##############################################################################
++
 +MW-336 : MDEV-13549 Galera test failures
 +galera_gra_log : MDEV-13549 Galera test failures
 +galera_flush_local : MDEV-13549 Galera test failures
 +galera_flush :  MDEV-13549 Galera test failures
 +MW-329 : MDEV-13549 Galera test failures
 +galera_account_management : MariaDB 10.0 does not support ALTER USER
 +galera_binlog_row_image : MariaDB 10.0 does not support binlog_row_image
 +galera_binlog_rows_query_log_events: MariaDB does not support binlog_rows_query_log_events
 +GAL-419 : MDEV-13549 Galera test failures
 +galera_toi_ddl_fk_insert : MDEV-13549 Galera test failures
 +galera_var_notify_cmd : MDEV-13549 Galera test failures
 +galera_var_slave_threads : MDEV-13549 Galera test failures
 +mysql-wsrep#90 : MDEV-13549 Galera test failures
 +galera_as_master_gtid : Requires MySQL GTID
 +galera_as_master_gtid_change_master : Requires MySQL GTID
 +galera_as_slave_replication_bundle : MDEV-13549 Galera test failures
 +galera_as_slave_preordered : wsrep-preordered feature not merged to MariaDB
 +galera_gcs_fragment : MDEV-13549 Galera test failures
 +galera_gcache_recover : MDEV-13549 Galera test failures
 +galera_gcache_recover_full_gcache : MDEV-13549 Galera test failures
 +galera_gcache_recover_manytrx : MDEV-13549 Galera test failures
 +galera_ist_mysqldump : MDEV-13549 Galera test failures
 +mysql-wsrep#31 :  MDEV-13549 Galera test failures
 +galera_migrate : MariaDB 10.0 does not support START SLAVE USER
 +galera_concurrent_ctas : MDEV-13549 Galera test failures
 +galera_bf_abort_for_update : MDEV-13549 Galera test failures
 +galera_wsrep_desync_wsrep_on : MDEV-13549 Galera test failures
 +galera_ssl_upgrade : MDEV-13549 Galera test failures
 +mysql-wsrep#33 : MDEV-13549 Galera test failures
 +galera_var_auto_inc_control_on : MDEV-13549 Galera test failures
 +MW-44 : MDEV-13549 Galera test failures
 +galera_var_retry_autocommit : MDEV-13549 Galera test failures
 +pxc-421 : MDEV-13549 Galera test failures
 +lp1376747-2 : MDEV-13549 Galera test failures
 +lp1376747 : MDEV-13549 Galera test failures
 +galera_toi_ddl_nonconflicting : MDEV-13549 Galera test failures
 +galera_parallel_simple : MDEV-13549 Galera test failures
 +galera_admin : MDEV-13549 Galera test failures
++MW-416 : MDEV-13549 Galera test failures
diff --cc mysql-test/suite/galera/r/galera_defaults.result
index b242a468f72,00000000000..e7a776e9047
mode 100644,000000..100644
--- a/mysql-test/suite/galera/r/galera_defaults.result
+++ b/mysql-test/suite/galera/r/galera_defaults.result
@@@ -1,119 -1,0 +1,119 @@@
 +SELECT COUNT(*) = 40 FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%';
 +COUNT(*) = 40
 +1
 +SELECT VARIABLE_NAME, VARIABLE_VALUE
 +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
 +WHERE VARIABLE_NAME LIKE 'wsrep_%'
 +AND VARIABLE_NAME NOT IN (
 +'WSREP_PROVIDER_OPTIONS',
 +'WSREP_SST_RECEIVE_ADDRESS',
 +'WSREP_NODE_ADDRESS',
 +'WSREP_NODE_NAME',
 +'WSREP_PROVIDER',
 +'WSREP_DATA_HOME_DIR',
 +'WSREP_NODE_INCOMING_ADDRESS',
 +'WSREP_START_POSITION'
 +)
 +ORDER BY VARIABLE_NAME;
 +VARIABLE_NAME	VARIABLE_VALUE
 +WSREP_AUTO_INCREMENT_CONTROL	ON
 +WSREP_CAUSAL_READS	ON
 +WSREP_CERTIFY_NONPK	ON
 +WSREP_CLUSTER_ADDRESS	gcomm://
 +WSREP_CLUSTER_NAME	my_wsrep_cluster
 +WSREP_CONVERT_LOCK_TO_TRX	OFF
 +WSREP_DBUG_OPTION	
 +WSREP_DEBUG	OFF
 +WSREP_DESYNC	OFF
 +WSREP_DIRTY_READS	OFF
 +WSREP_DRUPAL_282555_WORKAROUND	OFF
 +WSREP_FORCED_BINLOG_FORMAT	NONE
 +WSREP_LOAD_DATA_SPLITTING	ON
 +WSREP_LOG_CONFLICTS	OFF
 +WSREP_MAX_WS_ROWS	0
 +WSREP_MAX_WS_SIZE	2147483647
 +WSREP_MYSQL_REPLICATION_BUNDLE	0
 +WSREP_NOTIFY_CMD	
 +WSREP_ON	ON
 +WSREP_OSU_METHOD	TOI
 +WSREP_RECOVER	OFF
 +WSREP_REPLICATE_MYISAM	OFF
 +WSREP_RESTART_SLAVE	OFF
 +WSREP_RETRY_AUTOCOMMIT	1
 +WSREP_SLAVE_FK_CHECKS	ON
 +WSREP_SLAVE_THREADS	1
 +WSREP_SLAVE_UK_CHECKS	OFF
 +WSREP_SST_AUTH	
 +WSREP_SST_DONOR	
 +WSREP_SST_DONOR_REJECTS_QUERIES	OFF
 +WSREP_SST_METHOD	rsync
 +WSREP_SYNC_WAIT	15
- <BASE_DIR>; <BASE_HOST>; <BASE_PORT>; cert.log_conflicts = no; debug = no; evs.auto_evict = 0; evs.causal_keepalive_period = PT1S; evs.debug_log_mask = 0x1; evs.delay_margin = PT1S; evs.delayed_keep_period = PT30S; evs.inactive_check_period = PT0.5S; evs.inactive_timeout = PT15S; evs.info_log_mask = 0; evs.install_timeout = PT7.5S; evs.join_retrans_period = PT1S; evs.keepalive_period = PT1S; evs.max_install_timeouts = 3; evs.send_window = 4; evs.stats_report_period = PT1M; evs.suspect_timeout = PT10S; evs.use_aggregate = true; evs.user_send_window = 2; evs.version = 0; evs.view_forget_timeout = P1D; <GCACHE_DIR>; gcache.keep_pages_size = 0; gcache.mem_size = 0; <GCACHE_NAME>; gcache.page_size = 128M; gcache.recover = no; gcache.size = 10M; gcomm.thread_prio = ; gcs.fc_debug = 0; gcs.fc_factor = 1.0; gcs.fc_limit = 16; gcs.fc_master_slave = no; gcs.max_packet_size = 64500; gcs.max_throttle = 0.25; <GCS_RECV_Q_HARD_LIMIT>; gcs.recv_q_soft_limit = 0.25; gcs.sync_donor = no; <G
 MCAST_LISTEN_ADDR>; gmcast.mcast_addr = ; gmcast.mcast_ttl = 1; gmcast.peer_timeout = PT3S; gmcast.segment = 0; gmcast.time_wait = PT5S; gmcast.version = 0; <IST_RECV_ADDR>; pc.announce_timeout = PT3S; pc.checksum = false; pc.ignore_quorum = false; pc.ignore_sb = false; pc.linger = PT20S; pc.npvo = false; pc.recovery = true; pc.version = 0; pc.wait_prim = true; pc.wait_prim_timeout = PT30S; pc.weight = 1; protonet.backend = asio; protonet.version = 0; repl.causal_read_timeout = PT90S; repl.commit_order = 3; repl.key_format = FLAT8; repl.max_ws_size = 2147483647; repl.proto_max = 7; socket.checksum = 2; socket.recv_buf_size = 212992; 
++<BASE_DIR>; <BASE_HOST>; <BASE_PORT>; cert.log_conflicts = no; debug = no; evs.auto_evict = 0; evs.causal_keepalive_period = PT1S; evs.debug_log_mask = 0x1; evs.delay_margin = PT1S; evs.delayed_keep_period = PT30S; evs.inactive_check_period = PT0.5S; evs.inactive_timeout = PT15S; evs.info_log_mask = 0; evs.install_timeout = PT7.5S; evs.join_retrans_period = PT1S; evs.keepalive_period = PT1S; evs.max_install_timeouts = 3; evs.send_window = 4; evs.stats_report_period = PT1M; evs.suspect_timeout = PT10S; evs.use_aggregate = true; evs.user_send_window = 2; evs.version = 0; evs.view_forget_timeout = P1D; <GCACHE_DIR>; gcache.keep_pages_size = 0; gcache.mem_size = 0; <GCACHE_NAME>; gcache.page_size = 128M; gcache.recover = no; gcache.size = 10M; gcomm.thread_prio = ; gcs.fc_debug = 0; gcs.fc_factor = 1.0; gcs.fc_limit = 16; gcs.fc_master_slave = no; gcs.max_packet_size = 64500; gcs.max_throttle = 0.25; <GCS_RECV_Q_HARD_LIMIT>; gcs.recv_q_soft_limit = 0.25; gcs.sync_donor = no; <G
 MCAST_LISTEN_ADDR>; gmcast.mcast_addr = ; gmcast.mcast_ttl = 1; gmcast.peer_timeout = PT3S; gmcast.segment = 0; gmcast.time_wait = PT5S; gmcast.version = 0; <IST_RECV_ADDR>; pc.announce_timeout = PT3S; pc.checksum = false; pc.ignore_quorum = false; pc.ignore_sb = false; pc.linger = PT20S; pc.npvo = false; pc.recovery = true; pc.version = 0; pc.wait_prim = true; pc.wait_prim_timeout = PT30S; pc.weight = 1; protonet.backend = asio; protonet.version = 0; repl.causal_read_timeout = PT90S; repl.commit_order = 3; repl.key_format = FLAT8; repl.max_ws_size = 2147483647; repl.proto_max = 8; socket.checksum = 2; socket.recv_buf_size = 212992; 
 +SELECT COUNT(*) FROM INFORMATION_SCHEMA.GLOBAL_STATUS
 +WHERE VARIABLE_NAME LIKE 'wsrep_%'
 +AND VARIABLE_NAME != 'wsrep_debug_sync_waiters';
 +COUNT(*)
 +58
 +SELECT VARIABLE_NAME FROM INFORMATION_SCHEMA.GLOBAL_STATUS
 +WHERE VARIABLE_NAME LIKE 'wsrep_%'
 +AND VARIABLE_NAME != 'wsrep_debug_sync_waiters'
 +ORDER BY VARIABLE_NAME;
 +VARIABLE_NAME
 +WSREP_APPLY_OOOE
 +WSREP_APPLY_OOOL
 +WSREP_APPLY_WINDOW
 +WSREP_CAUSAL_READS
 +WSREP_CERT_DEPS_DISTANCE
 +WSREP_CERT_INDEX_SIZE
 +WSREP_CERT_INTERVAL
 +WSREP_CLUSTER_CONF_ID
 +WSREP_CLUSTER_SIZE
 +WSREP_CLUSTER_STATE_UUID
 +WSREP_CLUSTER_STATUS
 +WSREP_COMMIT_OOOE
 +WSREP_COMMIT_OOOL
 +WSREP_COMMIT_WINDOW
 +WSREP_CONNECTED
 +WSREP_DESYNC_COUNT
 +WSREP_EVS_DELAYED
 +WSREP_EVS_EVICT_LIST
 +WSREP_EVS_REPL_LATENCY
 +WSREP_EVS_STATE
 +WSREP_FLOW_CONTROL_PAUSED
 +WSREP_FLOW_CONTROL_PAUSED_NS
 +WSREP_FLOW_CONTROL_RECV
 +WSREP_FLOW_CONTROL_SENT
 +WSREP_GCOMM_UUID
 +WSREP_INCOMING_ADDRESSES
 +WSREP_LAST_COMMITTED
 +WSREP_LOCAL_BF_ABORTS
 +WSREP_LOCAL_CACHED_DOWNTO
 +WSREP_LOCAL_CERT_FAILURES
 +WSREP_LOCAL_COMMITS
 +WSREP_LOCAL_INDEX
 +WSREP_LOCAL_RECV_QUEUE
 +WSREP_LOCAL_RECV_QUEUE_AVG
 +WSREP_LOCAL_RECV_QUEUE_MAX
 +WSREP_LOCAL_RECV_QUEUE_MIN
 +WSREP_LOCAL_REPLAYS
 +WSREP_LOCAL_SEND_QUEUE
 +WSREP_LOCAL_SEND_QUEUE_AVG
 +WSREP_LOCAL_SEND_QUEUE_MAX
 +WSREP_LOCAL_SEND_QUEUE_MIN
 +WSREP_LOCAL_STATE
 +WSREP_LOCAL_STATE_COMMENT
 +WSREP_LOCAL_STATE_UUID
 +WSREP_PROTOCOL_VERSION
 +WSREP_PROVIDER_NAME
 +WSREP_PROVIDER_VENDOR
 +WSREP_PROVIDER_VERSION
 +WSREP_READY
 +WSREP_RECEIVED
 +WSREP_RECEIVED_BYTES
 +WSREP_REPLICATED
 +WSREP_REPLICATED_BYTES
 +WSREP_REPL_DATA_BYTES
 +WSREP_REPL_KEYS
 +WSREP_REPL_KEYS_BYTES
 +WSREP_REPL_OTHER_BYTES
 +WSREP_THREAD_COUNT
diff --cc mysql-test/suite/galera/r/galera_var_dirty_reads.result
index c469e49731d,8a3175912c7..405d86b3027
--- a/mysql-test/suite/galera/r/galera_var_dirty_reads.result
+++ b/mysql-test/suite/galera/r/galera_var_dirty_reads.result
@@@ -42,6 -88,8 +42,4 @@@ SELECT * FROM t1
  i
  1
  DROP TABLE t1;
- set GLOBAL auto_increment_offset = 1;
- set GLOBAL auto_increment_offset = 2;
 -drop user user1;
 -drop user user2;
 -disconnect node_2;
 -disconnect node_1;
  # End of test
diff --cc mysql-test/suite/galera/t/galera_var_dirty_reads.test
index 152c875a946,8fd3b1d22f2..df4c033ab3d
--- a/mysql-test/suite/galera/t/galera_var_dirty_reads.test
+++ b/mysql-test/suite/galera/t/galera_var_dirty_reads.test
@@@ -5,6 -5,14 +5,11 @@@
  --source include/galera_cluster.inc
  --source include/have_innodb.inc
  
 ---disable_query_log
+ # Save original auto_increment_offset values.
 ---connection node_1
 -let $auto_increment_offset_node_1 = `SELECT @@global.auto_increment_offset`;
 ---connection node_2
 -let $auto_increment_offset_node_2 = `SELECT @@global.auto_increment_offset`;
 ---enable_query_log
++--let $node_1=node_1
++--let $node_2=node_2
++--source include/auto_increment_offset_save.inc
+ 
  --connection node_2
  --let $wsrep_cluster_address_saved = `SELECT @@global.wsrep_cluster_address`
  
@@@ -64,10 -114,16 +69,8 @@@ USE test
  SELECT * FROM t1;
  # Cleanup
  DROP TABLE t1;
- set GLOBAL auto_increment_offset = 1;
 -drop user user1;
 -drop user user2;
  
 ---disable_query_log
 -# Restore original auto_increment_offset values.
 ---connection node_1
 ---eval SET @@global.auto_increment_offset = $auto_increment_offset_node_1;
----connection node_2
- set GLOBAL auto_increment_offset = 2;
 ---eval SET @@global.auto_increment_offset = $auto_increment_offset_node_2;
 ---enable_query_log
++--source include/auto_increment_offset_restore.inc
  
  --source include/galera_end.inc
  --echo # End of test
diff --cc mysql-test/suite/parts/r/partition_alter_maria.result
index fd09c0bd4bb,d79bc0a41fe..7d923570cfe
--- a/mysql-test/suite/parts/r/partition_alter_maria.result
+++ b/mysql-test/suite/parts/r/partition_alter_maria.result
@@@ -16,16 -16,16 +16,25 @@@ select * from t1
  pk	dt
  1	2017-09-28 15:12:00
  drop table t1;
+ create table t1 (a int) engine=Aria transactional=1 partition by hash(a) partitions 2;
+ show create table t1;
+ Table	Create Table
+ t1	CREATE TABLE `t1` (
+   `a` int(11) DEFAULT NULL
+ ) ENGINE=Aria DEFAULT CHARSET=latin1 TRANSACTIONAL=1
+ /*!50100 PARTITION BY HASH (a)
+ PARTITIONS 2 */
+ drop table t1;
  #
 +# MDEV-14641 Incompatible key or row definition between the MariaDB .frm file and the information in the storage engine
 +#
 +CREATE TABLE t1 (i INT) ENGINE=Aria PARTITION BY LIST(i) (PARTITION p0 VALUES IN (1),  PARTITION p1 VALUES IN (2));;
 +ALTER TABLE t1 ROW_FORMAT=COMPRESSED;
 +ALTER TABLE t1 DROP PARTITION p1;
 +SELECT * FROM t1;
 +i
 +DROP TABLE t1;
 +#
  # MDEV-13788 Server crash when issuing bad SQL partition syntax
  #
  CREATE TABLE t1 (id int, d date) ENGINE=Aria PARTITION BY RANGE COLUMNS(d) (PARTITION p1 VALUES LESS THAN (MAXVALUE));
diff --cc mysql-test/t/func_misc.test
index dc7202268d6,c21630c0c7b..4afed7d6f6e
--- a/mysql-test/t/func_misc.test
+++ b/mysql-test/t/func_misc.test
@@@ -596,6 -596,22 +596,18 @@@ AND 57813X540X1723 = 'Test'
  
  drop table t1;
  
+ 
+ --echo #
+ --echo # MDEV-15630 uuid() function evaluates at wrong time in query
+ --echo #
+ 
+ CREATE TABLE t1 (id INT);
+ INSERT INTO t1 VALUES (1),(2),(3);
+ --replace_column 2 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+ SELECT COUNT(1), UUID() as uid FROM t1 GROUP BY uid;
+ DROP TABLE t1;
+ 
+ 
 ---echo #
 ---echo # End of 5.5 tests
 ---echo #
 -
  #
  # Bug#12735545 - PARSER STACK OVERFLOW WITH NAME_CONST
  #                CONTAINING OR EXPRESSION
diff --cc scripts/wsrep_sst_xtrabackup-v2.sh
index 26119af2c61,00d8fe21113..f107cea6c74
--- a/scripts/wsrep_sst_xtrabackup-v2.sh
+++ b/scripts/wsrep_sst_xtrabackup-v2.sh
@@@ -1045,9 -863,9 +1045,9 @@@ the
  
  
          wsrep_log_info "Cleaning the existing datadir and innodb-data/log directories"
-         find $ib_home_dir $ib_log_dir $ib_undo_dir $DATA -mindepth 1  -regex $cpat  -prune  -o -exec rm -rfv {} 1>&2 \+
+         find $ib_home_dir $ib_log_dir $ib_undo_dir $DATA -mindepth 1 -prune -regex $cpat -o -exec rm -rfv {} 1>&2 \+
  
 -        tempdir=$(parse_cnf mysqld log-bin "")
 +        tempdir=$(parse_cnf --mysqld log-bin "")
          if [[ -n ${tempdir:-} ]];then
              binlog_dir=$(dirname $tempdir)
              binlog_file=$(basename $tempdir)
diff --cc sql/event_data_objects.cc
index e7bdc42b2e6,0cb123451df..aa85b570a84
--- a/sql/event_data_objects.cc
+++ b/sql/event_data_objects.cc
@@@ -1469,29 -1466,38 +1469,28 @@@ end
  
        saved_master_access= thd->security_ctx->master_access;
        thd->security_ctx->master_access |= SUPER_ACL;
 +      bool save_tx_read_only= thd->tx_read_only;
 +      thd->tx_read_only= false;
  
  #ifdef WITH_WSREP
-       if (WSREP(thd)) {
-         // sql_print_information("sizeof(LEX) = %d", sizeof(struct LEX));
-         // sizeof(LEX) = 4512, so it's relatively safe to allocate it on stack.
-         LEX lex;
-         LEX* saved = thd->lex;
-         lex.sql_command = SQLCOM_DROP_EVENT;
-         thd->lex = &lex;
-         WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
-         thd->lex = saved;
 -      /*
 -         This code is processing event execution and does not have client
 -         connection. Here, event execution will now execute a prepared
 -         DROP EVENT statement, but thd->lex->sql_command is set to
 -         SQLCOM_CREATE_PROCEDURE
 -         DROP EVENT will be logged in binlog, and we have to
 -         replicate it to make all nodes have consistent event definitions
 -         Wsrep DDL replication is triggered inside Events::drop_event(),
 -         and here we need to prepare the THD so that DDL replication is
 -         possible, essentially it requires setting sql_command to
 -         SQLCOMM_DROP_EVENT, we will switch sql_command for the duration
 -         of DDL replication only.
 -      */
 -      const enum_sql_command sql_command_save= thd->lex->sql_command;
+       const bool sql_command_set= WSREP(thd);
 -      if (WSREP(thd))
 -      {
++      const enum_sql_command sql_command_save= thd->lex->sql_command;
++
++      if (sql_command_set) {
+         thd->lex->sql_command = SQLCOM_DROP_EVENT;
        }
  #endif
 -     
 +
        ret= Events::drop_event(thd, dbname, name, FALSE);
  
  #ifdef WITH_WSREP
-       WSREP_TO_ISOLATION_END;
-   error:
+       if (sql_command_set)
+       {
+         WSREP_TO_ISOLATION_END;
+         thd->lex->sql_command = sql_command_save;
+       }
  #endif
 +      thd->tx_read_only= save_tx_read_only;
        thd->security_ctx->master_access= saved_master_access;
      }
    }
diff --cc sql/events.cc
index dd4e4887d50,a6379ec5a46..661d9e19001
--- a/sql/events.cc
+++ b/sql/events.cc
@@@ -405,10 -401,16 +406,14 @@@ Events::create_event(THD *thd, Event_pa
        }
      }
    }
 -  /* Restore the state of binlog format */
 -  DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row());
 -  if (save_binlog_row_based)
 -    thd->set_current_stmt_binlog_format_row();
 +
 +  thd->restore_stmt_binlog_format(save_binlog_format);
  
    DBUG_RETURN(ret);
+ #ifdef WITH_WSREP
+  error:
+   DBUG_RETURN(TRUE);
+ #endif /* WITH_WSREP */
  }
  
  
@@@ -517,9 -521,16 +523,13 @@@ Events::update_event(THD *thd, Event_pa
        ret= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
      }
    }
 -  /* Restore the state of binlog format */
 -  DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row());
 -  if (save_binlog_row_based)
 -    thd->set_current_stmt_binlog_format_row();
  
 +  thd->restore_stmt_binlog_format(save_binlog_format);
    DBUG_RETURN(ret);
+ #ifdef WITH_WSREP
+  error:
+   DBUG_RETURN(TRUE);
+ #endif /* WITH_WSREP */
  }
  
  
@@@ -578,9 -591,15 +589,13 @@@ Events::drop_event(THD *thd, LEX_STRIN
      DBUG_ASSERT(thd->query() && thd->query_length());
      ret= write_bin_log(thd, TRUE, thd->query(), thd->query_length());
    }
 -  /* Restore the state of binlog format */
 -  DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row());
 -  if (save_binlog_row_based)
 -    thd->set_current_stmt_binlog_format_row();
 +
 +  thd->restore_stmt_binlog_format(save_binlog_format);
    DBUG_RETURN(ret);
+ #ifdef WITH_WSREP
+  error:
+   DBUG_RETURN(TRUE);
+ #endif /* WITH_WSREP */
  }
  
  
diff --cc sql/handler.cc
index 657cb01cbc8,7da373e6802..fc8bb53f35d
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@@ -4453,19 -3838,13 +4453,20 @@@ handler::ha_create(const char *name, TA
  */
  
  int
 -handler::ha_create_handler_files(const char *name, const char *old_name,
 -                        int action_flag, HA_CREATE_INFO *info)
 +handler::ha_create_partitioning_metadata(const char *name, const char *old_name,
 +                                         int action_flag)
  {
 -  if (!opt_readonly || !info || !(info->options & HA_LEX_CREATE_TMP_TABLE))
 -    mark_trx_read_write();
 +  /*
 +    Normally this is done when unlocked, but in fast_alter_partition_table,
 +    it is done on an already locked handler when preparing to alter/rename
 +    partitions.
 +  */
 +  DBUG_ASSERT(m_lock_type == F_UNLCK ||
 +              (!old_name && strcmp(name, table_share->path.str)));
+ 
 -  return create_handler_files(name, old_name, action_flag, info);
 +  mark_trx_read_write();
 +
 +  return create_partitioning_metadata(name, old_name, action_flag);
  }
  
  
diff --cc sql/item_cmpfunc.h
index 6d81c7acc40,3c8cc71370d..6cd7e0e3e78
--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@@ -272,11 -268,13 +273,15 @@@ public
    virtual void get_cache_parameters(List<Item> &parameters);
    bool is_top_level_item();
    bool eval_not_null_tables(uchar *opt_arg);
 -  void fix_after_pullout(st_select_lex *new_parent, Item **ref);
 +  void fix_after_pullout(st_select_lex *new_parent, Item **ref, bool merge);
 +  bool invisible_mode();
 +  void reset_cache() { cache= NULL; }
    virtual void print(String *str, enum_query_type query_type);
    void restore_first_argument();
+   Item* get_wrapped_in_subselect_item()
+   {
+     return args[1];
+   }
  };
  
  class Comp_creator
diff --cc sql/item_func.h
index 7dea193c99b,57818228b98..b0ba87b4bd0
--- a/sql/item_func.h
+++ b/sql/item_func.h
@@@ -73,7 -66,7 +73,7 @@@ public
                    NOW_FUNC, TRIG_COND_FUNC,
                    SUSERVAR_FUNC, GUSERVAR_FUNC, COLLATE_FUNC,
                    EXTRACT_FUNC, CHAR_TYPECAST_FUNC, FUNC_SP, UDF_FUNC,
-                   NEG_FUNC, GSYSVAR_FUNC, DYNCOL_FUNC };
 -                  NEG_FUNC, GSYSVAR_FUNC, IN_OPTIMIZER_FUNC };
++                  NEG_FUNC, GSYSVAR_FUNC, IN_OPTIMIZER_FUNC, DYNCOL_FUNC };
    enum optimize_type { OPTIMIZE_NONE,OPTIMIZE_KEY,OPTIMIZE_OP, OPTIMIZE_NULL,
                         OPTIMIZE_EQUAL };
    enum Type type() const { return FUNC_ITEM; }
diff --cc sql/log.cc
index b63d72f0d4a,ca7833a0460..0098dd2ba3d
--- a/sql/log.cc
+++ b/sql/log.cc
@@@ -8589,9 -7042,10 +8589,9 @@@ int TC_LOG_MMAP::open(const char *opt_n
    DBUG_ASSERT(opt_name && opt_name[0]);
  
    tc_log_page_size= my_getpagesize();
 -  DBUG_ASSERT(TC_LOG_PAGE_SIZE % tc_log_page_size == 0);
  
    fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
-   if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR, MYF(0))) < 0)
+   if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR | O_CLOEXEC, MYF(0))) < 0)
    {
      if (my_errno != ENOENT)
        goto err;
diff --cc sql/log_event.cc
index c57331df807,12489d6d7eb..e799f37ddae
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@@ -4314,38 -3832,22 +4347,38 @@@ int Query_log_event::do_apply_event(rpl
        }
        else
          thd->variables.collation_database= thd->db_charset;
-       
+ 
 +      /*
 +        Record any GTID in the same transaction, so slave state is
 +        transactionally consistent.
 +      */
 +      if (current_stmt_is_commit)
        {
 -        const CHARSET_INFO *cs= thd->charset();
 -        /*
 -          We cannot ask for parsing a statement using a character set
 -          without state_maps (parser internal data).
 -        */
 -        if (!cs->state_map)
 +        thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
 +        if (rgi->gtid_pending)
          {
 -          rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR,
 -                      ER_THD(thd, ER_SLAVE_FATAL_ERROR),
 -                      "character_set cannot be parsed");
 -          thd->is_slave_error= true;
 -          goto end;
 -        }
 -      }
 +          sub_id= rgi->gtid_sub_id;
 +          rgi->gtid_pending= false;
 +
 +          gtid= rgi->current_gtid;
 +          if (rpl_global_gtid_slave_state->record_gtid(thd, &gtid, sub_id,
 +                                                       true, false))
 +          {
 +            int errcode= thd->get_stmt_da()->sql_errno();
 +            if (!is_parallel_retry_error(rgi, errcode))
 +              rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
 +                          rgi->gtid_info(),
 +                          "Error during COMMIT: failed to update GTID state in "
 +                        "%s.%s: %d: %s",
 +                          "mysql", rpl_gtid_slave_state_table_name.str,
 +                          errcode,
 +                          thd->get_stmt_da()->message());
 +            sub_id= 0;
 +            thd->is_slave_error= 1;
 +            goto end;
 +          }
 +        }
 +      }
  
        thd->table_map_for_update= (table_map)table_map_for_update;
        thd->set_invoker(&user, &host);
@@@ -7703,7 -6540,12 +7752,12 @@@ User_var_log_event(const char* buf, uin
        Old events will not have this extra byte, thence,
        we keep the flags set to UNDEF_F.
      */
 -    uint bytes_read= ((val + val_len) - start);
 +    uint bytes_read= ((val + val_len) - buf_start);
+     if (bytes_read > event_len)
+     {
+       error= true;
+       goto err;
+     }
      if ((data_written - bytes_read) > 0)
      {
        flags= (uint) *(buf + UV_VAL_IS_NULL + UV_VAL_TYPE_SIZE +
diff --cc sql/mysqld.cc
index 8575709203c,4acfe57c684..f558b78104f
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@@ -1110,19 -962,10 +1110,20 @@@ static PSI_cond_info all_server_conds[]
    { &key_COND_wsrep_sst_init, "COND_wsrep_sst_init", PSI_FLAG_GLOBAL},
    { &key_COND_wsrep_sst_thread, "wsrep_sst_thread", 0},
    { &key_COND_wsrep_rollback, "COND_wsrep_rollback", PSI_FLAG_GLOBAL},
+   { &key_COND_wsrep_thd, "THD::COND_wsrep_thd", 0},
    { &key_COND_wsrep_replaying, "COND_wsrep_replaying", PSI_FLAG_GLOBAL},
  #endif
 -  { &key_COND_flush_thread_cache, "COND_flush_thread_cache", PSI_FLAG_GLOBAL}
 +  { &key_COND_flush_thread_cache, "COND_flush_thread_cache", PSI_FLAG_GLOBAL},
 +  { &key_COND_rpl_thread, "COND_rpl_thread", 0},
 +  { &key_COND_rpl_thread_queue, "COND_rpl_thread_queue", 0},
 +  { &key_COND_rpl_thread_stop, "COND_rpl_thread_stop", 0},
 +  { &key_COND_rpl_thread_pool, "COND_rpl_thread_pool", 0},
 +  { &key_COND_parallel_entry, "COND_parallel_entry", 0},
 +  { &key_COND_group_commit_orderer, "COND_group_commit_orderer", 0},
 +  { &key_COND_prepare_ordered, "COND_prepare_ordered", 0},
 +  { &key_COND_slave_init, "COND_slave_init", 0},
 +  { &key_COND_wait_gtid, "COND_wait_gtid", 0},
 +  { &key_COND_gtid_ignore_duplicates, "COND_gtid_ignore_duplicates", 0}
  };
  
  PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert,
diff --cc sql/mysqld.h
index 4af04a3df75,91fa2eda7fd..3bb9f35077e
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@@ -245,12 -219,12 +245,13 @@@ extern pthread_key(MEM_ROOT**,THR_MALLO
  #ifdef HAVE_PSI_INTERFACE
  #ifdef HAVE_MMAP
  extern PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active,
 -       key_LOCK_pool;
 +       key_LOCK_pool, key_LOCK_pending_checkpoint;
  #endif /* HAVE_MMAP */
 +
  #ifdef WITH_WSREP
  extern PSI_mutex_key key_LOCK_wsrep_thd;
- #endif /* WITH_WSREP */
+ extern PSI_cond_key  key_COND_wsrep_thd;
+ #endif /* HAVE_WSREP */
  
  #ifdef HAVE_OPENSSL
  extern PSI_mutex_key key_LOCK_des_key_file;
diff --cc sql/slave.cc
index a633722db16,f370e3dd27f..3dee39ad65f
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@@ -4936,39 -3778,37 +4936,39 @@@ err_during_init
      to avoid unneeded position re-init
    */
    thd->temporary_tables = 0; // remove tempation from destructor to close them
 -  DBUG_ASSERT(thd->net.buff != 0);
 -  net_end(&thd->net); // destructor will not free it, because we are weird
 -  DBUG_ASSERT(rli->sql_thd == thd);
    THD_CHECK_SENTRY(thd);
 -  rli->sql_thd= 0;
 -  set_thd_in_use_temporary_tables(rli);  // (re)set sql_thd in use for saved temp tables
 +  rli->sql_driver_thd= 0;
    mysql_mutex_lock(&LOCK_thread_count);
 -  THD_CHECK_SENTRY(thd);
 -  delete thd;
 +  thd->rgi_fake= thd->rgi_slave= NULL;
 +  delete serial_rgi;
    mysql_mutex_unlock(&LOCK_thread_count);
 +
  #ifdef WITH_WSREP
 -  /* if slave stopped due to node going non primary, we set global flag to
 -     trigger automatic restart of slave when node joins back to cluster
 +  /*
 +    If slave stopped due to node going non primary, we set global flag to
 +    trigger automatic restart of slave when node joins back to cluster.
    */
 -   if (wsrep_node_dropped && wsrep_restart_slave)
 -   {
 -     if (wsrep_ready_get())
 -     {
 -       WSREP_INFO("Slave error due to node temporarily non-primary"
 -		  "SQL slave will continue");
 -       wsrep_node_dropped= FALSE;
 -       mysql_mutex_unlock(&rli->run_lock);
 -       goto wsrep_restart_point;
 -     } else {
 -       WSREP_INFO("Slave error due to node going non-primary");
 -       WSREP_INFO("wsrep_restart_slave was set and therefore slave will be "
 -		  "automatically restarted when node joins back to cluster");
 -       wsrep_restart_slave_activated= TRUE;
 -     }
 -   }
 +  if (wsrep_node_dropped && wsrep_restart_slave)
 +  {
-     if (wsrep_ready)
++    if (wsrep_ready_get())
 +    {
 +      WSREP_INFO("Slave error due to node temporarily non-primary"
 +                 "SQL slave will continue");
 +      wsrep_node_dropped= FALSE;
 +      mysql_mutex_unlock(&rli->run_lock);
 +      WSREP_DEBUG("wsrep_conflict_state now: %d", thd->wsrep_conflict_state);
 +      WSREP_INFO("slave restart: %d", thd->wsrep_conflict_state);
 +      thd->wsrep_conflict_state= NO_CONFLICT;
 +      goto wsrep_restart_point;
 +    } else {
 +      WSREP_INFO("Slave error due to node going non-primary");
 +      WSREP_INFO("wsrep_restart_slave was set and therefore slave will be "
 +                 "automatically restarted when node joins back to cluster.");
 +      wsrep_restart_slave_activated= TRUE;
 +    }
 +  }
  #endif /* WITH_WSREP */
 +
   /*
    Note: the order of the broadcast and unlock calls below (first broadcast, then unlock)
    is important. Otherwise a killer_thread can execute between the calls and
diff --cc sql/sql_class.cc
index 2502962cef0,ce875ba87ef..b3d964d4006
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@@ -4544,251 -4305,27 +4549,251 @@@ extern "C" int thd_slave_thread(const M
    return(thd->slave_thread);
  }
  
 -extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
 +/* Returns true for a worker thread in parallel replication. */
 +extern "C" int thd_rpl_is_parallel(const MYSQL_THD thd)
  {
 -  return(thd->transaction.all.modified_non_trans_table);
 +  return thd->rgi_slave && thd->rgi_slave->is_parallel_exec;
  }
  
 -extern "C" int thd_binlog_format(const MYSQL_THD thd)
 -{
 -#ifdef WITH_WSREP
 -  if (WSREP(thd))
 -  {
 -    /* for wsrep binlog format is meaningful also when binlogging is off */
 -     return (int) WSREP_BINLOG_FORMAT(thd->variables.binlog_format);
 -  }
 -#endif /* WITH_WSREP */
 -  if (mysql_bin_log.is_open() && (thd->variables.option_bits & OPTION_BIN_LOG))
 -    return (int) thd->variables.binlog_format;
 -  else
 -   return BINLOG_FORMAT_UNSPEC;
 -}
 +/*
 +  This function can optionally be called to check if thd_report_wait_for()
 +  needs to be called for waits done by a given transaction.
  
 -extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all)
 +  If this function returns false for a given thd, there is no need to do any
 +  calls to thd_report_wait_for() on that thd.
 +
 +  This call is optional; it is safe to call thd_report_wait_for() in any case.
 +  This call can be used to save some redundant calls to thd_report_wait_for()
 +  if desired. (This is unlikely to matter much unless there are _lots_ of
 +  waits to report, as the overhead of thd_report_wait_for() is small).
 +*/
 +extern "C" int
 +thd_need_wait_for(const MYSQL_THD thd)
 +{
 +  rpl_group_info *rgi;
 +
 +  if (mysql_bin_log.is_open() && opt_binlog_commit_wait_count > 0)
 +    return true;
 +  if (!thd)
 +    return false;
 +  rgi= thd->rgi_slave;
 +  if (!rgi)
 +    return false;
 +  return rgi->is_parallel_exec;
 +}
 +
 +/*
 +  Used by InnoDB/XtraDB to report that one transaction THD is about to go to
 +  wait for a transactional lock held by another transactions OTHER_THD.
 +
 +  This is used for parallel replication, where transactions are required to
 +  commit in the same order on the slave as they did on the master. If the
 +  transactions on the slave encounters lock conflicts on the slave that did
 +  not exist on the master, this can cause deadlocks.
 +
 +  Normally, such conflicts will not occur, because the same conflict would
 +  have prevented the two transactions from committing in parallel on the
 +  master, thus preventing them from running in parallel on the slave in the
 +  first place. However, it is possible in case when the optimizer chooses a
 +  different plan on the slave than on the master (eg. table scan instead of
 +  index scan).
 +
 +  InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a
 +  deadlock with the pre-determined commit order, we kill the later transaction,
 +  and later re-try it, to resolve the deadlock.
 +
 +  This call need only receive reports about waits for locks that will remain
 +  until the holding transaction commits. InnoDB/XtraDB auto-increment locks
 +  are released earlier, and so need not be reported. (Such false positives are
 +  not harmful, but could lead to unnecessary kill and retry, so best avoided).
 +*/
 +extern "C" void
 +thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
 +{
 +  rpl_group_info *rgi;
 +  rpl_group_info *other_rgi;
 +
 +  if (!thd || !other_thd)
 +    return;
 +  binlog_report_wait_for(thd, other_thd);
 +  rgi= thd->rgi_slave;
 +  other_rgi= other_thd->rgi_slave;
 +  if (!rgi || !other_rgi)
 +    return;
 +  if (!rgi->is_parallel_exec)
 +    return;
 +  if (rgi->rli != other_rgi->rli)
 +    return;
 +  if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
 +    return;
 +  if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
 +    return;
 +  if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
 +    return;
 +  /*
 +    This transaction is about to wait for another transaction that is required
 +    by replication binlog order to commit after. This would cause a deadlock.
 +
 +    So send a kill to the other transaction, with a temporary error; this will
 +    cause replication to rollback (and later re-try) the other transaction,
 +    releasing the lock for this transaction so replication can proceed.
 +  */
 +  other_rgi->killed_for_retry= true;
 +  mysql_mutex_lock(&other_thd->LOCK_thd_data);
 +  other_thd->awake(KILL_CONNECTION);
 +  mysql_mutex_unlock(&other_thd->LOCK_thd_data);
 +}
 +
 +/*
 +  This function is called from InnoDB/XtraDB to check if the commit order of
 +  two transactions has already been decided by the upper layer. This happens
 +  in parallel replication, where the commit order is forced to be the same on
 +  the slave as it was originally on the master.
 +
 +  If this function returns false, it means that such commit order will be
 +  enforced. This allows the storage engine to optionally omit gap lock waits
 +  or similar measures that would otherwise be needed to ensure that
 +  transactions would be serialised in a way that would cause a commit order
 +  that is correct for binlogging for statement-based replication.
 +
 +  Since transactions are only run in parallel on the slave if they ran without
 +  lock conflicts on the master, normally no lock conflicts on the slave happen
 +  during parallel replication. However, there are a couple of corner cases
 +  where it can happen, like these secondary-index operations:
 +
 +    T1: INSERT INTO t1 VALUES (7, NULL);
 +    T2: DELETE FROM t1 WHERE b <= 3;
 +
 +    T1: UPDATE t1 SET secondary=NULL WHERE primary=1
 +    T2: DELETE t1 WHERE secondary <= 3
 +
 +  The DELETE takes a gap lock that can block the INSERT/UPDATE, but the row
 +  locks set by INSERT/UPDATE do not block the DELETE. Thus, the execution
 +  order of the transactions determine whether a lock conflict occurs or
 +  not. Thus a lock conflict can occur on the slave where it did not on the
 +  master.
 +
 +  If this function returns true, normal locking should be done as required by
 +  the binlogging and transaction isolation level in effect. But if it returns
 +  false, the correct order will be enforced anyway, and InnoDB/XtraDB can
 +  avoid taking the gap lock, preventing the lock conflict.
 +
 +  Calling this function is just an optimisation to avoid unnecessary
 +  deadlocks. If it was not used, a gap lock would be set that could eventually
 +  cause a deadlock; the deadlock would be caught by thd_report_wait_for() and
 +  the transaction T2 killed and rolled back (and later re-tried).
 +*/
 +extern "C" int
 +thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
 +{
 +  rpl_group_info *rgi, *other_rgi;
 +
 +  DBUG_EXECUTE_IF("disable_thd_need_ordering_with", return 1;);
 +  if (!thd || !other_thd)
 +    return 1;
 +  rgi= thd->rgi_slave;
 +  other_rgi= other_thd->rgi_slave;
 +  if (!rgi || !other_rgi)
 +    return 1;
 +  if (!rgi->is_parallel_exec)
 +    return 1;
 +  if (rgi->rli != other_rgi->rli)
 +    return 1;
 +  if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
 +    return 1;
 +  if (!rgi->commit_id || rgi->commit_id != other_rgi->commit_id)
 +    return 1;
 +  DBUG_EXECUTE_IF("thd_need_ordering_with_force", return 1;);
 +  /*
 +    Otherwise, these two threads are doing parallel replication within the same
 +    replication domain. Their commit order is already fixed, so we do not need
 +    gap locks or similar to otherwise enforce ordering (and in fact such locks
 +    could lead to unnecessary deadlocks and transaction retry).
 +  */
 +  return 0;
 +}
 +
 +
 +/*
 +  If the storage engine detects a deadlock, and needs to choose a victim
 +  transaction to roll back, it can call this function to ask the upper
 +  server layer for which of two possible transactions is prefered to be
 +  aborted and rolled back.
 +
 +  In parallel replication, if two transactions are running in parallel and
 +  one is fixed to commit before the other, then the one that commits later
 +  will be prefered as the victim - chosing the early transaction as a victim
 +  will not resolve the deadlock anyway, as the later transaction still needs
 +  to wait for the earlier to commit.
 +
 +  Otherwise, a transaction that uses only transactional tables, and can thus
 +  be safely rolled back, will be prefered as a deadlock victim over a
 +  transaction that also modified non-transactional (eg. MyISAM) tables.
 +
 +  The return value is -1 if the first transaction is prefered as a deadlock
 +  victim, 1 if the second transaction is prefered, or 0 for no preference (in
 +  which case the storage engine can make the choice as it prefers).
 +*/
 +extern "C" int
 +thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2)
 +{
 +  rpl_group_info *rgi1, *rgi2;
 +  bool nontrans1, nontrans2;
 +
 +  if (!thd1 || !thd2)
 +    return 0;
 +
 +  /*
 +    If the transactions are participating in the same replication domain in
 +    parallel replication, then request to select the one that will commit
 +    later (in the fixed commit order from the master) as the deadlock victim.
 +  */
 +  rgi1= thd1->rgi_slave;
 +  rgi2= thd2->rgi_slave;
 +  if (rgi1 && rgi2 &&
 +      rgi1->is_parallel_exec &&
 +      rgi1->rli == rgi2->rli &&
 +      rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id)
 +    return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1;
 +
 +  /*
 +    If one transaction has modified non-transactional tables (so that it
 +    cannot be safely rolled back), and the other has not, then prefer to
 +    select the purely transactional one as the victim.
 +  */
 +  nontrans1= thd1->transaction.all.modified_non_trans_table;
 +  nontrans2= thd2->transaction.all.modified_non_trans_table;
 +  if (nontrans1 && !nontrans2)
 +    return 1;
 +  else if (!nontrans1 && nontrans2)
 +    return -1;
 +
 +  /* No preferences, let the storage engine decide. */
 +  return 0;
 +}
 +
 +
 +extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
 +{
 +  return(thd->transaction.all.modified_non_trans_table);
 +}
 +
 +extern "C" int thd_binlog_format(const MYSQL_THD thd)
 +{
 +#ifdef WITH_WSREP
 +  if (WSREP(thd))
 +  {
 +    /* for wsrep binlog format is meaningful also when binlogging is off */
 +    return (int) WSREP_FORMAT(thd->variables.binlog_format);
 +  }
 +#endif /* WITH_WSREP */
 +  if (mysql_bin_log.is_open() && (thd->variables.option_bits & OPTION_BIN_LOG))
 +    return (int) thd->variables.binlog_format;
 +  else
-     return BINLOG_FORMAT_UNSPEC;
++   return BINLOG_FORMAT_UNSPEC;
 +}
 +
 +extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all)
  {
    DBUG_ASSERT(thd);
    thd->mark_transaction_to_rollback(all);
diff --cc sql/sql_class.h
index 0721252193d,cd1ac4fefd7..394575191e4
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@@ -2854,6 -2398,43 +2854,7 @@@ public
      query_id_t first_query_id;
    } binlog_evt_union;
  
 -#ifdef WITH_WSREP
 -  const bool                wsrep_applier; /* dedicated slave applier thread */
 -  bool                      wsrep_applier_closing; /* applier marked to close */
 -  bool                      wsrep_client_thread; /* to identify client threads*/
 -  enum wsrep_exec_mode      wsrep_exec_mode;
 -  query_id_t                wsrep_last_query_id;
 -  enum wsrep_query_state    wsrep_query_state;
 -  enum wsrep_conflict_state wsrep_conflict_state;
 -  mysql_mutex_t             LOCK_wsrep_thd;
+   mysql_cond_t              COND_wsrep_thd;
 -  // changed from wsrep_seqno_t to wsrep_trx_meta_t in wsrep API rev 75
 -  // wsrep_seqno_t             wsrep_trx_seqno;
 -  wsrep_trx_meta_t          wsrep_trx_meta;
 -  uint32                    wsrep_rand;
 -  Relay_log_info*           wsrep_rli;
 -  bool                      wsrep_converted_lock_session;
 -  wsrep_ws_handle_t         wsrep_ws_handle;
 -#ifdef WSREP_PROC_INFO
 -  char                      wsrep_info[128]; /* string for dynamic proc info */
 -#endif /* WSREP_PROC_INFO */
 -  ulong                     wsrep_retry_counter; // of autocommit
 -  bool                      wsrep_PA_safe;
 -  char*                     wsrep_retry_query;
 -  size_t                    wsrep_retry_query_len;
 -  enum enum_server_command  wsrep_retry_command;
 -  enum wsrep_consistency_check_mode 
 -                            wsrep_consistency_check;
 -  wsrep_stats_var*          wsrep_status_vars;
 -  int                       wsrep_mysql_replicated;
 -  THD*                      wsrep_bf_thd;
 -  const char*               wsrep_TOI_pre_query; /* a query to apply before 
 -						    the actual TOI query */
 -  size_t                    wsrep_TOI_pre_query_len;
 -  void*                     wsrep_apply_format;
 -  bool                      wsrep_apply_toi; /* applier processing in TOI */
 -  wsrep_gtid_t              wsrep_sync_wait_gtid;
 -#endif /* WITH_WSREP */
    /**
      Internal parser state.
      Note that since the parser is not re-entrant, we keep only one parser
diff --cc sql/sql_insert.cc
index af0321ce68f,64c9497fb7d..fa754d2da38
--- a/sql/sql_insert.cc
+++ b/sql/sql_insert.cc
@@@ -4324,18 -4304,47 +4324,47 @@@ bool select_create::send_eof(
      abort_result_set();
      DBUG_RETURN(true);
    }
 -  else
 +
 +  /*
 +    Do an implicit commit at end of statement for non-temporary
 +    tables.  This can fail, but we should unlock the table
 +    nevertheless.
 +  */
 +  if (!table->s->tmp_table)
    {
 -    /*
 -      Do an implicit commit at end of statement for non-temporary
 -      tables.  This can fail, but we should unlock the table
 -      nevertheless.
 -    */
 -    if (!table->s->tmp_table)
 -    {
+ #ifdef WITH_WSREP
+       /*
+          append table level exclusive key for CTAS
+       */
+       wsrep_key_arr_t key_arr= {0, 0};
+       wsrep_prepare_keys_for_isolation(thd,
+                                        create_table->db,
+                                        create_table->table_name,
+                                        table_list,
+                                        &key_arr);
+       int rcode = wsrep->append_key(
+                                 wsrep,
+                                 &thd->wsrep_ws_handle,
+                                 key_arr.keys, //&wkey,
+                                 key_arr.keys_len,
+                                 WSREP_KEY_EXCLUSIVE,
+                                 false);
+       wsrep_keys_free(&key_arr);
+       if (rcode) {
+         DBUG_PRINT("wsrep", ("row key failed: %d", rcode));
+         WSREP_ERROR("Appending table key for CTAS failed: %s, %d",
+                     (wsrep_thd_query(thd)) ?
+                     wsrep_thd_query(thd) : "void", rcode);
+         return true;
+       }
+       /* If commit fails, we should be able to reset the OK status. */
 -      thd->stmt_da->can_overwrite_status= TRUE;
++      thd->get_stmt_da()->set_overwrite_status(TRUE);
+ #endif /* WITH_WSREP */
 -      trans_commit_stmt(thd);
 +    trans_commit_stmt(thd);
 +    if (!(thd->variables.option_bits & OPTION_GTID_BEGIN))
        trans_commit_implicit(thd);
  #ifdef WITH_WSREP
 -      thd->stmt_da->can_overwrite_status= FALSE;
++    thd->get_stmt_da()->set_overwrite_status(FALSE);
        mysql_mutex_lock(&thd->LOCK_wsrep_thd);
        if (thd->wsrep_conflict_state != NO_CONFLICT)
        {
diff --cc sql/sql_parse.cc
index 6fe25961e65,553a6e7539d..f60134b6162
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@@ -1053,7 -851,7 +1053,7 @@@ bool do_command(THD *thd
       * bail out if DB snapshot has not been installed. We however,
       * allow queries "SET" and "SHOW", they are trapped later in execute_command
       */
-     if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready &&
 -    if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready_get() &&
++     if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready_get() &&
          command != COM_QUERY        &&
          command != COM_PING         &&
          command != COM_QUIT         &&
@@@ -2750,37 -2474,12 +2750,38 @@@ mysql_execute_command(THD *thd
    {
      WSREP_SYNC_WAIT(thd, WSREP_SYNC_WAIT_BEFORE_SHOW);
      execute_show_status(thd, all_tables);
+ 
 -#ifdef WITH_WSREP
 -    if (lex->sql_command == SQLCOM_SHOW_STATUS) wsrep_free_status(thd);
 -#endif /* WITH_WSREP */
      break;
    }
 +  case SQLCOM_SHOW_EXPLAIN:
 +  {
 +    if (!thd->security_ctx->priv_user[0] &&
 +        check_global_access(thd,PROCESS_ACL))
 +      break;
 +
 +    /*
 +      The select should use only one table, it's the SHOW EXPLAIN pseudo-table
 +    */
 +    if (lex->sroutines.records || lex->query_tables->next_global)
 +    {
 +      my_message(ER_SET_CONSTANTS_ONLY, ER(ER_SET_CONSTANTS_ONLY),
 +		 MYF(0));
 +      goto error;
 +    }
 +
 +    Item **it= lex->value_list.head_ref();
 +    if (!(*it)->basic_const_item() ||
 +        (!(*it)->fixed && (*it)->fix_fields(lex->thd, it)) || 
 +        (*it)->check_cols(1))
 +    {
 +      my_message(ER_SET_CONSTANTS_ONLY, ER(ER_SET_CONSTANTS_ONLY),
 +		 MYF(0));
 +      goto error;
 +    }
 +  }
 +    /* fall through */
 +  case SQLCOM_SHOW_STATUS_PROC:
 +  case SQLCOM_SHOW_STATUS_FUNC:
    case SQLCOM_SHOW_DATABASES:
    case SQLCOM_SHOW_TABLES:
    case SQLCOM_SHOW_TRIGGERS:
@@@ -3782,8 -3325,7 +3783,8 @@@ end_with_restore_list
    case SQLCOM_INSERT_SELECT:
    {
      WSREP_SYNC_WAIT(thd, WSREP_SYNC_WAIT_BEFORE_INSERT_REPLACE);
-     select_result *sel_result;
+     select_insert *sel_result;
 +    bool explain= MY_TEST(lex->describe);
      DBUG_ASSERT(first_table == all_tables && first_table != 0);
      if ((res= insert_precheck(thd, all_tables)))
        break;
@@@ -4386,10 -3860,9 +4386,9 @@@
                                     lex->spname->m_name);
      break;
    case SQLCOM_DROP_EVENT:
-     WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL)
      if (!(res= Events::drop_event(thd,
                                    lex->spname->m_db, lex->spname->m_name,
 -                                  lex->drop_if_exists)))
 +                                  lex->check_exists)))
        my_ok(thd);
      break;
  #else
diff --cc sql/sql_plugin.cc
index 013e00faeb9,81b59a5be90..b1ffa90dd2f
--- a/sql/sql_plugin.cc
+++ b/sql/sql_plugin.cc
@@@ -2082,11 -2084,20 +2082,14 @@@ bool mysql_install_plugin(THD *thd, con
    bool error;
    int argc=orig_argc;
    char **argv=orig_argv;
+   unsigned long event_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE] =
+   { MYSQL_AUDIT_GENERAL_CLASSMASK };
    DBUG_ENTER("mysql_install_plugin");
  
 -  if (opt_noacl)
 -  {
 -    my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--skip-grant-tables");
 -    DBUG_RETURN(TRUE);
 -  }
 -
    tables.init_one_table("mysql", 5, "plugin", 6, "plugin", TL_WRITE);
 -  if (check_table_access(thd, INSERT_ACL, &tables, FALSE, 1, FALSE))
 +  if (!opt_noacl && check_table_access(thd, INSERT_ACL, &tables, FALSE, 1, FALSE))
      DBUG_RETURN(TRUE);
+   WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
  
    /* need to open before acquiring LOCK_plugin or it will deadlock */
    if (! (table = open_ltable(thd, &tables, TL_WRITE,
@@@ -2218,12 -2232,21 +2224,15 @@@ bool mysql_uninstall_plugin(THD *thd, c
    TABLE_LIST tables;
    LEX_STRING dl= *dl_arg;
    bool error= false;
+   unsigned long event_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE] =
+   { MYSQL_AUDIT_GENERAL_CLASSMASK };
    DBUG_ENTER("mysql_uninstall_plugin");
  
 -  if (opt_noacl)
 -  {
 -    my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--skip-grant-tables");
 -    DBUG_RETURN(TRUE);
 -  }
 -
    tables.init_one_table("mysql", 5, "plugin", 6, "plugin", TL_WRITE);
  
 -  if (check_table_access(thd, DELETE_ACL, &tables, FALSE, 1, FALSE))
 +  if (!opt_noacl && check_table_access(thd, DELETE_ACL, &tables, FALSE, 1, FALSE))
      DBUG_RETURN(TRUE);
+   WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
  
    /* need to open before acquiring LOCK_plugin or it will deadlock */
    if (! (table= open_ltable(thd, &tables, TL_WRITE, MYSQL_LOCK_IGNORE_TIMEOUT)))
diff --cc sql/sql_view.cc
index 9fe4dd4849d,bbc5f002461..8fdd86535d1
--- a/sql/sql_view.cc
+++ b/sql/sql_view.cc
@@@ -429,18 -432,9 +429,19 @@@ bool mysql_create_view(THD *thd, TABLE_
  
    lex->link_first_table_back(view, link_to_local);
    view->open_type= OT_BASE_ONLY;
+   WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL);
  
 -  if (open_and_lock_tables(thd, lex->query_tables, TRUE, 0))
 +  /*
 +    ignore lock specs for CREATE statement
 +  */
 +  if (lex->current_select->lock_type != TL_READ_DEFAULT)
 +  {
 +    lex->current_select->set_lock_for_tables(TL_READ_DEFAULT);
 +    view->mdl_request.set_type(MDL_EXCLUSIVE);
 +  }
 +
 +  if (open_temporary_tables(thd, lex->query_tables) ||
 +      open_and_lock_tables(thd, lex->query_tables, TRUE, 0))
    {
      view= lex->unlink_first_table(&link_to_local);
      res= TRUE;
@@@ -686,8 -722,12 +687,12 @@@
    lex->link_first_table_back(view, link_to_local);
    DBUG_RETURN(0);
  
+ #ifdef WITH_WSREP
+  error:
+   res= TRUE;
+ #endif /* WITH_WSREP */
  err:
 -  thd_proc_info(thd, "end");
 +  THD_STAGE_INFO(thd, stage_end);
    lex->link_first_table_back(view, link_to_local);
    unit->cleanup();
    DBUG_RETURN(res || thd->is_error());
diff --cc sql/wsrep_hton.cc
index a9dbc1a17c2,1676daab5fe..0a2264ac03c
--- a/sql/wsrep_hton.cc
+++ b/sql/wsrep_hton.cc
@@@ -131,18 -115,30 +131,30 @@@ void wsrep_post_commit(THD* thd, bool a
        wsrep_cleanup_transaction(thd);
        break;
      }
-  case LOCAL_STATE:
-    {
-      /*
-        Non-InnoDB statements may have populated events in stmt cache => cleanup
-      */
-      WSREP_DEBUG("cleanup transaction for LOCAL_STATE: %s", thd->query());
-      wsrep_cleanup_transaction(thd);
-      break;
-    }
-   default: break;
+     case LOCAL_STATE:
+     {
+       /* non-InnoDB statements may have populated events in stmt cache
+         => cleanup
+       */
+       WSREP_DEBUG("cleanup transaction for LOCAL_STATE");
+       /*
+         Run post-rollback hook to clean up in case
+         some keys were populated for the transaction in provider
+         but during commit time there was no write set to replicate.
+         This may happen when client sets the SAVEPOINT and immediately
+         rolls back to savepoint after first operation.
+       */
+       if (all && thd->wsrep_conflict_state != MUST_REPLAY &&
+           wsrep && wsrep->post_rollback(wsrep, &thd->wsrep_ws_handle))
+       {
+         WSREP_WARN("post_rollback fail: %llu %d",
 -                   (long long)thd->thread_id, thd->stmt_da->status());
++		(long long)thd->thread_id, thd->get_stmt_da()->status());
+       }
+       wsrep_cleanup_transaction(thd);
+       break;
+     }
+     default: break;
    }
- 
  }
  
  /*
diff --cc sql/wsrep_mysqld.cc
index 49988287933,54fdf430f86..bd397a9a012
--- a/sql/wsrep_mysqld.cc
+++ b/sql/wsrep_mysqld.cc
@@@ -931,76 -1019,84 +932,76 @@@ static bool wsrep_prepare_key_for_isola
  }
  
  /* Prepare key list from db/table and table_list */
- static bool wsrep_prepare_keys_for_isolation(THD*              thd,
-                                              const char*       db,
-                                              const char*       table,
-                                              const TABLE_LIST* table_list,
-                                              wsrep_key_arr_t*  ka)
+ bool wsrep_prepare_keys_for_isolation(THD*              thd,
+                                       const char*       db,
+                                       const char*       table,
+                                       const TABLE_LIST* table_list,
+                                       wsrep_key_arr_t*  ka)
  {
 -    ka->keys= 0;
 -    ka->keys_len= 0;
 +  ka->keys= 0;
 +  ka->keys_len= 0;
  
 -    extern TABLE* find_temporary_table(THD*, const TABLE_LIST*);
 -
 -    if (db || table)
 +  if (db || table)
 +  {
 +    if (!(ka->keys= (wsrep_key_t*)my_malloc(sizeof(wsrep_key_t), MYF(0))))
      {
 -        TABLE_LIST tmp_table;
 -        bzero((char*) &tmp_table,sizeof(tmp_table));
 -        tmp_table.table_name= (char*)db;
 -        tmp_table.db= (char*)table;
 -        if (!table || !find_temporary_table(thd, &tmp_table))
 -        {
 -            if (!(ka->keys= (wsrep_key_t*)my_malloc(sizeof(wsrep_key_t), MYF(0))))
 -            {
 -                WSREP_ERROR("Can't allocate memory for key_array");
 -                goto err;
 -            }
 -            ka->keys_len= 1;
 -            if (!(ka->keys[0].key_parts= (wsrep_buf_t*)
 -                  my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
 -            {
 -                WSREP_ERROR("Can't allocate memory for key_parts");
 -                goto err;
 -            }
 -            ka->keys[0].key_parts_num= 2;
 -            if (!wsrep_prepare_key_for_isolation(
 -                    db, table,
 -                    (wsrep_buf_t*)ka->keys[0].key_parts,
 -                    &ka->keys[0].key_parts_num))
 -            {
 -                WSREP_ERROR("Preparing keys for isolation failed");
 -                goto err;
 -            }
 -        }
 +      WSREP_ERROR("Can't allocate memory for key_array");
 +      goto err;
 +    }
 +    ka->keys_len= 1;
 +    if (!(ka->keys[0].key_parts= (wsrep_buf_t*)
 +          my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
 +    {
 +      WSREP_ERROR("Can't allocate memory for key_parts");
 +      goto err;
 +     }
 +    ka->keys[0].key_parts_num= 2;
 +    if (!wsrep_prepare_key_for_isolation(
 +                                         db, table,
 +                                         (wsrep_buf_t*)ka->keys[0].key_parts,
 +                                         &ka->keys[0].key_parts_num))
 +    {
 +      WSREP_ERROR("Preparing keys for isolation failed (1)");
 +      goto err;
      }
 +  }
 +
 +  for (const TABLE_LIST* table= table_list; table; table= table->next_global)
 +  {
 +    wsrep_key_t* tmp;
 +    if (ka->keys)
 +      tmp= (wsrep_key_t*)my_realloc(ka->keys,
 +                                  (ka->keys_len + 1) * sizeof(wsrep_key_t),
 +                                  MYF(0));
 +    else
 +      tmp= (wsrep_key_t*)my_malloc((ka->keys_len + 1) * sizeof(wsrep_key_t), MYF(0));
  
 -    for (const TABLE_LIST* table= table_list; table; table= table->next_global)
 +    if (!tmp)
      {
 -        if (!find_temporary_table(thd, table))
 -        {
 -            wsrep_key_t* tmp;
 -            tmp= (wsrep_key_t*)my_realloc(
 -                ka->keys, (ka->keys_len + 1) * sizeof(wsrep_key_t), MYF(0));
 -            if (!tmp)
 -            {
 -                WSREP_ERROR("Can't allocate memory for key_array");
 -                goto err;
 -            }
 -            ka->keys= tmp;
 -            if (!(ka->keys[ka->keys_len].key_parts= (wsrep_buf_t*)
 -                  my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
 -            {
 -                WSREP_ERROR("Can't allocate memory for key_parts");
 -                goto err;
 -            }
 -            ka->keys[ka->keys_len].key_parts_num= 2;
 -            ++ka->keys_len;
 -            if (!wsrep_prepare_key_for_isolation(
 -                    table->db, table->table_name,
 -                    (wsrep_buf_t*)ka->keys[ka->keys_len - 1].key_parts,
 -                    &ka->keys[ka->keys_len - 1].key_parts_num))
 -            {
 -                WSREP_ERROR("Preparing keys for isolation failed");
 -                goto err;
 -            }
 -        }
 +      WSREP_ERROR("Can't allocate memory for key_array");
 +      goto err;
      }
 -    return true;
 +    ka->keys= tmp;
 +    if (!(ka->keys[ka->keys_len].key_parts= (wsrep_buf_t*)
 +          my_malloc(sizeof(wsrep_buf_t)*2, MYF(0))))
 +    {
 +      WSREP_ERROR("Can't allocate memory for key_parts");
 +      goto err;
 +    }
 +    ka->keys[ka->keys_len].key_parts_num= 2;
 +    ++ka->keys_len;
 +    if (!wsrep_prepare_key_for_isolation(table->db, table->table_name,
 +                                         (wsrep_buf_t*)ka->keys[ka->keys_len - 1].key_parts,
 +                                         &ka->keys[ka->keys_len - 1].key_parts_num))
 +    {
 +      WSREP_ERROR("Preparing keys for isolation failed (2)");
 +      goto err;
 +    }
 +  }
 +    return 0;
  err:
      wsrep_keys_free(ka);
 -    return false;
 +    return 1;
  }
  
  
diff --cc sql/wsrep_mysqld.h
index 6dabdb66022,56e3baae7cc..94c97f04aab
--- a/sql/wsrep_mysqld.h
+++ b/sql/wsrep_mysqld.h
@@@ -139,9 -125,17 +139,10 @@@ extern const char* wsrep_provider_name
  extern const char* wsrep_provider_version;
  extern const char* wsrep_provider_vendor;
  
 -// Other wsrep global variables
 -extern my_bool     wsrep_inited; // whether wsrep is initialized ?
 -
  int  wsrep_show_status(THD *thd, SHOW_VAR *var, char *buff);
+ int  wsrep_show_ready(THD *thd, SHOW_VAR *var, char *buff);
  void wsrep_free_status(THD *thd);
  
 -/* Filters out --wsrep-new-cluster oprtion from argv[]
 - * should be called in the very beginning of main() */
 -void wsrep_filter_new_cluster (int* argc, char* argv[]);
 -
  int  wsrep_init();
  void wsrep_deinit(bool free_options);
  void wsrep_recover();
@@@ -255,8 -245,7 +256,9 @@@ extern wsrep_seqno_t wsrep_locked_seqno
  #define WSREP_PROVIDER_EXISTS                                                  \
    (wsrep_provider && strncasecmp(wsrep_provider, WSREP_NONE, FN_REFLEN))
  
 +#define WSREP_QUERY(thd) (thd->query())
 +
+ extern my_bool wsrep_ready_get();
  extern void wsrep_ready_wait();
  
  enum wsrep_trx_status {
@@@ -332,11 -316,23 +334,22 @@@ int wsrep_create_trigger_query(THD *thd
  int wsrep_create_event_query(THD *thd, uchar** buf, size_t* buf_len);
  int wsrep_alter_event_query(THD *thd, uchar** buf, size_t* buf_len);
  
 -struct xid_t;
 -void wsrep_set_SE_checkpoint(xid_t*);
 -void wsrep_get_SE_checkpoint(wsrep_uuid_t&, wsrep_seqno_t&);
 -void wsrep_xid_init(xid_t*, const wsrep_uuid_t*, wsrep_seqno_t);
 -const wsrep_uuid_t* wsrep_xid_uuid(const xid_t*);
 -wsrep_seqno_t wsrep_xid_seqno(const xid_t*);
 -extern "C" int wsrep_is_wsrep_xid(const void* xid);
 +#ifdef GTID_SUPPORT
 +void wsrep_init_sidno(const wsrep_uuid_t&);
 +#endif /* GTID_SUPPORT */
 +
 +bool wsrep_node_is_donor();
 +bool wsrep_node_is_synced();
  
+ typedef struct wsrep_key_arr
+ {
+     wsrep_key_t* keys;
+     size_t       keys_len;
+ } wsrep_key_arr_t;
+ bool wsrep_prepare_keys_for_isolation(THD*              thd,
+                                       const char*       db,
+                                       const char*       table,
+                                       const TABLE_LIST* table_list,
+                                       wsrep_key_arr_t*  ka);
+ void wsrep_keys_free(wsrep_key_arr_t* key_arr);
  #endif /* WSREP_MYSQLD_H */
diff --cc sql/wsrep_thd.cc
index 307745ff1b0,4d665775f2d..328bcbd6be6
--- a/sql/wsrep_thd.cc
+++ b/sql/wsrep_thd.cc
@@@ -381,7 -287,7 +381,7 @@@ static void wsrep_replication_process(T
    case WSREP_TRX_MISSING:
      /* these suggests a bug in provider code */
      WSREP_WARN("bad return from recv() call: %d", rcode);
--    /* fall through to node shutdown */
++    /* fall through */
    case WSREP_FATAL:
      /* Cluster connectivity is lost.
       *
diff --cc storage/heap/ha_heap.cc
index c1dad6a9943,ec76d08bf97..29bf924dc26
--- a/storage/heap/ha_heap.cc
+++ b/storage/heap/ha_heap.cc
@@@ -91,16 -100,7 +91,7 @@@ ha_heap::ha_heap(handlerton *hton, TABL
  
  int ha_heap::open(const char *name, int mode, uint test_if_locked)
  {
-   if (table->s->reclength < sizeof (char*))
-   {
-     MEM_UNDEFINED(table->s->default_values + table->s->reclength,
-                   sizeof(char*) - table->s->reclength);
-     table->s->reclength= sizeof(char*);
-     MEM_UNDEFINED(table->record[0], table->s->reclength);
-     MEM_UNDEFINED(table->record[1], table->s->reclength);
-   }
- 
 -  internal_table= test(test_if_locked & HA_OPEN_INTERNAL_TABLE);
 +  internal_table= MY_TEST(test_if_locked & HA_OPEN_INTERNAL_TABLE);
    if (internal_table || (!(file= heap_open(name, mode)) && my_errno == ENOENT))
    {
      HP_CREATE_INFO create_info;
@@@ -723,7 -727,7 +714,7 @@@ heap_prepare_hp_create_info(TABLE *tabl
        }
      }
    }
-   mem_per_row+= MY_ALIGN(share->reclength + 1, sizeof(char*));
 -  mem_per_row+= MY_ALIGN(max(share->reclength, sizeof(char*)) + 1, sizeof(char*));
++  mem_per_row+= MY_ALIGN(MY_MAX(share->reclength, sizeof(char*)) + 1, sizeof(char*));
    if (table_arg->found_next_number_field)
    {
      keydef[share->next_number_index].flag|= HA_AUTO_KEY;
diff --cc storage/heap/hp_create.c
index d03c7c46f15,1daca0beeb5..29c031c466c
--- a/storage/heap/hp_create.c
+++ b/storage/heap/hp_create.c
@@@ -58,9 -59,9 +59,9 @@@ int heap_create(const char *name, HP_CR
      
      /*
        We have to store sometimes uchar* del_link in records,
-       so the record length should be at least sizeof(uchar*)
+       so the visible_offset must be at least sizeof(uchar*)
      */
-     set_if_bigger(reclength, sizeof (uchar*));
 -    visible_offset= max(reclength, sizeof (char*));
++    visible_offset= MY_MAX(reclength, sizeof (char*));
      
      for (i= key_segs= max_length= 0, keyinfo= keydef; i < keys; i++, keyinfo++)
      {
diff --cc storage/innobase/handler/ha_innodb.cc
index 5dbd7a1ca91,7aab200fed1..7e943782165
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@@ -1,10 -1,8 +1,10 @@@
  /*****************************************************************************
  
- Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+ Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved.
  Copyright (c) 2008, 2009 Google Inc.
  Copyright (c) 2009, Percona Inc.
 +Copyright (c) 2012, Facebook Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2018, MariaDB Corporation.
  
  Portions of this file contain modifications contributed and copyrighted by
  Google, Inc. Those modifications are gratefully acknowledged and are described
@@@ -1603,2924 -934,700 +1603,2928 @@@ innobase_release_temporary_latches
  
  	if (!innodb_inited) {
  
 -		return(0);
 +		return(0);
 +	}
 +
 +	trx_t*	trx = thd_to_trx(thd);
 +
 +	if (trx != NULL) {
 +		trx_search_latch_release_if_reserved(trx);
 +	}
 +
 +	return(0);
 +}
 +
 +/********************************************************************//**
 +Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
 +time calls srv_active_wake_master_thread. This function should be used
 +when a single database operation may introduce a small need for
 +server utility activity, like checkpointing. */
 +static inline
 +void
 +innobase_active_small(void)
 +/*=======================*/
 +{
 +	innobase_active_counter++;
 +
 +	if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
 +		srv_active_wake_master_thread();
 +	}
 +}
 +
 +/********************************************************************//**
 +Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 +about a possible transaction rollback inside InnoDB caused by a lock wait
 +timeout or a deadlock.
 +@return	MySQL error code */
 +static
 +int
 +convert_error_code_to_mysql(
 +/*========================*/
 +	dberr_t	error,	/*!< in: InnoDB error code */
 +	ulint	flags,  /*!< in: InnoDB table flags, or 0 */
 +	THD*	thd)	/*!< in: user thread handle or NULL */
 +{
 +	switch (error) {
 +	case DB_SUCCESS:
 +		return(0);
 +
 +	case DB_INTERRUPTED:
 +                return(HA_ERR_ABORTED_BY_USER);
 +
 +	case DB_FOREIGN_EXCEED_MAX_CASCADE:
 +		ut_ad(thd);
 +		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 +				    HA_ERR_ROW_IS_REFERENCED,
 +				    "InnoDB: Cannot delete/update "
 +				    "rows with cascading foreign key "
 +				    "constraints that exceed max "
 +				    "depth of %d. Please "
 +				    "drop extra constraints and try "
 +				    "again", DICT_FK_MAX_RECURSIVE_LOAD);
 +
 +		/* fall through */
 +
 +	case DB_ERROR:
 +	default:
 +		return(-1); /* unspecified error */
 +
 +	case DB_DUPLICATE_KEY:
 +		/* Be cautious with returning this error, since
 +		mysql could re-enter the storage layer to get
 +		duplicated key info, the operation requires a
 +		valid table handle and/or transaction information,
 +		which might not always be available in the error
 +		handling stage. */
 +		return(HA_ERR_FOUND_DUPP_KEY);
 +
 +	case DB_READ_ONLY:
 +		return(HA_ERR_TABLE_READONLY);
 +
 +	case DB_FOREIGN_DUPLICATE_KEY:
 +		return(HA_ERR_FOREIGN_DUPLICATE_KEY);
 +
 +	case DB_MISSING_HISTORY:
 +		return(HA_ERR_TABLE_DEF_CHANGED);
 +
 +	case DB_RECORD_NOT_FOUND:
 +		return(HA_ERR_NO_ACTIVE_RECORD);
 +
 +	case DB_DEADLOCK:
 +		/* Since we rolled back the whole transaction, we must
 +		tell it also to MySQL so that MySQL knows to empty the
 +		cached binlog for this transaction */
 +
 +		if (thd) {
 +			thd_mark_transaction_to_rollback(thd, TRUE);
 +		}
 +
 +		return(HA_ERR_LOCK_DEADLOCK);
 +
 +	case DB_LOCK_WAIT_TIMEOUT:
 +		/* Starting from 5.0.13, we let MySQL just roll back the
 +		latest SQL statement in a lock wait timeout. Previously, we
 +		rolled back the whole transaction. */
 +
 +		if (thd) {
 +			thd_mark_transaction_to_rollback(
 +				thd, (bool) row_rollback_on_timeout);
 +		}
 +
 +		return(HA_ERR_LOCK_WAIT_TIMEOUT);
 +
 +	case DB_NO_REFERENCED_ROW:
 +		return(HA_ERR_NO_REFERENCED_ROW);
 +
 +	case DB_ROW_IS_REFERENCED:
 +		return(HA_ERR_ROW_IS_REFERENCED);
 +
 +	case DB_CANNOT_ADD_CONSTRAINT:
 +	case DB_CHILD_NO_INDEX:
 +	case DB_PARENT_NO_INDEX:
 +		return(HA_ERR_CANNOT_ADD_FOREIGN);
 +
 +	case DB_CANNOT_DROP_CONSTRAINT:
 +
 +		return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
 +						misleading, a new MySQL error
 +						code should be introduced */
 +
 +	case DB_CORRUPTION:
 +		return(HA_ERR_CRASHED);
 +
 +	case DB_OUT_OF_FILE_SPACE:
 +		return(HA_ERR_RECORD_FILE_FULL);
 +
 +	case DB_TEMP_FILE_WRITE_FAILURE:
 +		my_error(ER_GET_ERRMSG, MYF(0),
 +                         DB_TEMP_FILE_WRITE_FAILURE,
 +                         ut_strerr(DB_TEMP_FILE_WRITE_FAILURE),
 +                         "InnoDB");
 +		return(HA_ERR_INTERNAL_ERROR);
 +
 +	case DB_TABLE_IN_FK_CHECK:
 +		return(HA_ERR_TABLE_IN_FK_CHECK);
 +
 +	case DB_TABLE_IS_BEING_USED:
 +		return(HA_ERR_WRONG_COMMAND);
 +
 +	case DB_TABLESPACE_DELETED:
 +	case DB_TABLE_NOT_FOUND:
 +		return(HA_ERR_NO_SUCH_TABLE);
 +
 +	case DB_TABLESPACE_NOT_FOUND:
 +		return(HA_ERR_NO_SUCH_TABLE);
 +
 +	case DB_TOO_BIG_RECORD: {
 +		/* If prefix is true then a 768-byte prefix is stored
 +		locally for BLOB fields. Refer to dict_table_get_format() */
 +		bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A);
 +		my_printf_error(ER_TOO_BIG_ROWSIZE,
 +			"Row size too large (> %lu). Changing some columns "
 +			"to TEXT or BLOB %smay help. In current row "
 +			"format, BLOB prefix of %d bytes is stored inline.",
 +			MYF(0),
 +			page_get_free_space_of_empty(flags &
 +				DICT_TF_COMPACT) / 2,
 +			prefix ? "or using ROW_FORMAT=DYNAMIC "
 +			"or ROW_FORMAT=COMPRESSED ": "",
 +			prefix ? DICT_MAX_FIXED_COL_LEN : 0);
 +		return(HA_ERR_TO_BIG_ROW);
 +	}
 +
 +
 +	case DB_TOO_BIG_FOR_REDO:
 +		my_printf_error(ER_TOO_BIG_ROWSIZE, "%s" , MYF(0),
 +				"The size of BLOB/TEXT data inserted"
 +				" in one transaction is greater than"
 +				" 10% of redo log size. Increase the"
 +				" redo log size using innodb_log_file_size.");
 +		return(HA_ERR_TO_BIG_ROW);
 +
 +	case DB_TOO_BIG_INDEX_COL:
 +		my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
 +			 DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
 +		return(HA_ERR_INDEX_COL_TOO_LONG);
 +
 +	case DB_NO_SAVEPOINT:
 +		return(HA_ERR_NO_SAVEPOINT);
 +
 +	case DB_LOCK_TABLE_FULL:
 +		/* Since we rolled back the whole transaction, we must
 +		tell it also to MySQL so that MySQL knows to empty the
 +		cached binlog for this transaction */
 +
 +		if (thd) {
 +			thd_mark_transaction_to_rollback(thd, TRUE);
 +		}
 +
 +		return(HA_ERR_LOCK_TABLE_FULL);
 +
 +	case DB_FTS_INVALID_DOCID:
 +		return(HA_FTS_INVALID_DOCID);
 +	case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
 +		return(HA_ERR_OUT_OF_MEM);
 +	case DB_TOO_MANY_CONCURRENT_TRXS:
 +		return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
 +	case DB_UNSUPPORTED:
 +		return(HA_ERR_UNSUPPORTED);
 +	case DB_INDEX_CORRUPT:
 +		return(HA_ERR_INDEX_CORRUPT);
 +	case DB_UNDO_RECORD_TOO_BIG:
 +		return(HA_ERR_UNDO_REC_TOO_BIG);
 +	case DB_OUT_OF_MEMORY:
 +		return(HA_ERR_OUT_OF_MEM);
 +	case DB_TABLESPACE_EXISTS:
 +		return(HA_ERR_TABLESPACE_EXISTS);
 +	case DB_IDENTIFIER_TOO_LONG:
 +		return(HA_ERR_INTERNAL_ERROR);
 +	case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
 +		return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE);
 +	}
 +}
 +
 +/*************************************************************//**
 +Prints info of a THD object (== user session thread) to the given file. */
 +UNIV_INTERN
 +void
 +innobase_mysql_print_thd(
 +/*=====================*/
 +	FILE*	f,		/*!< in: output stream */
 +	THD*	thd,		/*!< in: MySQL THD object */
 +	uint	max_query_len)	/*!< in: max query length to print, or 0 to
 +				use the default max length */
 +{
 +	char	buffer[1024];
 +
 +	fputs(thd_get_error_context_description(thd, buffer, sizeof buffer,
 +						max_query_len), f);
 +	putc('\n', f);
 +}
 +
 +/******************************************************************//**
 +Get the error message format string.
 +@return the format string or 0 if not found. */
 +UNIV_INTERN
 +const char*
 +innobase_get_err_msg(
 +/*=================*/
 +	int	error_code)	/*!< in: MySQL error code */
 +{
 +	return(my_get_err_msg(error_code));
 +}
 +
 +/******************************************************************//**
 +Get the variable length bounds of the given character set. */
 +UNIV_INTERN
 +void
 +innobase_get_cset_width(
 +/*====================*/
 +	ulint	cset,		/*!< in: MySQL charset-collation code */
 +	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
 +	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
 +{
 +	CHARSET_INFO*	cs;
 +	ut_ad(cset <= MAX_CHAR_COLL_NUM);
 +	ut_ad(mbminlen);
 +	ut_ad(mbmaxlen);
 +
 +	cs = all_charsets[cset];
 +	if (cs) {
 +		*mbminlen = cs->mbminlen;
 +		*mbmaxlen = cs->mbmaxlen;
 +		ut_ad(*mbminlen < DATA_MBMAX);
 +		ut_ad(*mbmaxlen < DATA_MBMAX);
 +	} else {
 +		THD*	thd = current_thd;
 +
 +		if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
 +
 +			/* Fix bug#46256: allow tables to be dropped if the
 +			collation is not found, but issue a warning. */
 +			if ((global_system_variables.log_warnings)
 +			    && (cset != 0)){
 +
 +				sql_print_warning(
 +					"Unknown collation #%lu.", cset);
 +			}
 +		} else {
 +
 +			ut_a(cset == 0);
 +		}
 +
 +		*mbminlen = *mbmaxlen = 0;
 +	}
 +}
 +
 +/******************************************************************//**
 +Converts an identifier to a table name. */
 +UNIV_INTERN
 +void
 +innobase_convert_from_table_id(
 +/*===========================*/
 +	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 +	char*			to,	/*!< out: converted identifier */
 +	const char*		from,	/*!< in: identifier to convert */
 +	ulint			len)	/*!< in: length of 'to', in bytes */
 +{
 +	uint	errors;
 +
 +	strconvert(cs, from, FN_REFLEN, &my_charset_filename, to, (uint) len, &errors);
 +}
 +
 +/**********************************************************************
 +Check if the length of the identifier exceeds the maximum allowed.
 +return true when length of identifier is too long. */
 +UNIV_INTERN
 +my_bool
 +innobase_check_identifier_length(
 +/*=============================*/
 +	const char*	id)	/* in: FK identifier to check excluding the
 +				database portion. */
 +{
 +	int		well_formed_error = 0;
 +	CHARSET_INFO	*cs = system_charset_info;
 +	DBUG_ENTER("innobase_check_identifier_length");
 +
 +	size_t len = cs->cset->well_formed_len(
 +		cs, id, id + strlen(id),
 +		NAME_CHAR_LEN, &well_formed_error);
 +
 +	if (well_formed_error || len == NAME_CHAR_LEN) {
 +		my_error(ER_TOO_LONG_IDENT, MYF(0), id);
 +		DBUG_RETURN(true);
 +	}
 +	DBUG_RETURN(false);
 +}
 +
 +/******************************************************************//**
 +Converts an identifier to UTF-8. */
 +UNIV_INTERN
 +void
 +innobase_convert_from_id(
 +/*=====================*/
 +	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 +	char*			to,	/*!< out: converted identifier */
 +	const char*		from,	/*!< in: identifier to convert */
 +	ulint			len)	/*!< in: length of 'to', in bytes */
 +{
 +	uint	errors;
 +
 +	strconvert(cs, from, FN_REFLEN, system_charset_info, to, (uint) len, &errors);
 +}
 +
 +/******************************************************************//**
 +Compares NUL-terminated UTF-8 strings case insensitively.
 +@return	0 if a=b, <0 if a<b, >0 if a>b */
 +UNIV_INTERN
 +int
 +innobase_strcasecmp(
 +/*================*/
 +	const char*	a,	/*!< in: first string to compare */
 +	const char*	b)	/*!< in: second string to compare */
 +{
 +	if (!a) {
 +		if (!b) {
 +			return(0);
 +		} else {
 +			return(-1);
 +		}
 +	} else if (!b) {
 +		return(1);
 +	}
 +
 +	return(my_strcasecmp(system_charset_info, a, b));
 +}
 +
 +/******************************************************************//**
 +Compares NUL-terminated UTF-8 strings case insensitively. The
 +second string contains wildcards.
 +@return 0 if a match is found, 1 if not */
 +UNIV_INTERN
 +int
 +innobase_wildcasecmp(
 +/*=================*/
 +	const char*	a,	/*!< in: string to compare */
 +	const char*	b)	/*!< in: wildcard string to compare */
 +{
 +	return(wild_case_compare(system_charset_info, a, b));
 +}
 +
 +/******************************************************************//**
 +Strip dir name from a full path name and return only the file name
 +@return file name or "null" if no file name */
 +UNIV_INTERN
 +const char*
 +innobase_basename(
 +/*==============*/
 +	const char*	path_name)	/*!< in: full path name */
 +{
 +	const char*	name = base_name(path_name);
 +
 +	return((name) ? name : "null");
 +}
 +
 +/******************************************************************//**
 +Makes all characters in a NUL-terminated UTF-8 string lower case. */
 +UNIV_INTERN
 +void
 +innobase_casedn_str(
 +/*================*/
 +	char*	a)	/*!< in/out: string to put in lower case */
 +{
 +	my_casedn_str(system_charset_info, a);
 +}
 +
 +/**********************************************************************//**
 +Determines the connection character set.
 +@return	connection character set */
 +UNIV_INTERN
 +struct charset_info_st*
 +innobase_get_charset(
 +/*=================*/
 +	THD*	mysql_thd)	/*!< in: MySQL thread handle */
 +{
 +	return(thd_charset(mysql_thd));
 +}
 +
 +/**********************************************************************//**
 +Determines the current SQL statement.
 +@return	SQL statement string */
 +UNIV_INTERN
 +const char*
 +innobase_get_stmt(
 +/*==============*/
 +	THD*	thd,		/*!< in: MySQL thread handle */
 +	size_t*	length)		/*!< out: length of the SQL statement */
 +{
 +	if (const LEX_STRING *stmt = thd_query_string(thd)) {
 +		*length = stmt->length;
 +		return stmt->str;
 +	}
 +	return NULL;
 +}
 +
 +/**********************************************************************//**
 +Reads the current value of the tdc_size global parameter. The read is
 +done without any synchronization (a "dirty read"): there is no
 +synchronization object for it, and little harm is done even if the
 +read is torn.
 +@return	value of tdc_size */
 +UNIV_INTERN
 +ulint
 +innobase_get_table_cache_size(void)
 +/*===============================*/
 +{
 +	const ulint	cache_size = tdc_size;
 +
 +	return(cache_size);
 +}
 +
 +/**********************************************************************//**
 +Reads the current value of the lower_case_table_names global parameter
 +from mysqld.cc. The read is done without any synchronization (a "dirty
 +read"): there is no synchronization object for it, and little harm is
 +done even if the read is torn.
 +@return	value of lower_case_table_names */
 +UNIV_INTERN
 +ulint
 +innobase_get_lower_case_table_names(void)
 +/*=====================================*/
 +{
 +	const ulint	setting = lower_case_table_names;
 +
 +	return(setting);
 +}
 +
 +/** Create a temporary file in the location specified by the parameter
 +path. If the path is null, then it will be created in tmpdir.
 +
 +The descriptor handed back to the caller is a duplicate of the one
 +obtained from mysql_tmpfile()/mysql_tmpfile_path(); the original
 +descriptor is closed with my_close() before returning.
 +@param[in]	path	location for creating temporary file
 +@return	temporary file descriptor, or < 0 on error */
 +UNIV_INTERN
 +int
 +innobase_mysql_tmpfile(
 +	const char*	path)
 +{
 +#ifdef WITH_INNODB_DISALLOW_WRITES
 +	/* Wait on srv_allow_writes_event before creating anything. */
 +	os_event_wait(srv_allow_writes_event);
 +#endif /* WITH_INNODB_DISALLOW_WRITES */
 +	/* fd2 is the value returned; it stays -1 unless the duplication
 +	below succeeds. */
 +	int	fd2 = -1;
 +	File	fd;
 +
 +	/* Debug-only failure injection: report an error straight away. */
 +	DBUG_EXECUTE_IF(
 +		"innobase_tmpfile_creation_failure",
 +		return(-1);
 +	);
 +
 +	if (path == NULL) {
 +		fd = mysql_tmpfile("ib");
 +	} else {
 +		fd = mysql_tmpfile_path(path, "ib");
 +	}
 +
 +	if (fd >= 0) {
 +		/* Copy the file descriptor, so that the additional resources
 +		allocated by create_temp_file() can be freed by invoking
 +		my_close().
 +
 +		Because the file descriptor returned by this function
 +		will be passed to fdopen(), it will be closed by invoking
 +		fclose(), which in turn will invoke close() instead of
 +		my_close(). */
 +
 +#ifdef _WIN32
 +		/* Note that on Windows, the integer returned by mysql_tmpfile
 +		has no relation to C runtime file descriptor. Here, we need
 +		to call my_get_osfhandle to get the HANDLE and then convert it
 +		to C runtime filedescriptor. */
 +		{
 +			HANDLE hFile = my_get_osfhandle(fd);
 +			HANDLE hDup;
 +			BOOL bOK = DuplicateHandle(
 +					GetCurrentProcess(),
 +					hFile, GetCurrentProcess(),
 +					&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
 +			if (bOK) {
 +				fd2 = _open_osfhandle((intptr_t) hDup, 0);
 +			} else {
 +				/* Pass the Windows error code to
 +				my_osmaperr() and fall through to the
 +				common error reporting below. */
 +				my_osmaperr(GetLastError());
 +				fd2 = -1;
 +			}
 +		}
++#else
++#ifdef F_DUPFD_CLOEXEC
++		fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
 +#else
 +		fd2 = dup(fd);
++#endif
 +#endif
 +		if (fd2 < 0) {
 +			/* Duplication failed: save errno in my_errno and
 +			report EE_OUT_OF_FILERESOURCES for "ib*". */
 +			DBUG_PRINT("error",("Got error %d on dup",fd2));
 +			my_errno=errno;
 +			my_error(EE_OUT_OF_FILERESOURCES,
 +				 MYF(ME_BELL+ME_WAITTANG),
 +				 "ib*", my_errno);
 +		}
 +		/* The original descriptor is closed whether or not the
 +		duplication succeeded. */
 +		my_close(fd, MYF(MY_WME));
 +	}
 +	return(fd2);
 +}
 +
 +/*********************************************************************//**
 +Thin wrapper around MySQL's copy_and_convert(): casts the untyped buffers
 +to char pointers and the lengths to uint32, then forwards the call.
 +@return	number of bytes copied to 'to' */
 +UNIV_INTERN
 +ulint
 +innobase_convert_string(
 +/*====================*/
 +	void*		to,		/*!< out: converted string */
 +	ulint		to_length,	/*!< in: number of bytes reserved
 +					for the converted string */
 +	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 +	const void*	from,		/*!< in: string to convert */
 +	ulint		from_length,	/*!< in: number of bytes to convert */
 +	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
 +					from */
 +	uint*		errors)		/*!< out: number of errors encountered
 +					during the conversion */
 +{
 +	char*		dst = (char*) to;
 +	const char*	src = (const char*) from;
 +	const uint32	dst_len = (uint32) to_length;
 +	const uint32	src_len = (uint32) from_length;
 +
 +	return(copy_and_convert(dst, dst_len, to_cs,
 +				src, src_len, from_cs,
 +				errors));
 +}
 +
 +/*******************************************************************//**
 +Formats the raw data in "data" (in InnoDB on-disk format) that is of
 +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
 +the result to "buf". The data is first converted to "system_charset_info"
 +in a temporary buffer and then passed through ut_str_sql_format().
 +Not more than "buf_size" bytes are written to "buf".
 +The result is always NUL-terminated (provided buf_size > 0) and the
 +number of bytes that were written to "buf" is returned (including the
 +terminating NUL).
 +@return	number of bytes that were written */
 +UNIV_INTERN
 +ulint
 +innobase_raw_format(
 +/*================*/
 +	const char*	data,		/*!< in: raw data */
 +	ulint		data_len,	/*!< in: raw data length
 +					in bytes */
 +	ulint		charset_coll,	/*!< in: charset collation */
 +	char*		buf,		/*!< out: output buffer */
 +	ulint		buf_size)	/*!< in: output buffer size
 +					in bytes */
 +{
 +	/* XXX a fixed-size stack buffer is used as a hard limit instead
 +	of allocating buf_size bytes from the heap */
 +	char		converted[8192];
 +	uint		n_errors;
 +	CHARSET_INFO*	src_cs = all_charsets[charset_coll];
 +
 +	const ulint	converted_len = innobase_convert_string(
 +		converted, sizeof(converted), system_charset_info,
 +		data, data_len, src_cs, &n_errors);
 +
 +	return(ut_str_sql_format(converted, converted_len, buf, buf_size));
 +}
 +
 +/*********************************************************************//**
 +Compute the next autoinc value.
 +
 +For MySQL replication the autoincrement values can be partitioned among
 +the nodes. The offset is the start or origin of the autoincrement value
 +for a particular node. For n nodes the increment will be n and the offset
 +will be in the interval [1, n]. The formula tries to allocate the next
 +value for a particular node.
 +
 +Note: This function is also called with increment set to the number of
 +values we want to reserve for multi-value inserts e.g.,
 +
 +	INSERT INTO T VALUES(), (), ();
 +
 +innobase_next_autoinc() will be called with increment set to 3 where
 +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
 +the multi-value INSERT above.
 +
 +Note that the max_value argument is only asserted to be non-zero; it is
 +then replaced by the maximum ulonglong value, so the result can exceed
 +the column type's maximum.
 +@return	the next value */
 +UNIV_INTERN
 +ulonglong
 +innobase_next_autoinc(
 +/*==================*/
 +	ulonglong	current,	/*!< in: Current value */
 +	ulonglong	need,		/*!< in: count of values needed */
 +	ulonglong	step,		/*!< in: AUTOINC increment step */
 +	ulonglong	offset,		/*!< in: AUTOINC offset */
 +	ulonglong	max_value)	/*!< in: max value for type */
 +{
 +	ulonglong	next_value;
 +	/* Total span reserved by this call. */
 +	ulonglong	block = need * step;
 +
 +	/* Should never be 0. */
 +	ut_a(need > 0);
 +	ut_a(block > 0);
 +	ut_a(max_value > 0);
 +
 +        /*
 +          Allow auto_increment to go over max_value up to max ulonglong.
 +          This allows us to detect that all values are exhausted.
 +          If we don't do this, we will return max_value several times
 +          and get duplicate key errors instead of auto increment value
 +          out of range.
 +        */
 +        max_value= (~(ulonglong) 0);
 +
 +	/* According to MySQL documentation, if the offset is greater than
 +	the step then the offset is ignored. Note that the comparison here
 +	is made against block (need * step), not against step alone. */
 +	if (offset > block) {
 +		offset = 0;
 +	}
 +
 +	/* Check for overflow. Current can be > max_value if the value is
 +	in reality a negative value. The Visual Studio compilers convert
 +	large double values automatically into the unsigned long long
 +	datatype maximum value. */
 +
 +	if (block >= max_value
 +	    || offset > max_value
 +	    || current >= max_value
 +	    || max_value - offset <= offset) {
 +
 +		next_value = max_value;
 +	} else {
 +		ut_a(max_value > current);
 +
 +		/* Room left between the current value and max_value. */
 +		ulonglong	free = max_value - current;
 +
 +		if (free < offset || free - offset <= block) {
 +			next_value = max_value;
 +		} else {
 +			/* 0 is used as a marker: the value is computed
 +			in the branch below. */
 +			next_value = 0;
 +		}
 +	}
 +
 +	if (next_value == 0) {
 +		ulonglong	next;
 +
 +		if (current >= offset) {
 +			/* Number of whole steps already consumed past
 +			the offset. */
 +			next = (current - offset) / step;
 +		} else {
 +			/* Current is below the offset: start from the
 +			offset and reserve one step less. */
 +			next = 0;
 +			block -= step;
 +		}
 +
 +		ut_a(max_value > next);
 +		next_value = next * step;
 +		/* Check for multiplication overflow. */
 +		ut_a(next_value >= next);
 +		ut_a(max_value > next_value);
 +
 +		/* Check for overflow */
 +		if (max_value - next_value >= block) {
 +
 +			next_value += block;
 +
 +			if (max_value - next_value >= offset) {
 +				next_value += offset;
 +			} else {
 +				next_value = max_value;
 +			}
 +		} else {
 +			next_value = max_value;
 +		}
 +	}
 +
 +	ut_a(next_value != 0);
 +	ut_a(next_value <= max_value);
 +
 +	return(next_value);
 +}
 +
 +/*********************************************************************//**
 +Copies the foreign key check and unique check settings of a connection
 +into the corresponding fields of an InnoDB transaction object. */
 +static
 +void
 +innobase_trx_init(
 +/*==============*/
 +	THD*	thd,	/*!< in: user thread handle */
 +	trx_t*	trx)	/*!< in/out: InnoDB transaction handle */
 +{
 +	DBUG_ENTER("innobase_trx_init");
 +	DBUG_ASSERT(thd == trx->mysql_thd);
 +
 +	const bool	fk_checks_off = thd_test_options(
 +		thd, OPTION_NO_FOREIGN_KEY_CHECKS);
 +	const bool	unique_checks_relaxed = thd_test_options(
 +		thd, OPTION_RELAXED_UNIQUE_CHECKS);
 +
 +	/* Each trx flag is the negation of the matching THD option. */
 +	trx->check_foreigns = !fk_checks_off;
 +	trx->check_unique_secondary = !unique_checks_relaxed;
 +
 +	DBUG_VOID_RETURN;
 +}
 +
 +/*********************************************************************//**
 +Allocates an InnoDB transaction for a MySQL handler object for DML,
 +binds it to the given thread handle and initializes its option flags.
 +@return	InnoDB transaction handle */
 +UNIV_INTERN
 +trx_t*
 +innobase_trx_allocate(
 +/*==================*/
 +	THD*	thd)	/*!< in: user thread handle */
 +{
 +	DBUG_ENTER("innobase_trx_allocate");
 +	DBUG_ASSERT(thd != NULL);
 +	DBUG_ASSERT(EQ_CURRENT_THD(thd));
 +
 +	trx_t*	new_trx = trx_allocate_for_mysql();
 +
 +	/* innobase_trx_init() asserts this link, so set it first. */
 +	new_trx->mysql_thd = thd;
 +	innobase_trx_init(thd, new_trx);
 +
 +	DBUG_RETURN(new_trx);
 +}
 +
 +/*********************************************************************//**
 +Returns the InnoDB transaction handle attached to a MySQL thread,
 +allocating and attaching a new one if the thread has none yet. An
 +existing handle with a bad magic number is treated as memory corruption.
 +The option flags of the transaction are refreshed on every call.
 +@return	InnoDB transaction handle */
 +static inline
 +trx_t*
 +check_trx_exists(
 +/*=============*/
 +	THD*	thd)	/*!< in: user thread handle */
 +{
 +	/* A reference, so that assigning to it updates the slot that
 +	thd_to_trx() refers to. */
 +	trx_t*&	slot = thd_to_trx(thd);
 +
 +	if (slot != NULL) {
 +		if (UNIV_UNLIKELY(slot->magic_n != TRX_MAGIC_N)) {
 +			mem_analyze_corruption(slot);
 +			ut_error;
 +		}
 +	} else {
 +		slot = innobase_trx_allocate(thd);
 +		thd_set_ha_data(thd, innodb_hton_ptr, slot);
 +	}
 +
 +	innobase_trx_init(thd, slot);
 +
 +	return(slot);
 +}
 +
 +/*********************************************************************//**
 +Check whether a transaction has its is_registered flag set.
 +@return true if transaction is registered with MySQL 2PC coordinator */
 +static inline
 +bool
 +trx_is_registered_for_2pc(
 +/*=========================*/
 +	const trx_t*	trx)	/* in: transaction */
 +{
 +	const bool	registered = (trx->is_registered == 1);
 +
 +	return(registered);
 +}
 +
 +/*********************************************************************//**
 +Note that innobase_commit_ordered() was run. The transaction must
 +already be registered for 2PC. */
 +static inline
 +void
 +trx_set_active_commit_ordered(
 +/*==============================*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	/* Same test as trx_is_registered_for_2pc(). */
 +	ut_a(trx->is_registered == 1);
 +
 +	trx->active_commit_ordered = 1;
 +}
 +
 +/*********************************************************************//**
 +Marks a transaction as registered with the MySQL 2PC coordinator.
 +In debug builds, asserts that active_commit_ordered is not set. */
 +static inline
 +void
 +trx_register_for_2pc(
 +/*==================*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	ut_ad(trx->active_commit_ordered == 0);
 +
 +	trx->is_registered = 1;
 +}
 +
 +/*********************************************************************//**
 +Marks a transaction as deregistered: clears both the is_registered and
 +the active_commit_ordered flags. */
 +static inline
 +void
 +trx_deregister_from_2pc(
 +/*====================*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	trx->active_commit_ordered = 0;
 +	trx->is_registered = 0;
 +}
 +
 +/*********************************************************************//**
 +Check whether a transaction has active_commit_ordered set.
 +@return true if the active_commit_ordered flag is 1 */
 +static inline
 +bool
 +trx_is_active_commit_ordered(
 +/*=========================*/
 +	const trx_t*	trx)	/* in: transaction */
 +{
 +	const bool	ordered = (trx->active_commit_ordered == 1);
 +
 +	return(ordered);
 +}
 +
 +/*********************************************************************//**
 +Check if transaction is started.
 +@return true if the transaction state is anything other than
 +TRX_STATE_NOT_STARTED */
 +static
 +bool
 +trx_is_started(
 +/*===========*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	if (trx->state == TRX_STATE_NOT_STARTED) {
 +		return(false);
 +	}
 +
 +	return(true);
 +}
 +
 +/*********************************************************************//**
 +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
 +Those flags are stored in .frm file and end up in the MySQL table object,
 +but are frequently used inside InnoDB so we keep their copies into the
 +InnoDB table object. The flags copied are the persistent statistics
 +on/off options, the statistics auto recalc setting and the number of
 +statistics sample pages. */
 +UNIV_INTERN
 +void
 +innobase_copy_frm_flags_from_create_info(
 +/*=====================================*/
 +	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
 +	const HA_CREATE_INFO*	create_info)	/*!< in: create info */
 +{
 +	/* Defaults apply to temporary tables, which do not use
 +	persistent stats. */
 +	ibool	ps_on = FALSE;
 +	ibool	ps_off = TRUE;
 +
 +	if (!dict_table_is_temporary(innodb_table)) {
 +		ps_on = create_info->table_options
 +			& HA_OPTION_STATS_PERSISTENT;
 +		ps_off = create_info->table_options
 +			& HA_OPTION_NO_STATS_PERSISTENT;
 +	}
 +
 +	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
 +
 +	const bool	recalc_on
 +		= (create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON);
 +	const bool	recalc_off
 +		= (create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
 +
 +	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
 +
 +	innodb_table->stats_sample_pages = create_info->stats_sample_pages;
 +}
 +
 +/*********************************************************************//**
 +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
 +Those flags are stored in .frm file and end up in the MySQL table object,
 +but are frequently used inside InnoDB so we keep their copies into the
 +InnoDB table object. The flags copied are the persistent statistics
 +on/off options, the statistics auto recalc setting and the number of
 +statistics sample pages. */
 +UNIV_INTERN
 +void
 +innobase_copy_frm_flags_from_table_share(
 +/*=====================================*/
 +	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
 +	const TABLE_SHARE*	table_share)	/*!< in: table share */
 +{
 +	/* Defaults apply to temporary tables, which do not use
 +	persistent stats. */
 +	ibool	ps_on = FALSE;
 +	ibool	ps_off = TRUE;
 +
 +	if (!dict_table_is_temporary(innodb_table)) {
 +		ps_on = table_share->db_create_options
 +			& HA_OPTION_STATS_PERSISTENT;
 +		ps_off = table_share->db_create_options
 +			& HA_OPTION_NO_STATS_PERSISTENT;
 +	}
 +
 +	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
 +
 +	const bool	recalc_on
 +		= (table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON);
 +	const bool	recalc_off
 +		= (table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
 +
 +	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
 +
 +	innodb_table->stats_sample_pages = table_share->stats_sample_pages;
 +}
 +
 +/*********************************************************************//**
 +Construct ha_innobase handler: forwards the arguments to the handler
 +base class, sets the table capability flags and zeroes the scan and
 +write-row counters. */
 +UNIV_INTERN
 +ha_innobase::ha_innobase(
 +/*=====================*/
 +	handlerton*	hton,
 +	TABLE_SHARE*	table_arg)
 +	:
 +	handler(hton, table_arg),
 +	int_table_flags(
 +		HA_REC_NOT_IN_SEQ
 +		| HA_NULL_IN_KEY
 +		| HA_CAN_VIRTUAL_COLUMNS
 +		| HA_CAN_INDEX_BLOBS
 +		| HA_CAN_SQL_HANDLER
 +		| HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
 +		| HA_PRIMARY_KEY_IN_READ_INDEX
 +		| HA_BINLOG_ROW_CAPABLE
 +		| HA_CAN_GEOMETRY
 +		| HA_PARTIAL_COLUMN_READ
 +		| HA_TABLE_SCAN_ON_INDEX
 +		| HA_CAN_FULLTEXT
 +		| HA_CAN_FULLTEXT_EXT
 +		| HA_CAN_EXPORT),
 +	start_of_scan(0),
 +	num_write_row(0)
 +{
 +}
 +
 +/*********************************************************************//**
 +Destruct ha_innobase handler. The body is intentionally empty. */
 +UNIV_INTERN
 +ha_innobase::~ha_innobase()
 +/*======================*/
 +{}
 +
 +/*********************************************************************//**
 +Binds this handle to the given thread: makes sure the thread has an
 +InnoDB transaction (allocating one if needed), points the prebuilt
 +struct at that transaction if it differs, and records the thread in
 +user_thd. */
 +UNIV_INTERN inline
 +void
 +ha_innobase::update_thd(
 +/*====================*/
 +	THD*	thd)	/*!< in: thd to use the handle */
 +{
 +	DBUG_ENTER("ha_innobase::update_thd");
 +	DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
 +		   user_thd, thd));
 +
 +	/* The table should have been opened in ha_innobase::open(). */
 +	DBUG_ASSERT(prebuilt->table->n_ref_count > 0);
 +
 +	trx_t* const	thd_trx = check_trx_exists(thd);
 +
 +	if (thd_trx != prebuilt->trx) {
 +		row_update_prebuilt_trx(prebuilt, thd_trx);
 +	}
 +
 +	user_thd = thd;
 +
 +	DBUG_VOID_RETURN;
 +}
 +
 +/*********************************************************************//**
 +Convenience overload: calls update_thd(THD*) with the thread returned by
 +ha_thd(), which in debug builds is asserted to be the current thread. */
 +UNIV_INTERN
 +void
 +ha_innobase::update_thd()
 +/*=====================*/
 +{
 +	THD* const	handler_thd = ha_thd();
 +
 +	ut_ad(EQ_CURRENT_THD(handler_thd));
 +
 +	update_thd(handler_thd);
 +}
 +
 +/*********************************************************************//**
 +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that
 +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback
 +for the transaction. This MUST be called for every transaction for which
 +the user may call commit or rollback. Calling this several times to register
 +the same transaction is allowed, too. This function also registers the
 +current SQL statement. */
 +static inline
 +void
 +innobase_register_trx(
 +/*==================*/
 +	handlerton*	hton,	/* in: Innobase handlerton */
 +	THD*		thd,	/* in: MySQL thd (connection) object */
 +	trx_t*		trx)	/* in: transaction to register */
 +{
 +	/* Always done, whatever the registration state of trx. */
 +	trans_register_ha(thd, FALSE, hton);
 +
 +	/* The second registration happens only for a trx that is not
 +	yet registered, and only when the connection is not in
 +	autocommit mode or is inside BEGIN. */
 +	if (!trx_is_registered_for_2pc(trx)) {
 +		if (thd_test_options(
 +			    thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 +
 +			trans_register_ha(thd, TRUE, hton);
 +		}
 +	}
 +
 +	trx_register_for_2pc(trx);
 +}
 +
 +/*	BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
 +	------------------------------------------------------------
 +
 +1) The use of the query cache for TBL is disabled when there is an
 +uncommitted change to TBL.
 +
 +2) When a change to TBL commits, InnoDB stores the current value of
 +its global trx id counter, let us denote it by INV_TRX_ID, to the table object
 +in the InnoDB data dictionary, and does only allow such transactions whose
 +id <= INV_TRX_ID to use the query cache.
 +
 +3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
 +modification because of an ON DELETE CASCADE, we invalidate the MySQL query
 +cache of TBL immediately.
 +
 +How this is implemented inside InnoDB:
 +
 +1) Since every modification always sets an IX type table lock on the InnoDB
 +table, it is easy to check if there can be uncommitted modifications for a
 +table: just check if there are locks in the lock list of the table.
 +
 +2) When a transaction inside InnoDB commits, it reads the global trx id
 +counter and stores the value INV_TRX_ID to the tables on which it had a lock.
 +
 +3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
 +InnoDB calls an invalidate method for the MySQL query cache for that table.
 +
 +How this is implemented inside sql_cache.cc:
 +
 +1) The query cache for an InnoDB table TBL is invalidated immediately at an
 +INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
 +invalidation to the transaction commit.
 +
 +2) To store or retrieve a value from the query cache of an InnoDB table TBL,
 +any query must first ask InnoDB's permission. We must pass the thd as a
 +parameter because InnoDB will look at the trx id, if any, associated with
 +that thd. Also the full_name which is used as key to search for the table
 +object. The full_name is a string containing the normalized path to the
 +table in the canonical format.
 +
 +3) Use of the query cache for InnoDB tables is now allowed also when
 +AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
 +put restrictions on the use of the query cache.
 +*/
 +
 +/******************************************************************//**
 +The MySQL query cache uses this to check from InnoDB if the query cache at
 +the moment is allowed to operate on an InnoDB table. The SQL query must
 +be a non-locking SELECT.
 +
 +The query cache is allowed to operate on certain query only if this function
 +returns TRUE for all tables in the query.
 +
 +If thd is not in the autocommit state, this function also starts a new
 +transaction for thd if there is no active trx yet, and assigns a consistent
 +read view to it if there is no read view yet.
 +
 +Why a deadlock of threads is not possible: the query cache calls this function
 +at the start of a SELECT processing. Then the calling thread cannot be
 +holding any InnoDB semaphores. The calling thread is holding the
 +query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
 +Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
 +the InnoDB trx_sys->mutex.
 +@return TRUE if permitted, FALSE if not; note that the value FALSE
 +does not mean we should invalidate the query cache: invalidation is
 +called explicitly */
 +static
 +my_bool
 +innobase_query_caching_of_table_permitted(
 +/*======================================*/
 +	THD*	thd,		/*!< in: thd of the user who is trying to
 +				store a result to the query cache or
 +				retrieve it */
 +	char*	full_name,	/*!< in: normalized path to the table */
 +	uint	full_name_len,	/*!< in: length of the normalized path
 +                                to the table */
 +	ulonglong *unused)	/*!< unused for this engine */
 +{
 +	char	norm_name[1000];
 +
 +	ut_a(full_name_len < 999);
 +
 +	trx_t*	trx = check_trx_exists(thd);
 +
 +	if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
 +		/* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
 +		plain SELECT if AUTOCOMMIT is not on. */
 +
 +		return((my_bool)FALSE);
 +	}
 +
 +	/* Holding the search latch here is unexpected: report it, then
 +	release the latch below in any case. */
 +	if (UNIV_UNLIKELY(trx->has_search_latch)) {
 +		sql_print_error("The calling thread is holding the adaptive "
 +				"search, latch though calling "
 +				"innobase_query_caching_of_table_permitted.");
 +		trx_print(stderr, trx, 1024);
 +	}
 +
 +	trx_search_latch_release_if_reserved(trx);
 +
 +	innobase_srv_conc_force_exit_innodb(trx);
 +
 +	const bool	in_autocommit = !thd_test_options(
 +		thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
 +
 +	if (in_autocommit && trx->n_mysql_tables_in_use == 0) {
 +		/* We are going to retrieve the query result from the query
 +		cache. This cannot be a store operation to the query cache
 +		because then MySQL would have locks on tables already.
 +
 +		TODO: if the user has used LOCK TABLES to lock the table,
 +		then we open a transaction in the call of row_.. below.
 +		That trx can stay open until UNLOCK TABLES. The same problem
 +		exists even if we do not use the query cache. MySQL should be
 +		modified so that it ALWAYS calls some cleanup function when
 +		the processing of a query ends!
 +
 +		We can imagine we instantaneously serialize this consistent
 +		read trx to the current trx id counter. If trx2 would have
 +		changed the tables of a query result stored in the cache, and
 +		trx2 would have already committed, making the result obsolete,
 +		then trx2 would have already invalidated the cache. Thus we
 +		can trust the result in the cache is ok for this query. */
 +
 +		return((my_bool)TRUE);
 +	}
 +
 +	/* Normalize the table name to InnoDB format */
 +	normalize_table_name(norm_name, full_name);
 +
 +	innobase_register_trx(innodb_hton_ptr, thd, trx);
 +
 +	const bool	permitted
 +		= row_search_check_if_query_cache_permitted(trx, norm_name);
 +
 +	return((my_bool) (permitted ? TRUE : FALSE));
 +}
 +
 +/*****************************************************************//**
 +Invalidates the MySQL query cache for the table. Does nothing when the
 +server is built without HAVE_QUERY_CACHE. */
 +UNIV_INTERN
 +void
 +innobase_invalidate_query_cache(
 +/*============================*/
 +	trx_t*		trx,		/*!< in: transaction which
 +					modifies the table */
 +	const char*	full_name,	/*!< in: concatenation of
 +					database name, null char NUL,
 +					table name, null char NUL;
 +					NOTE that in Windows this is
 +					always in LOWER CASE! */
 +	ulint		full_name_len)	/*!< in: full name length where
 +					also the null chars count */
 +{
 +	/* Note that the sync0sync.h rank of the query cache mutex is just
 +	above the InnoDB trx_sys_t->lock. The caller of this function must
 +	not have latches of a lower rank. */
 +
 +#ifdef HAVE_QUERY_CACHE
 +	char		key[2 * (NAME_LEN + 1)];
 +
 +	/* The table name starts right after the NUL that ends the
 +	database name. */
 +	const char*	table_part = full_name + strlen(full_name) + 1;
 +
 +	/* Construct the key("db-name\0table$name\0") for the query cache using
 +	the path name("db@002dname\0table@0024name\0") of the table in its
 +	canonical form. */
 +	const size_t	db_len = filename_to_tablename(
 +		full_name, key, sizeof(key));
 +
 +	const size_t	table_len = filename_to_tablename(
 +		table_part, key + db_len + 1, sizeof(key) - db_len - 1);
 +
 +	/* Argument TRUE below means we are using transactions */
 +	mysql_query_cache_invalidate4(trx->mysql_thd,
 +				      key,
 +				      (db_len + table_len + 2),
 +				      TRUE);
 +#endif
 +}
 +
 +/*****************************************************************//**
 +Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
 +and quote it if needed. The output is truncated to fit buflen and is
 +NOT NUL-terminated by this function.
 +@return	pointer to the end of buf */
 +static
 +char*
 +innobase_convert_identifier(
 +/*========================*/
 +	char*		buf,	/*!< out: buffer for converted identifier */
 +	ulint		buflen,	/*!< in: length of buf, in bytes */
 +	const char*	id,	/*!< in: identifier to convert */
 +	ulint		idlen,	/*!< in: length of id, in bytes */
 +	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 +	ibool		file_id)/*!< in: TRUE=id is a table or database name;
 +				FALSE=id is an UTF-8 string */
 +{
 +	/* Receives the output of explain_filename() when file_id is set. */
 +	char nz2[MAX_TABLE_NAME_LEN + 1];
 +	const char*	s	= id;
 +	int		q;
 +
 +	if (file_id) {
 +
 +		char nz[MAX_TABLE_NAME_LEN + 1];
 +
 +		/* Decode the table name.  The MySQL function expects
 +		a NUL-terminated string.  The input and output strings
 +		buffers must not be shared. */
 +		ut_a(idlen <= MAX_TABLE_NAME_LEN);
 +		memcpy(nz, id, idlen);
 +		nz[idlen] = 0;
 +
 +		s = nz2;
 +		idlen = explain_filename(thd, nz, nz2, sizeof nz2,
 +					 EXPLAIN_PARTITIONS_AS_COMMENT);
 +		/* The decoded name is copied out as is, without any
 +		further quoting by this function. */
 +		goto no_quote;
 +	}
 +
 +	/* See if the identifier needs to be quoted. */
 +	if (UNIV_UNLIKELY(!thd)) {
 +		/* Without a connection, always quote with '"'. */
 +		q = '"';
 +	} else {
 +		q = get_quote_char_for_identifier(thd, s, (int) idlen);
 +	}
 +
 +	/* EOF is taken to mean that no quoting is needed. */
 +	if (q == EOF) {
 +no_quote:
 +		/* Plain copy, truncated to the size of the output buffer. */
 +		if (UNIV_UNLIKELY(idlen > buflen)) {
 +			idlen = buflen;
 +		}
 +		memcpy(buf, s, idlen);
 +		return(buf + idlen);
 +	}
 +
 +	/* Quote the identifier. At least two bytes are needed for the
 +	opening and closing quote characters; otherwise nothing is
 +	written. */
 +	if (buflen < 2) {
 +		return(buf);
 +	}
 +
 +	*buf++ = q;
 +	buflen--;
 +
 +	/* Copy the identifier, doubling any embedded quote character.
 +	Each size check keeps one byte in reserve for the closing quote;
 +	copying stops early when the buffer is exhausted. */
 +	for (; idlen; idlen--) {
 +		int	c = *s++;
 +		if (UNIV_UNLIKELY(c == q)) {
 +			if (UNIV_UNLIKELY(buflen < 3)) {
 +				break;
 +			}
 +
 +			*buf++ = c;
 +			*buf++ = c;
 +			buflen -= 2;
 +		} else {
 +			if (UNIV_UNLIKELY(buflen < 2)) {
 +				break;
 +			}
 +
 +			*buf++ = c;
 +			buflen--;
 +		}
 +	}
 +
 +	/* Closing quote. */
 +	*buf++ = q;
 +	return(buf);
 +}
 +
 +/*****************************************************************//**
 +Convert a table or index name to the MySQL system_charset_info (UTF-8)
 +and quote it if needed. A table name of the form "db/table" is written
 +as the converted database name, a '.', and the converted table name.
 +An index name starting with TEMP_INDEX_PREFIX is written without the
 +prefix and with "--temporary--" appended when that still fits.
 +The output is not NUL-terminated by this function.
 +@return	pointer to the end of buf */
 +UNIV_INTERN
 +char*
 +innobase_convert_name(
 +/*==================*/
 +	char*		buf,	/*!< out: buffer for converted identifier */
 +	ulint		buflen,	/*!< in: length of buf, in bytes */
 +	const char*	id,	/*!< in: identifier to convert */
 +	ulint		idlen,	/*!< in: length of id, in bytes */
 +	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 +	ibool		table_id)/*!< in: TRUE=id is a table or database name;
 +				FALSE=id is an index name */
 +{
 +	char*		s	= buf;
 +	const char*	bufend	= buf + buflen;
 +
 +	if (table_id) {
 +		const char*	slash = (const char*) memchr(id, '/', idlen);
 +		if (!slash) {
 +
 +			/* No database part: convert the whole id as a
 +			single identifier. */
 +			goto no_db_name;
 +		}
 +
 +		/* Print the database name and table name separately. */
 +		s = innobase_convert_identifier(s, bufend - s, id, slash - id,
 +						thd, TRUE);
 +		/* The separator and the table name are written only if
 +		there is still room after the database name. */
 +		if (UNIV_LIKELY(s < bufend)) {
 +			*s++ = '.';
 +			s = innobase_convert_identifier(s, bufend - s,
 +							slash + 1, idlen
 +							- (slash - id) - 1,
 +							thd, TRUE);
 +		}
 +	} else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
 +		/* Temporary index name (smart ALTER TABLE) */
 +		const char temp_index_suffix[]= "--temporary--";
 +
 +		/* Skip the one-byte prefix before converting. */
 +		s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1,
 +						thd, FALSE);
 +		/* Append the suffix (without its NUL) only if it fits
 +		entirely. */
 +		if (s - buf + (sizeof temp_index_suffix - 1) < buflen) {
 +			memcpy(s, temp_index_suffix,
 +			       sizeof temp_index_suffix - 1);
 +			s += sizeof temp_index_suffix - 1;
 +		}
 +	} else {
 +no_db_name:
 +		s = innobase_convert_identifier(buf, buflen, id, idlen,
 +						thd, table_id);
 +	}
 +
 +	return(s);
 +}
 +
 +/*****************************************************************//**
 +A wrapper function of innobase_convert_name(), convert a table or
 +index name to the MySQL system_charset_info (UTF-8) and quote it if needed.
 +The result in buf is NUL-terminated (provided buflen > 0); if the
 +converted name fills the whole buffer, its last byte is overwritten by
 +the terminating NUL. */
 +UNIV_INTERN
 +void
 +innobase_format_name(
 +/*==================*/
 +	char*		buf,	/*!< out: buffer for converted identifier */
 +	ulint		buflen,	/*!< in: length of buf, in bytes */
 +	const char*	name,	/*!< in: index or table name to format */
 +	ibool		is_index_name) /*!< in: index name */
 +{
 +	const char*     bufend;
 +	ulint		len;
 +
 +	bufend = innobase_convert_name(buf, buflen, name, strlen(name),
 +				       NULL, !is_index_name);
 +
 +	len = (ulint) (bufend - buf);
 +
 +	/* Debug builds still flag a name that filled the whole buffer. */
 +	ut_ad(len < buflen);
 +
 +	/* In non-debug builds, never write the terminating NUL past the
 +	end of buf: innobase_convert_name() may use all buflen bytes. */
 +	if (len >= buflen) {
 +		if (buflen == 0) {
 +			return;
 +		}
 +
 +		len = buflen - 1;
 +	}
 +
 +	buf[len] = '\0';
 +}
 +
 +/**********************************************************************//**
 +Determines if the currently running transaction has been interrupted,
 +i.e. whether thd_kill_level() reports a non-zero level for its thread.
 +@return	TRUE if interrupted; FALSE also when trx or its mysql_thd
 +is NULL */
 +UNIV_INTERN
 +ibool
 +trx_is_interrupted(
 +/*===============*/
 +	const trx_t*	trx)	/*!< in: transaction */
 +{
 +	if (!trx || !trx->mysql_thd) {
 +		return(FALSE);
 +	}
 +
 +	return(thd_kill_level(trx->mysql_thd) ? TRUE : FALSE);
 +}
 +
 +/**********************************************************************//**
 +Determines if the currently running transaction is in strict mode,
 +i.e. whether the strict_mode variable is set for its thread.
 +@return	TRUE if strict; FALSE also when trx or its mysql_thd is NULL */
 +UNIV_INTERN
 +ibool
 +trx_is_strict(
 +/*==========*/
 +	trx_t*	trx)	/*!< in: transaction */
 +{
 +	if (!trx || !trx->mysql_thd) {
 +		return(FALSE);
 +	}
 +
 +	return(THDVAR(trx->mysql_thd, strict_mode) ? TRUE : FALSE);
 +}
 +
 +/**************************************************************//**
 +Resets some fields of a prebuilt struct. The template is used in fast
 +retrieval of just those column values MySQL needs in its processing.
 +Clears the key-read and FTS query flags and, if an index condition was
 +set, drops it and invalidates the MySQL row template. */
 +inline
 +void
 +ha_innobase::reset_template(void)
 +/*=============================*/
 +{
 +	ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
 +	ut_ad(prebuilt->magic_n2 == prebuilt->magic_n);
 +
 +	/* Force table to be freed in close_thread_table(). This has to
 +	look at in_fts_query before that flag is cleared below. */
 +	DBUG_EXECUTE_IF("free_table_in_fts_query",
 +		if (prebuilt->in_fts_query) {
 +			table->m_needs_reopen = true;
 +		}
 +	);
 +
 +	prebuilt->in_fts_query = 0;
 +	prebuilt->read_just_key = 0;
 +	prebuilt->keep_other_fields_on_keyread = 0;
 +
 +	if (!prebuilt->idx_cond) {
 +		/* No index condition pushdown state to reset. */
 +		return;
 +	}
 +
 +	/* Reset index condition pushdown state. */
 +	prebuilt->idx_cond = NULL;
 +	prebuilt->idx_cond_n_cols = 0;
 +	/* Invalidate prebuilt->mysql_template
 +	in ha_innobase::write_row(). */
 +	prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
 +}
 +
 +/*****************************************************************//**
 +Call this when you have opened a new table handle in HANDLER, before you
 +call index_read_idx() etc. Actually, we can let the cursor stay open even
 +over a transaction commit! Then you should call this before every operation,
 +fetch next etc. This function inits the necessary things even after a
 +transaction commit. */
 +UNIV_INTERN
 +void
 +ha_innobase::init_table_handle_for_HANDLER(void)
 +/*============================================*/
 +{
 +	/* If current thd does not yet have a trx struct, create one.
 +	If the current handle does not yet have a prebuilt struct, create
 +	one. Update the trx pointers in the prebuilt struct. Normally
 +	this operation is done in external_lock. */
 +
 +	update_thd(ha_thd());
 +
 +	/* Initialize the prebuilt struct much like it would be inited in
 +	external_lock */
 +
 +	trx_search_latch_release_if_reserved(prebuilt->trx);
 +
 +	innobase_srv_conc_force_exit_innodb(prebuilt->trx);
 +
 +	/* If the transaction is not started yet, start it */
 +
 +	trx_start_if_not_started_xa(prebuilt->trx);
 +
 +	/* Assign a read view if the transaction does not have it yet */
 +
 +	trx_assign_read_view(prebuilt->trx);
 +
 +	innobase_register_trx(ht, user_thd, prebuilt->trx);
 +
 +	/* We did the necessary inits in this function, no need to repeat them
 +	in row_search_for_mysql */
 +
 +	prebuilt->sql_stat_start = FALSE;
 +
 +	/* We always let HANDLER do its reads as consistent reads, even
 +	if the trx isolation level has been specified as SERIALIZABLE */
 +
 +	prebuilt->select_lock_type = LOCK_NONE;
 +	prebuilt->stored_select_lock_type = LOCK_NONE;
 +
 +	/* Always fetch all columns in the index record */
 +
 +	prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
 +
 +	/* We want always to fetch all columns in the whole row? Or do
 +	we???? */
 +
 +	prebuilt->used_in_HANDLER = TRUE;
 +	reset_template();
 +}
 +
 +/****************************************************************//**
 +Gives the file extensions of an InnoDB single-table tablespace. */
 +static const char* ha_innobase_exts[] = {
 +	".ibd",
 +	".isl",
 +  NullS
 +};
 +
 +/*********************************************************************//**
 +Opens an InnoDB database.
 +@return	0 on success, error code on failure */
 +static
 +int
 +innobase_init(
 +/*==========*/
 +	void	*p)	/*!< in: InnoDB handlerton */
 +{
 +	static char	current_dir[3];		/*!< Set if using current lib */
 +	int		err;
 +	bool		ret;
 +	char		*default_path;
 +	uint		format_id;
 +	ulong		num_pll_degree;
 +
 +	DBUG_ENTER("innobase_init");
 +	handlerton *innobase_hton= (handlerton*) p;
 +	innodb_hton_ptr = innobase_hton;
 +
 +	innobase_hton->state = SHOW_OPTION_YES;
 +	innobase_hton->db_type= DB_TYPE_INNODB;
 +	innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
 +	innobase_hton->close_connection = innobase_close_connection;
 +	innobase_hton->savepoint_set = innobase_savepoint;
 +	innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
 +	innobase_hton->savepoint_rollback_can_release_mdl =
 +				innobase_rollback_to_savepoint_can_release_mdl;
 +	innobase_hton->savepoint_release = innobase_release_savepoint;
 +	innobase_hton->prepare_ordered= NULL;
 +	innobase_hton->commit_ordered= innobase_commit_ordered;
 +	innobase_hton->commit = innobase_commit;
 +	innobase_hton->rollback = innobase_rollback;
 +	innobase_hton->prepare = innobase_xa_prepare;
 +	innobase_hton->recover = innobase_xa_recover;
 +	innobase_hton->commit_by_xid = innobase_commit_by_xid;
 +	innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
 +	innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
 +	innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
 +	innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
 +	innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
 +	innobase_hton->create = innobase_create_handler;
 +	innobase_hton->drop_database = innobase_drop_database;
 +	innobase_hton->panic = innobase_end;
 +
 +	innobase_hton->start_consistent_snapshot =
 +		innobase_start_trx_and_assign_read_view;
 +
 +	innobase_hton->flush_logs = innobase_flush_logs;
 +	innobase_hton->show_status = innobase_show_status;
 +	innobase_hton->flags =
 +		HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS;
 +
 +	innobase_hton->release_temporary_latches =
 +		innobase_release_temporary_latches;
 +#ifdef WITH_WSREP
 +        innobase_hton->wsrep_abort_transaction=wsrep_abort_transaction;
 +        innobase_hton->wsrep_set_checkpoint=innobase_wsrep_set_checkpoint;
 +        innobase_hton->wsrep_get_checkpoint=innobase_wsrep_get_checkpoint;
 +        innobase_hton->wsrep_fake_trx_id=wsrep_fake_trx_id;
 +#endif /* WITH_WSREP */
 +	innobase_hton->kill_query = innobase_kill_query;
 +
 +        if (srv_file_per_table)
 +          innobase_hton->tablefile_extensions = ha_innobase_exts;
 +
 +	ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
 +
 +#ifndef DBUG_OFF
 +	static const char	test_filename[] = "-@";
 +	char			test_tablename[sizeof test_filename
 +				+ sizeof(srv_mysql50_table_name_prefix) - 1];
 +	if ((sizeof(test_tablename)) - 1
 +			!= filename_to_tablename(test_filename,
 +						 test_tablename,
 +						 sizeof(test_tablename), true)
 +			|| strncmp(test_tablename,
 +				   srv_mysql50_table_name_prefix,
 +				   sizeof(srv_mysql50_table_name_prefix) - 1)
 +			|| strcmp(test_tablename
 +				  + sizeof(srv_mysql50_table_name_prefix) - 1,
 +				  test_filename)) {
 +
 +		sql_print_error("tablename encoding has been changed");
 +
 +		goto error;
 +	}
 +#endif /* DBUG_OFF */
 +
 +	/* Check that values don't overflow on 32-bit systems. */
 +	if (sizeof(ulint) == 4) {
 +		if (innobase_buffer_pool_size > UINT_MAX32) {
 +			sql_print_error(
 +				"innobase_buffer_pool_size can't be over 4GB"
 +				" on 32-bit systems");
 +
 +			goto error;
 +		}
 +	}
 +
 +	os_innodb_umask = (ulint) my_umask;
 +
 +	/* First calculate the default path for innodb_data_home_dir etc.,
 +	in case the user has not given any value.
 +
 +	Note that when using the embedded server, the datadirectory is not
 +	necessarily the current directory of this program. */
 +
 +	if (mysqld_embedded) {
 +		default_path = mysql_real_data_home;
 +		fil_path_to_mysql_datadir = mysql_real_data_home;
 +	} else {
 +		/* It's better to use current lib, to keep paths short */
 +		current_dir[0] = FN_CURLIB;
 +		current_dir[1] = FN_LIBCHAR;
 +		current_dir[2] = 0;
 +		default_path = current_dir;
 +	}
 +
 +	ut_a(default_path);
 +
 +	/* Set InnoDB initialization parameters according to the values
 +	read from MySQL .cnf file */
 +
 +	/*--------------- Data files -------------------------*/
 +
 +	/* The default dir for data files is the datadir of MySQL */
 +
 +	srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
 +			 default_path);
 +
 +	/* Set default InnoDB data file size to 12 MB and let it be
 +	auto-extending. Thus users can use InnoDB in >= 4.0 without having
 +	to specify any startup options. */
 +
 +	if (!innobase_data_file_path) {
 +		innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
 +	}
 +
 +	/* Since InnoDB edits the argument in the next call, we make another
 +	copy of it: */
 +
 +	internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
 +						   MYF(MY_FAE));
 +
 +	ret = (bool) srv_parse_data_file_paths_and_sizes(
 +		internal_innobase_data_file_path);
 +	if (ret == FALSE) {
 +		sql_print_error(
 +			"InnoDB: syntax error in innodb_data_file_path"
 +			" or size specified is less than 1 megabyte");
 +mem_free_and_error:
 +		srv_free_paths_and_sizes();
 +		my_free(internal_innobase_data_file_path);
 +		goto error;
 +	}
 +
 +	/* -------------- All log files ---------------------------*/
 +
 +	/* The default dir for log files is the datadir of MySQL */
 +
 +	if (!srv_log_group_home_dir) {
 +		srv_log_group_home_dir = default_path;
 +	}
 +
 +#ifdef UNIV_LOG_ARCHIVE
 +	/* Since innodb_log_arch_dir has no relevance under MySQL,
 +	starting from 4.0.6 we always set it the same as
 +	innodb_log_group_home_dir: */
 +
 +	innobase_log_arch_dir = innobase_log_group_home_dir;
 +
 +	srv_arch_dir = innobase_log_arch_dir;
 +#endif /* UNIV_LOG_ARCHIVE */
 +
 +	srv_normalize_path_for_win(srv_log_group_home_dir);
 +
 +	if (strchr(srv_log_group_home_dir, ';')) {
 +		sql_print_error("syntax error in innodb_log_group_home_dir");
 +		goto mem_free_and_error;
 +	}
 +
 +	if (innobase_mirrored_log_groups == 1) {
 +		sql_print_warning(
 +			"innodb_mirrored_log_groups is an unimplemented "
 +			"feature and the variable will be completely "
 +			"removed in a future version.");
 +	}
 +
 +	if (innobase_mirrored_log_groups > 1) {
 +		sql_print_error(
 +		"innodb_mirrored_log_groups is an unimplemented feature and "
 +		"the variable will be completely removed in a future version. "
 +		"Using values other than 1 is not supported.");
 +		goto mem_free_and_error;
 +	}
 +
 +	if (innobase_mirrored_log_groups == 0) {
 +		/* To throw a deprecation warning message when the option is
 +		passed, the default was changed to '0' (as a workaround). Since
 +		the only value accepted for this option is '1', reset it to 1 */
 +		innobase_mirrored_log_groups = 1;
 +	}
 +
 +	/* Validate the file format by animal name */
 +	if (innobase_file_format_name != NULL) {
 +
 +		format_id = innobase_file_format_name_lookup(
 +			innobase_file_format_name);
 +
 +		if (format_id > UNIV_FORMAT_MAX) {
 +
 +			sql_print_error("InnoDB: wrong innodb_file_format.");
 +
 +			goto mem_free_and_error;
 +		}
 +	} else {
 +		/* Set it to the default file format id. Though this
 +		should never happen. */
 +		format_id = 0;
 +	}
 +
 +	srv_file_format = format_id;
 +
 +	/* Given the type of innobase_file_format_name we have little
 +	choice but to cast away the constness from the returned name.
 +	innobase_file_format_name is used in the MySQL set variable
 +	interface and so can't be const. */
 +
 +	innobase_file_format_name =
 +		(char*) trx_sys_file_format_id_to_name(format_id);
 +
 +	/* Check innobase_file_format_check variable */
 +	if (!innobase_file_format_check) {
 +
 +		/* Set the value to disable checking. */
 +		srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
 +
 +	} else {
 +
 +		/* Set the value to the lowest supported format. */
 +		srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
 +	}
 +
 +	/* Did the user specify a format name that we support?
 +	As a side effect it will update the variable
 +	srv_max_file_format_at_startup */
 +	if (innobase_file_format_validate_and_set(
 +			innobase_file_format_max) < 0) {
 +
 +		sql_print_error("InnoDB: invalid "
 +				"innodb_file_format_max value: "
 +				"should be any value up to %s or its "
 +				"equivalent numeric id",
 +				trx_sys_file_format_id_to_name(
 +					UNIV_FORMAT_MAX));
 +
 +		goto mem_free_and_error;
 +	}
 +
 +	if (innobase_change_buffering) {
 +		ulint	use;
 +
 +		for (use = 0;
 +		     use < UT_ARR_SIZE(innobase_change_buffering_values);
 +		     use++) {
 +			if (!innobase_strcasecmp(
 +				    innobase_change_buffering,
 +				    innobase_change_buffering_values[use])) {
 +				ibuf_use = (ibuf_use_t) use;
 +				goto innobase_change_buffering_inited_ok;
 +			}
 +		}
 +
 +		sql_print_error("InnoDB: invalid value "
 +				"innodb_change_buffering=%s",
 +				innobase_change_buffering);
 +		goto mem_free_and_error;
 +	}
 +
 +innobase_change_buffering_inited_ok:
 +	ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
 +	innobase_change_buffering = (char*)
 +		innobase_change_buffering_values[ibuf_use];
 +
 +	/* Check that interdependent parameters have sane values. */
 +	if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
 +		sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
 +				  " cannot be set higher than"
 +				  " innodb_max_dirty_pages_pct.\n"
 +				  "InnoDB: Setting"
 +				  " innodb_max_dirty_pages_pct_lwm to %lf\n",
 +				  srv_max_buf_pool_modified_pct);
 +
 +		srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
 +	}
 +
 +	if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
 +
 +		if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
 +			/* Avoid overflow. */
 +			srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
 +		} else {
 +			/* The user has not set the value. We should
 +			set it based on innodb_io_capacity. */
 +			srv_max_io_capacity = static_cast<ulong>(
 +				ut_max(2 * srv_io_capacity, 2000));
 +		}
 +
 +	} else if (srv_max_io_capacity < srv_io_capacity) {
 +		sql_print_warning("InnoDB: innodb_io_capacity"
 +				  " cannot be set higher than"
 +				  " innodb_io_capacity_max.\n"
 +				  "InnoDB: Setting"
 +				  " innodb_io_capacity to %lu\n",
 +				  srv_max_io_capacity);
 +
 +		srv_io_capacity = srv_max_io_capacity;
 +	}
 +
 +	if (!is_filename_allowed(srv_buf_dump_filename,
 +				 strlen(srv_buf_dump_filename), FALSE)) {
 +		sql_print_error("InnoDB: innodb_buffer_pool_filename"
 +			" cannot have colon (:) in the file name.");
 +		goto mem_free_and_error;
 +	}
 +
 +	/* --------------------------------------------------*/
 +
 +	srv_file_flush_method_str = innobase_file_flush_method;
 +
 +	srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
 +
 +#ifdef UNIV_LOG_ARCHIVE
 +	srv_log_archive_on = (ulint) innobase_log_archive;
 +#endif /* UNIV_LOG_ARCHIVE */
 +
 +	/* Check that the value of system variable innodb_page_size was
 +	set correctly.  Its value was put into srv_page_size. If valid,
 +	return the associated srv_page_size_shift.*/
 +	srv_page_size_shift = innodb_page_size_validate(srv_page_size);
 +	if (!srv_page_size_shift) {
 +		sql_print_error("InnoDB: Invalid page size=%lu.\n",
 +				srv_page_size);
 +		goto mem_free_and_error;
 +	}
 +	if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: innodb-page-size has been changed"
 +			" from the default value %d to %lu.\n",
 +			UNIV_PAGE_SIZE_DEF, srv_page_size);
 +	}
 +
 +	srv_log_buffer_size = (ulint) innobase_log_buffer_size;
 +
 +	if (innobase_buffer_pool_instances == 0) {
 +		innobase_buffer_pool_instances = 8;
 +
 +#if defined(__WIN__) && !defined(_WIN64)
 +		if (innobase_buffer_pool_size > 1331 * 1024 * 1024) {
 +			innobase_buffer_pool_instances
 +				= ut_min(MAX_BUFFER_POOLS,
 +					(long) (innobase_buffer_pool_size
 +					/ (128 * 1024 * 1024)));
 +		}
 +#endif /* defined(__WIN__) && !defined(_WIN64) */
 +	}
 +	srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
 +	srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
 +
 +	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
 +
 +	if (innobase_additional_mem_pool_size
 +	    != 8*1024*1024L /* the default */ ) {
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Using "
 +			"innodb_additional_mem_pool_size is DEPRECATED. "
 +			"This option may be removed in future releases, "
 +			"together with the option innodb_use_sys_malloc "
 +			"and with the InnoDB's internal memory "
 +			"allocator.\n");
 +	}
 +
 +	if (!srv_use_sys_malloc ) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Setting "
 +			"innodb_use_sys_malloc to FALSE is DEPRECATED. "
 +			"This option may be removed in future releases, "
 +			"together with the InnoDB's internal memory "
 +			"allocator.\n");
 +	}
 +
 +	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
 +	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
 +	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
 +
 +	srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
 +
 +	if (!innobase_use_checksums) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Setting "
 +			"innodb_checksums to OFF is DEPRECATED. "
 +			"This option may be removed in future releases. "
 +			"You should set innodb_checksum_algorithm=NONE "
 +			"instead.\n");
 +		srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
 +	}
 +
 +#ifdef HAVE_LARGE_PAGES
 +	if ((os_use_large_pages = (ibool) my_use_large_pages)) {
 +		os_large_page_size = (ulint) opt_large_page_size;
 +	}
 +#endif
 +
 +	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
 +
 +	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
 +	if (innobase_locks_unsafe_for_binlog) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Using "
 +			"innodb_locks_unsafe_for_binlog is DEPRECATED. "
 +			"This option may be removed in future releases. "
 +			"Please use READ COMMITTED transaction isolation "
 +			"level instead, see " REFMAN "set-transaction.html.\n");
 +	}
 +
 +	if (innobase_open_files < 10) {
 +		innobase_open_files = 300;
 +		if (srv_file_per_table && tc_size > 300) {
 +			innobase_open_files = tc_size;
 +		}
 +	}
 +
 +	if (innobase_open_files > (long) tc_size) {
 +		fprintf(stderr,
 +                       "innodb_open_files should not be greater"
 +                       " than the open_files_limit.\n");
 +		innobase_open_files = tc_size;
 +	}
 +
 +	srv_max_n_open_files = (ulint) innobase_open_files;
 +	srv_innodb_status = (ibool) innobase_create_status_file;
 +
 +	srv_print_verbose_log = mysqld_embedded ? 0 : 1;
 +
 +	/* Round up fts_sort_pll_degree to the nearest power of 2 */
 +	for (num_pll_degree = 1;
 +	     num_pll_degree < fts_sort_pll_degree;
 +	     num_pll_degree <<= 1) {
 +
 +		/* No op */
 +	}
 +
 +	fts_sort_pll_degree = num_pll_degree;
 +
 +	/* Store the default charset-collation number of this MySQL
 +	installation */
 +
 +	data_mysql_default_charset_coll = (ulint) default_charset_info->number;
 +
 +	ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
 +					my_charset_latin1.number);
 +	ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
 +
 +	/* Store the latin1_swedish_ci character ordering table to InnoDB. For
 +	non-latin1_swedish_ci charsets we use the MySQL comparison functions,
 +	and consequently we do not need to know the ordering internally in
 +	InnoDB. */
 +
 +	srv_latin1_ordering = my_charset_latin1.sort_order;
 +
 +	innobase_commit_concurrency_init_default();
 +
 +#ifdef HAVE_POSIX_FALLOCATE
 +	srv_use_posix_fallocate = (ibool) innobase_use_fallocate;
 +#endif
 +	srv_use_atomic_writes = (ibool) innobase_use_atomic_writes;
 +
 +	if (innobase_use_atomic_writes) {
 +		fprintf(stderr, "InnoDB: using atomic writes.\n");
 +
 +		/* Force doublewrite buffer off, atomic writes replace it. */
 +		if (srv_use_doublewrite_buf) {
 +			fprintf(stderr, "InnoDB: Switching off doublewrite buffer "
 +				"because of atomic writes.\n");
 +				innobase_use_doublewrite = srv_use_doublewrite_buf = FALSE;
 +		}
 +
 +		/* Force O_DIRECT on Unixes (on Windows writes are always unbuffered)*/
 +#ifndef _WIN32
 +		if(!innobase_file_flush_method ||
 +			!strstr(innobase_file_flush_method, "O_DIRECT")) {
 +			innobase_file_flush_method =
 +				srv_file_flush_method_str = (char*)"O_DIRECT";
 +			fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
 +		}
 +#endif
 +#ifdef HAVE_POSIX_FALLOCATE
 +		/* Due to a bug in directFS, using atomics needs
 +		 * posix_fallocate to extend the file;
 +		 * pwrite() past the end of the file won't work.
 +		 */
 +		srv_use_posix_fallocate = TRUE;
 +#endif
 +	}
 +
 +#ifdef HAVE_PSI_INTERFACE
 +	/* Register keys with MySQL performance schema */
 +	int	count;
 +
 +	count = array_elements(all_pthread_mutexes);
 + 	mysql_mutex_register("innodb", all_pthread_mutexes, count);
 +
 +# ifdef UNIV_PFS_MUTEX
 +	count = array_elements(all_innodb_mutexes);
 +	mysql_mutex_register("innodb", all_innodb_mutexes, count);
 +# endif /* UNIV_PFS_MUTEX */
 +
 +# ifdef UNIV_PFS_RWLOCK
 +	count = array_elements(all_innodb_rwlocks);
 +	mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
 +# endif /* UNIV_PFS_RWLOCK */
 +
 +# ifdef UNIV_PFS_THREAD
 +	count = array_elements(all_innodb_threads);
 +	mysql_thread_register("innodb", all_innodb_threads, count);
 +# endif /* UNIV_PFS_THREAD */
 +
 +# ifdef UNIV_PFS_IO
 +	count = array_elements(all_innodb_files);
 +	mysql_file_register("innodb", all_innodb_files, count);
 +# endif /* UNIV_PFS_IO */
 +
 +	count = array_elements(all_innodb_conds);
 +	mysql_cond_register("innodb", all_innodb_conds, count);
 +#endif /* HAVE_PSI_INTERFACE */
 +
 +	/* Since we in this module access directly the fields of a trx
 +	struct, and due to different headers and flags it might happen that
 +	ib_mutex_t has a different size in this module and in InnoDB
 +	modules, we check at run time that the size is the same in
 +	these compilation modules. */
 +
 +	err = innobase_start_or_create_for_mysql();
 +
 +	if (err != DB_SUCCESS) {
 +		goto mem_free_and_error;
  	}
  
 -	trx = thd_to_trx(thd);
 +	/* Adjust the innodb_undo_logs config object */
 +	innobase_undo_logs_init_default_max();
  
 -	if (trx != NULL) {
 -		trx_search_latch_release_if_reserved(trx);
 +	innobase_old_blocks_pct = static_cast<uint>(
 +		buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE));
 +
 +	ibuf_max_size_update(innobase_change_buffer_max_size);
 +
 +	innobase_open_tables = hash_create(200);
 +	mysql_mutex_init(innobase_share_mutex_key,
 +			 &innobase_share_mutex,
 +			 MY_MUTEX_INIT_FAST);
 +	mysql_mutex_init(commit_cond_mutex_key,
 +			 &commit_cond_m, MY_MUTEX_INIT_FAST);
 +	mysql_cond_init(commit_cond_key, &commit_cond, NULL);
 +	mysql_mutex_init(pending_checkpoint_mutex_key,
 +			 &pending_checkpoint_mutex,
 +			 MY_MUTEX_INIT_FAST);
 +	innodb_inited= 1;
 +#ifdef MYSQL_DYNAMIC_PLUGIN
 +	if (innobase_hton != p) {
 +		innobase_hton = reinterpret_cast<handlerton*>(p);
 +		*innobase_hton = *innodb_hton_ptr;
  	}
 +#endif /* MYSQL_DYNAMIC_PLUGIN */
  
 -	return(0);
 +	/* Get the current high water mark format. */
 +	innobase_file_format_max = (char*) trx_sys_file_format_max_get();
 +
 +	/* Currently, monitor counter information is not persistent. */
 +	memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
 +
 +	memset(innodb_counter_value, 0, sizeof innodb_counter_value);
 +
 +	/* Do this as late as possible so that the server has fully started
 +	up, since we might get some initial stats if the user chooses to
 +	turn on some counters at startup */
 +	if (innobase_enable_monitor_counter) {
 +		innodb_enable_monitor_at_startup(
 +			innobase_enable_monitor_counter);
 +	}
 +
 +	/* Turn on monitor counters that are default on */
 +	srv_mon_default_on();
 +
 +	DBUG_RETURN(FALSE);
 +error:
 +	DBUG_RETURN(TRUE);
  }
  
 -#ifdef WITH_WSREP
 -static int 
 -wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, 
 -			my_bool signal);
 -static void
 -wsrep_fake_trx_id(handlerton* hton, THD *thd);
 -static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid);
 -static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid);
 -#endif
 -/********************************************************************//**
 -Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
 -time calls srv_active_wake_master_thread. This function should be used
 -when a single database operation may introduce a small need for
 -server utility activity, like checkpointing. */
 -static inline
 +/** Shut down the InnoDB storage engine.
 +@return	0 */
 +static
 +int
 +innobase_end(handlerton*, ha_panic_function)
 +{
 +	DBUG_ENTER("innobase_end");
 +
 +	if (innodb_inited) {
 +
 +		THD *thd= current_thd;
 +		if (thd) { // may be UNINSTALL PLUGIN statement
 +		 	trx_t* trx = thd_to_trx(thd);
 +		 	if (trx) {
 +		 		trx_free_for_mysql(trx);
 +		 	}
 +		}
 +
 +		srv_fast_shutdown = (ulint) innobase_fast_shutdown;
 +
 +		innodb_inited = 0;
 +		hash_table_free(innobase_open_tables);
 +		innobase_open_tables = NULL;
 +		innodb_shutdown();
 +		srv_free_paths_and_sizes();
 +		my_free(internal_innobase_data_file_path);
 +		mysql_mutex_destroy(&innobase_share_mutex);
 +		mysql_mutex_destroy(&commit_cond_m);
 +		mysql_cond_destroy(&commit_cond);
 +		mysql_mutex_destroy(&pending_checkpoint_mutex);
 +	}
 +
 +	DBUG_RETURN(0);
 +}
 +
 +/****************************************************************//**
 +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
 +the logs, and the name of this function should be innobase_checkpoint.
 +@return	TRUE if error */
 +static
 +bool
 +innobase_flush_logs(
 +/*================*/
 +	handlerton*	hton)	/*!< in/out: InnoDB handlerton */
 +{
 +	bool	result = 0;
 +
 +	DBUG_ENTER("innobase_flush_logs");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
 +
 +	if (!srv_read_only_mode) {
 +		log_buffer_flush_to_disk();
 +	}
 +
 +	DBUG_RETURN(result);
 +}
 +
 +/*****************************************************************//**
 +Commits a transaction in an InnoDB database. */
 +static
  void
 -innobase_active_small(void)
 -/*=======================*/
 +innobase_commit_low(
 +/*================*/
 +	trx_t*	trx)	/*!< in: transaction handle */
  {
 -	innobase_active_counter++;
 +#ifdef WITH_WSREP
 +	THD* thd = (THD*)trx->mysql_thd;
 +	const char* tmp = 0;
 +	if (wsrep_on((void*)thd)) {
 +#ifdef WSREP_PROC_INFO
 +		char info[64];
 +		info[sizeof(info) - 1] = '\0';
 +		snprintf(info, sizeof(info) - 1,
 +			 "innobase_commit_low():trx_commit_for_mysql(%lld)",
 +			 (long long) wsrep_thd_trx_seqno(thd));
 +		tmp = thd_proc_info(thd, info);
  
 -	if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
 -		srv_active_wake_master_thread();
 +#else
 +		tmp = thd_proc_info(thd, "innobase_commit_low()");
 +#endif /* WSREP_PROC_INFO */
 +	}
 +#endif /* WITH_WSREP */
 +	if (trx_is_started(trx)) {
 +
 +		trx_commit_for_mysql(trx);
  	}
 +#ifdef WITH_WSREP
 +	if (wsrep_on((void*)thd)) { thd_proc_info(thd, tmp); }
 +#endif /* WITH_WSREP */
  }
  
 -/********************************************************************//**
 -Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 -about a possible transaction rollback inside InnoDB caused by a lock wait
 -timeout or a deadlock.
 -@return	MySQL error code */
 -extern "C" UNIV_INTERN
 +/*****************************************************************//**
 +Creates an InnoDB transaction struct for the thd if it does not yet have one.
 +Starts a new InnoDB transaction if a transaction is not yet started. And
 +assigns a new snapshot for a consistent read if the transaction does not yet
 +have one.
 +@return	0 */
 +static
  int
 -convert_error_code_to_mysql(
 -/*========================*/
 -	int	error,	/*!< in: InnoDB error code */
 -	ulint	flags,  /*!< in: InnoDB table flags, or 0 */
 -	THD*	thd)	/*!< in: user thread handle or NULL */
 +innobase_start_trx_and_assign_read_view(
 +/*====================================*/
 +	handlerton*	hton,	/*!< in: Innodb handlerton */
 +	THD*		thd)	/*!< in: MySQL thread handle of the user for
 +				whom the transaction should be started */
  {
 -	switch (error) {
 -	case DB_SUCCESS:
 -		return(0);
 +	trx_t*	trx;
  
 -	case DB_INTERRUPTED:
 -                return(HA_ERR_ABORTED_BY_USER);
 +	DBUG_ENTER("innobase_start_trx_and_assign_read_view");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
  
 -	case DB_FOREIGN_EXCEED_MAX_CASCADE:
 -		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
 -				    HA_ERR_ROW_IS_REFERENCED,
 -				    "InnoDB: Cannot delete/update "
 -				    "rows with cascading foreign key "
 -				    "constraints that exceed max "
 -				    "depth of %d. Please "
 -				    "drop extra constraints and try "
 -				    "again", DICT_FK_MAX_RECURSIVE_LOAD);
 +	/* Create a new trx struct for thd, if it does not yet have one */
  
 -		/* fall through */
 +	trx = check_trx_exists(thd);
  
 -	case DB_ERROR:
 -	default:
 -		return(-1); /* unspecified error */
 +	/* This is just to play safe: release a possible FIFO ticket and
 +	search latch. Since we can potentially reserve the trx_sys->mutex,
 +	we have to release the search system latch first to obey the latching
 +	order. */
 +
 +	trx_search_latch_release_if_reserved(trx);
 +
 +	innobase_srv_conc_force_exit_innodb(trx);
 +
 +	/* If the transaction is not started yet, start it */
 +
 +	trx_start_if_not_started_xa(trx);
 +
 +	/* Assign a read view if the transaction does not have it yet.
 +	Do this only if transaction is using REPEATABLE READ isolation
 +	level. */
 +	trx->isolation_level = innobase_map_isolation_level(
 +		thd_get_trx_isolation(thd));
 +
 +	if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
 +		trx_assign_read_view(trx);
 +	} else {
 +		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 +				    HA_ERR_UNSUPPORTED,
 +				    "InnoDB: WITH CONSISTENT SNAPSHOT "
 +				    "was ignored because this phrase "
 +				    "can only be used with "
 +				    "REPEATABLE READ isolation level.");
 +	}
 +
 +	/* Set the MySQL flag to mark that there is an active transaction */
 +
 +	innobase_register_trx(hton, current_thd, trx);
 +
 +	DBUG_RETURN(0);
 +}
 +
 +static
 +void
 +innobase_commit_ordered_2(
 +/*============*/
 +	trx_t*	trx, 	/*!< in: Innodb transaction */
 +	THD*	thd)	/*!< in: MySQL thread handle */
 +{
 +	DBUG_ENTER("innobase_commit_ordered_2");
 +
 +	/* We need the current binlog position for mysqlbackup to work.
 +	Note, the position is current because commit_ordered is guaranteed
 +	to be called in the same sequence as writing to the binlog. */
 +
 +retry:
 +	if (innobase_commit_concurrency > 0) {
 +		mysql_mutex_lock(&commit_cond_m);
 +		commit_threads++;
 +
 +		if (commit_threads > innobase_commit_concurrency) {
 +			commit_threads--;
 +			mysql_cond_wait(&commit_cond,
 +					  &commit_cond_m);
 +			mysql_mutex_unlock(&commit_cond_m);
 +			goto retry;
 +		}
 +		else {
 +			mysql_mutex_unlock(&commit_cond_m);
 +		}
 +	}
 +
 +        unsigned long long pos;
 +        thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos);
 +        trx->mysql_log_offset= static_cast<ib_int64_t>(pos);
 +	/* Don't do write + flush right now. For group commit
 +	   to work we want to do the flush in the innobase_commit()
 +	   method, which runs without holding any locks. */
 +	trx->flush_log_later = TRUE;
 +	innobase_commit_low(trx);
 +	trx->flush_log_later = FALSE;
 +
 +	if (innobase_commit_concurrency > 0) {
 +		mysql_mutex_lock(&commit_cond_m);
 +		commit_threads--;
 +		mysql_cond_signal(&commit_cond);
 +		mysql_mutex_unlock(&commit_cond_m);
 +	}
 +
 +	DBUG_VOID_RETURN;
 +}
  
 -	case DB_DUPLICATE_KEY:
 -		/* Be cautious with returning this error, since
 -		mysql could re-enter the storage layer to get
 -		duplicated key info, the operation requires a
 -		valid table handle and/or transaction information,
 -		which might not always be available in the error
 -		handling stage. */
 -		return(HA_ERR_FOUND_DUPP_KEY);
 +/*****************************************************************//**
 +Perform the first, fast part of InnoDB commit.
  
 -	case DB_FOREIGN_DUPLICATE_KEY:
 -		return(HA_ERR_FOREIGN_DUPLICATE_KEY);
 +Doing it in this call ensures that we get the same commit order here
 +as in binlog and any other participating transactional storage engines.
  
 -	case DB_MISSING_HISTORY:
 -		return(HA_ERR_TABLE_DEF_CHANGED);
 +Note that we want to do as little as really needed here, as we run
 +under a global mutex. The expensive fsync() is done later, in
 +innobase_commit(), without a lock so group commit can take place.
  
 -	case DB_RECORD_NOT_FOUND:
 -		return(HA_ERR_NO_ACTIVE_RECORD);
 +Note also that this method can be called from a different thread than
 +the one handling the rest of the transaction. */
 +static
 +void
 +innobase_commit_ordered(
 +/*============*/
 +	handlerton *hton, /*!< in: Innodb handlerton */
 +	THD*	thd,	/*!< in: MySQL thread handle of the user for whom
 +			the transaction should be committed */
 +	bool	all)	/*!< in:	TRUE - commit transaction
 +				FALSE - the current SQL statement ended */
 +{
 +	trx_t*		trx;
 +	DBUG_ENTER("innobase_commit_ordered");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
  
 -	case DB_DEADLOCK:
 -		/* Since we rolled back the whole transaction, we must
 -		tell it also to MySQL so that MySQL knows to empty the
 -		cached binlog for this transaction */
 +	trx = check_trx_exists(thd);
  
 -		if (thd) {
 -			thd_mark_transaction_to_rollback(thd, TRUE);
 -		}
 +	/* Since we will reserve the kernel mutex, we must not be holding the
 +	search system latch, or we will disobey the latching order. But we
 +	already released it in innobase_xa_prepare() (if not before), so just
 +	have an assert here.*/
 +	ut_ad(!trx->has_search_latch);
  
 -		return(HA_ERR_LOCK_DEADLOCK);
 +	if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
 +		/* We cannot throw error here; instead we will catch this error
 +		again in innobase_commit() and report it from there. */
 +		DBUG_VOID_RETURN;
 +	}
  
 -	case DB_LOCK_WAIT_TIMEOUT:
 -		/* Starting from 5.0.13, we let MySQL just roll back the
 -		latest SQL statement in a lock wait timeout. Previously, we
 -		rolled back the whole transaction. */
 +	/* commit_ordered is only called when committing the whole transaction
 +	(or an SQL statement when autocommit is on). */
 +	DBUG_ASSERT(all ||
 +		(!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
  
 -		if (thd) {
 -			thd_mark_transaction_to_rollback(
 -				thd, (bool)row_rollback_on_timeout);
 -		}
 +	innobase_commit_ordered_2(trx, thd);
  
 -		return(HA_ERR_LOCK_WAIT_TIMEOUT);
 +        trx_set_active_commit_ordered(trx);
  
 -	case DB_NO_REFERENCED_ROW:
 -		return(HA_ERR_NO_REFERENCED_ROW);
 +	DBUG_VOID_RETURN;
 +}
  
 -	case DB_ROW_IS_REFERENCED:
 -		return(HA_ERR_ROW_IS_REFERENCED);
 +/*****************************************************************//**
 +Commits a transaction in an InnoDB database or marks an SQL statement
 +ended.
 +@return	0 */
 +static
 +int
 +innobase_commit(
 +/*============*/
 +	handlerton*	hton,		/*!< in: Innodb handlerton */
 +	THD*		thd,		/*!< in: MySQL thread handle of the
 +					user for whom the transaction should
 +					be committed */
 +	bool		commit_trx)	/*!< in: true - commit transaction
 +					false - the current SQL statement
 +					ended */
 +{
 +	trx_t*		trx;
  
 -	case DB_CANNOT_ADD_CONSTRAINT:
 -	case DB_CHILD_NO_INDEX:
 -	case DB_PARENT_NO_INDEX:
 -		return(HA_ERR_CANNOT_ADD_FOREIGN);
 +	DBUG_ENTER("innobase_commit");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
 +	DBUG_PRINT("trans", ("ending transaction"));
  
 -	case DB_CANNOT_DROP_CONSTRAINT:
 +	trx = check_trx_exists(thd);
  
 -		return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
 -						misleading, a new MySQL error
 -						code should be introduced */
 +	/* Since we will reserve the trx_sys->mutex, we have to release
 +	the search system latch first to obey the latching order. */
  
 -	case DB_CORRUPTION:
 -		return(HA_ERR_CRASHED);
 +	if (trx->has_search_latch && !trx_is_active_commit_ordered(trx)) {
 +		trx_search_latch_release_if_reserved(trx);
 +	}
  
 -	case DB_OUT_OF_FILE_SPACE:
 -		return(HA_ERR_RECORD_FILE_FULL);
 +	/* Transaction is deregistered only in a commit or a rollback. If
 +	it is deregistered we know there cannot be resources to be freed
 +	and we could return immediately.  For the time being, we play safe
 +	and do the cleanup though there should be nothing to clean up. */
  
 -	case DB_TABLE_IN_FK_CHECK:
 -		return(HA_ERR_TABLE_IN_FK_CHECK);
 +	if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
  
 -	case DB_TABLE_IS_BEING_USED:
 -		return(HA_ERR_WRONG_COMMAND);
 +		sql_print_error("Transaction not registered for MySQL 2PC, "
 +				"but transaction is active");
 +	}
  
 -	case DB_TABLE_NOT_FOUND:
 -		return(HA_ERR_NO_SUCH_TABLE);
 +	if (commit_trx
 +	    || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
  
 -	case DB_TOO_BIG_RECORD: {
 -		/* If prefix is true then a 768-byte prefix is stored
 -		locally for BLOB fields. Refer to dict_table_get_format() */
 -		bool prefix = ((flags & DICT_TF_FORMAT_MASK)
 -		 	       >> DICT_TF_FORMAT_SHIFT) < UNIV_FORMAT_B;
 -		my_printf_error(ER_TOO_BIG_ROWSIZE,
 -			"Row size too large (> %lu). Changing some columns "
 -			"to TEXT or BLOB %smay help. In current row "
 -			"format, BLOB prefix of %d bytes is stored inline.",
 -			MYF(0),
 -			page_get_free_space_of_empty(flags &
 -				DICT_TF_COMPACT) / 2,
 -			prefix ? "or using ROW_FORMAT=DYNAMIC "
 -			"or ROW_FORMAT=COMPRESSED ": "",
 -			prefix ? DICT_MAX_FIXED_COL_LEN : 0);
 -		return(HA_ERR_TO_BIG_ROW);
 -        }
 +		/* Run the fast part of commit if we did not already. */
 +		if (!trx_is_active_commit_ordered(trx)) {
 +			innobase_commit_ordered_2(trx, thd);
 +		}
  
 -	case DB_TOO_BIG_INDEX_COL:
 -		my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
 -			 DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
 -		return(HA_ERR_INDEX_COL_TOO_LONG);
 +		/* We were instructed to commit the whole transaction, or
 +		this is an SQL statement end and autocommit is on */
  
 -	case DB_NO_SAVEPOINT:
 -		return(HA_ERR_NO_SAVEPOINT);
 +		/* At this point commit order is fixed and transaction is
 +		visible to others. So we can wakeup other commits waiting for
 +		this one, to allow them to group commit with us. */
 +		thd_wakeup_subsequent_commits(thd, 0);
  
 -	case DB_LOCK_TABLE_FULL:
 -		/* Since we rolled back the whole transaction, we must
 -		tell it also to MySQL so that MySQL knows to empty the
 -		cached binlog for this transaction */
 +		/* We did the first part already in innobase_commit_ordered(),
 +		Now finish by doing a write + flush of logs. */
 +		trx_commit_complete_for_mysql(trx);
 +                trx_deregister_from_2pc(trx);
 +	} else {
 +		/* We just mark the SQL statement ended and do not do a
 +		transaction commit */
  
 -		if (thd) {
 -			thd_mark_transaction_to_rollback(thd, TRUE);
 -		}
 +		/* If we had reserved the auto-inc lock for some
 +		table in this SQL statement we release it now */
  
 -		return(HA_ERR_LOCK_TABLE_FULL);
 +		lock_unlock_table_autoinc(trx);
  
 -	case DB_PRIMARY_KEY_IS_NULL:
 -		return(ER_PRIMARY_CANT_HAVE_NULL);
 +		/* Store the current undo_no of the transaction so that we
 +		know where to roll back if we have to roll back the next
 +		SQL statement */
  
 -	case DB_TOO_MANY_CONCURRENT_TRXS:
 -		/* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only
 -		available in 5.1.38 and later, but the plugin should still
 -		work with previous versions of MySQL. */
 -#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS
 -		return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
 -#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
 -		return(HA_ERR_RECORD_FILE_FULL);
 -#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
 -	case DB_UNSUPPORTED:
 -		return(HA_ERR_UNSUPPORTED);
 -	case DB_INDEX_CORRUPT:
 -		return(HA_ERR_INDEX_CORRUPT);
 -	case DB_UNDO_RECORD_TOO_BIG:
 -		return(HA_ERR_UNDO_REC_TOO_BIG);
 -	case DB_OUT_OF_MEMORY:
 -		return(HA_ERR_OUT_OF_MEM);
 -	case DB_IDENTIFIER_TOO_LONG:
 -		return(HA_ERR_INTERNAL_ERROR);
 +		trx_mark_sql_stat_end(trx);
  	}
 -}
  
 -/*************************************************************//**
 -Prints info of a THD object (== user session thread) to the given file. */
 -extern "C" UNIV_INTERN
 -void
 -innobase_mysql_print_thd(
 -/*=====================*/
 -	FILE*	f,		/*!< in: output stream */
 -	void*	thd,		/*!< in: pointer to a MySQL THD object */
 -	uint	max_query_len)	/*!< in: max query length to print, or 0 to
 -				   use the default max length */
 -{
 -	char	buffer[1024];
 +	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
  
 -	fputs(thd_security_context((THD*) thd, buffer, sizeof buffer,
 -				   max_query_len), f);
 -	putc('\n', f);
 +	/* This is a statement level variable. */
 +	trx->fts_next_doc_id = 0;
 +
 +	innobase_srv_conc_force_exit_innodb(trx);
 +
 +	DBUG_RETURN(0);
  }
  
 -/******************************************************************//**
 -Get the variable length bounds of the given character set. */
 -extern "C" UNIV_INTERN
 -void
 -innobase_get_cset_width(
 -/*====================*/
 -	ulint	cset,		/*!< in: MySQL charset-collation code */
 -	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
 -	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
 +/*****************************************************************//**
 +Rolls back a transaction or the latest SQL statement.
 +@return	0 or error number */
 +static
 +int
 +innobase_rollback(
 +/*==============*/
 +	handlerton*	hton,		/*!< in: Innodb handlerton */
 +	THD*		thd,		/*!< in: handle to the MySQL thread
 +					of the user whose transaction should
 +					be rolled back */
 +	bool		rollback_trx)	/*!< in: TRUE - rollback entire
 +					transaction FALSE - rollback the current
 +					statement only */
  {
 -	CHARSET_INFO*	cs;
 -	ut_ad(cset < 256);
 -	ut_ad(mbminlen);
 -	ut_ad(mbmaxlen);
 -
 -	cs = all_charsets[cset];
 -	if (cs) {
 -		*mbminlen = cs->mbminlen;
 -		*mbmaxlen = cs->mbmaxlen;
 -		ut_ad(*mbminlen < DATA_MBMAX);
 -		ut_ad(*mbmaxlen < DATA_MBMAX);
 -	} else {
 -		THD*	thd = current_thd;
 +	dberr_t	error;
 +	trx_t*	trx;
  
 -		if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
 +	DBUG_ENTER("innobase_rollback");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
 +	DBUG_PRINT("trans", ("aborting transaction"));
  
 -			/* Fix bug#46256: allow tables to be dropped if the
 -			collation is not found, but issue a warning. */
 -			if ((global_system_variables.log_warnings)
 -			    && (cset != 0)){
 +	trx = check_trx_exists(thd);
  
 -				sql_print_warning(
 -					"Unknown collation #%lu.", cset);
 -			}
 -		} else {
 +	/* Release a possible FIFO ticket and search latch. Since we will
 +	reserve the trx_sys->mutex, we have to release the search system
 +	latch first to obey the latching order. */
  
 -			ut_a(cset == 0);
 -		}
 +	trx_search_latch_release_if_reserved(trx);
  
 -		*mbminlen = *mbmaxlen = 0;
 -	}
 -}
 +	innobase_srv_conc_force_exit_innodb(trx);
  
 -/******************************************************************//**
 -Converts an identifier to a table name. */
 -extern "C" UNIV_INTERN
 -void
 -innobase_convert_from_table_id(
 -/*===========================*/
 -	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 -	char*			to,	/*!< out: converted identifier */
 -	const char*		from,	/*!< in: identifier to convert */
 -	ulint			len)	/*!< in: length of 'to', in bytes */
 -{
 -	uint	errors;
 +	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
  
 -	strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
 -}
 +	/* If we had reserved the auto-inc lock for some table (if
 +	we come here to roll back the latest SQL statement) we
 +	release it now before a possibly lengthy rollback */
  
 -/**********************************************************************
 -Check if the length of the identifier exceeds the maximum allowed.
 -return true when length of identifier is too long. */
 -extern "C"
 -my_bool
 -innobase_check_identifier_length(
 -/*=============================*/
 -	const char*	id)	/* in: FK identifier to check excluding the
 -				database portion. */
 -{
 -	int		well_formed_error = 0;
 -	CHARSET_INFO	*cs = system_charset_info;
 -	DBUG_ENTER("innobase_check_identifier_length");
 +	lock_unlock_table_autoinc(trx);
  
 -	uint res = cs->cset->well_formed_len(cs, id, id + strlen(id),
 -					     NAME_CHAR_LEN,
 -					     &well_formed_error);
 +	/* This is a statement level variable. */
 +	trx->fts_next_doc_id = 0;
  
 -	if (well_formed_error || res == NAME_CHAR_LEN) {
 -		my_error(ER_TOO_LONG_IDENT, MYF(0), id);
 -		DBUG_RETURN(true);
 +	if (rollback_trx
 +	    || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 +
 +		error = trx_rollback_for_mysql(trx);
 +		trx_deregister_from_2pc(trx);
 +	} else {
 +		error = trx_rollback_last_sql_stat_for_mysql(trx);
  	}
 -	DBUG_RETURN(false);
 +
 +	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
  }
  
 -/******************************************************************//**
 -Converts an identifier to UTF-8. */
 -extern "C" UNIV_INTERN
 -void
 -innobase_convert_from_id(
 -/*=====================*/
 -	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 -	char*			to,	/*!< out: converted identifier */
 -	const char*		from,	/*!< in: identifier to convert */
 -	ulint			len)	/*!< in: length of 'to', in bytes */
 +/*****************************************************************//**
 +Rolls back a transaction
 +@return	0 or error number */
 +static
 +int
 +innobase_rollback_trx(
 +/*==================*/
 +	trx_t*	trx)	/*!< in: transaction */
  {
 -	uint	errors;
 +	dberr_t	error = DB_SUCCESS;
  
 -	strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
 -}
 +	DBUG_ENTER("innobase_rollback_trx");
 +	DBUG_PRINT("trans", ("aborting transaction"));
  
 -/**********************************************************************
 -Converts an identifier from my_charset_filename to UTF-8 charset.
 -@return result string length, as returned by strconvert() */
 -extern "C"
 -uint
 -innobase_convert_to_system_charset(
 -/*===============================*/
 -	char*		to,	/* out: converted identifier */
 -	const char*	from,	/* in: identifier to convert */
 -	ulint		len,	/* in: length of 'to', in bytes */
 -	uint*		errors)	/* out: error return */
 -{
 -	CHARSET_INFO*	cs1 = &my_charset_filename;
 -	CHARSET_INFO*	cs2 = system_charset_info;
 +	/* Release a possible FIFO ticket and search latch. Since we will
 +	reserve the trx_sys->mutex, we have to release the search system
 +	latch first to obey the latching order. */
  
 -	return(strconvert(cs1, from, cs2, to, len, errors));
 -}
 +	trx_search_latch_release_if_reserved(trx);
  
 -/******************************************************************//**
 -Compares NUL-terminated UTF-8 strings case insensitively.
 -@return	0 if a=b, <0 if a<b, >1 if a>b */
 -extern "C" UNIV_INTERN
 -int
 -innobase_strcasecmp(
 -/*================*/
 -	const char*	a,	/*!< in: first string to compare */
 -	const char*	b)	/*!< in: second string to compare */
 -{
 -	return(my_strcasecmp(system_charset_info, a, b));
 -}
 +	innobase_srv_conc_force_exit_innodb(trx);
  
 -/******************************************************************//**
 -Strip dir name from a full path name and return only the file name
 -@return file name or "null" if no file name */
 -extern "C" UNIV_INTERN
 -const char*
 -innobase_basename(
 -/*==============*/
 -	const char*	path_name)	/*!< in: full path name */
 -{
 -	const char*	name = base_name(path_name);
 +	/* If we had reserved the auto-inc lock for some table (if
 +	we come here to roll back the latest SQL statement) we
 +	release it now before a possibly lengthy rollback */
  
 -	return((name) ? name : "null");
 +	lock_unlock_table_autoinc(trx);
 +
 +	if (!trx->read_only) {
 +		error = trx_rollback_for_mysql(trx);
 +	}
 +
 +	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
  }
  
 -/******************************************************************//**
 -Makes all characters in a NUL-terminated UTF-8 string lower case. */
 -extern "C" UNIV_INTERN
 +
 +struct pending_checkpoint {
 +	struct pending_checkpoint *next;
 +	handlerton *hton;
 +	void *cookie;
 +	ib_uint64_t lsn;
 +};
 +static struct pending_checkpoint *pending_checkpoint_list;
 +static struct pending_checkpoint *pending_checkpoint_list_end;
 +
 +/*****************************************************************//**
 +Handle a commit checkpoint request from server layer.
 +We put the request in a queue, so that we can notify upper layer about
 +checkpoint complete when we have flushed the redo log.
 +If we have already flushed all relevant redo log, we notify immediately.*/
 +static
  void
 -innobase_casedn_str(
 -/*================*/
 -	char*	a)	/*!< in/out: string to put in lower case */
 +innobase_checkpoint_request(
 +	handlerton *hton,
 +	void *cookie)
  {
 -	my_casedn_str(system_charset_info, a);
 -}
 +	ib_uint64_t			lsn;
 +	ib_uint64_t			flush_lsn;
 +	struct pending_checkpoint *	entry;
 +
 +	/* Do the allocation outside of lock to reduce contention. The normal
 +	case is that not everything is flushed, so we will need to enqueue. */
 +	entry = static_cast<struct pending_checkpoint *>
 +		(my_malloc(sizeof(*entry), MYF(MY_WME)));
 +	if (!entry) {
 +		sql_print_error("Failed to allocate %u bytes."
 +				" Commit checkpoint will be skipped.",
 +				static_cast<unsigned>(sizeof(*entry)));
 +		return;
 +	}
  
 -/**********************************************************************//**
 -Determines the connection character set.
 -@return	connection character set */
 -extern "C" UNIV_INTERN
 -struct charset_info_st*
 -innobase_get_charset(
 -/*=================*/
 -	void*	mysql_thd)	/*!< in: MySQL thread handle */
 -{
 -	return(thd_charset((THD*) mysql_thd));
 +	entry->next = NULL;
 +	entry->hton = hton;
 +	entry->cookie = cookie;
 +
 +	mysql_mutex_lock(&pending_checkpoint_mutex);
 +	lsn = log_get_lsn();
 +	flush_lsn = log_get_flush_lsn();
 +	if (lsn > flush_lsn) {
 +		/* Put the request in queue.
 +		When the log gets flushed past the lsn, we will remove the
 +		entry from the queue and notify the upper layer. */
 +		entry->lsn = lsn;
 +		if (pending_checkpoint_list_end) {
 +			pending_checkpoint_list_end->next = entry;
 +			/* There is no need to order the entries in the list
 +			by lsn. The upper layer can accept notifications in
 +			any order, and short delays in notifications do not
 +			significantly impact performance. */
 +		} else {
 +			pending_checkpoint_list = entry;
 +		}
 +		pending_checkpoint_list_end = entry;
 +		entry = NULL;
 +	}
 +	mysql_mutex_unlock(&pending_checkpoint_mutex);
 +
 +	if (entry) {
 +		/* We are already flushed. Notify the checkpoint immediately. */
 +		commit_checkpoint_notify_ha(entry->hton, entry->cookie);
 +		my_free(entry);
 +	}
  }
  
 -/**********************************************************************//**
 -Determines the current SQL statement.
 -@return	SQL statement string */
 -extern "C" UNIV_INTERN
 -const char*
 -innobase_get_stmt(
 -/*==============*/
 -	void*	mysql_thd,	/*!< in: MySQL thread handle */
 -	size_t*	length)		/*!< out: length of the SQL statement */
 +/*****************************************************************//**
 +Log code calls this whenever log has been written and/or flushed up
 +to a new position. We use this to notify upper layer of a new commit
 +checkpoint when necessary.*/
 +UNIV_INTERN
 +void
 +innobase_mysql_log_notify(
 +/*===============*/
 +	ib_uint64_t	write_lsn,	/*!< in: LSN written to log file */
 +	ib_uint64_t	flush_lsn)	/*!< in: LSN flushed to disk */
  {
 -	LEX_STRING* stmt;
 +	struct pending_checkpoint *	pending;
 +	struct pending_checkpoint *	entry;
 +	struct pending_checkpoint *	last_ready;
 +
 +	/* It is safe to do a quick check for NULL first without lock.
 +	Even if we should race, we will at most skip one checkpoint and
 +	take the next one, which is harmless. */
 +	if (!pending_checkpoint_list)
 +		return;
  
 -	stmt = thd_query_string((THD*) mysql_thd);
 -	*length = stmt->length;
 -	return(stmt->str);
 -}
 +	mysql_mutex_lock(&pending_checkpoint_mutex);
 +	pending = pending_checkpoint_list;
 +	if (!pending)
 +	{
 +		mysql_mutex_unlock(&pending_checkpoint_mutex);
 +		return;
 +	}
  
 -/**********************************************************************//**
 -Get the current setting of the lower_case_table_names global parameter from
 -mysqld.cc. We do a dirty read because for one there is no synchronization
 -object and secondly there is little harm in doing so even if we get a torn
 -read.
 -@return	value of lower_case_table_names */
 -extern "C" UNIV_INTERN
 -ulint
 -innobase_get_lower_case_table_names(void)
 -/*=====================================*/
 -{
 -	return(lower_case_table_names);
 -}
 +	last_ready = NULL;
 +	for (entry = pending; entry != NULL; entry = entry -> next)
 +	{
 +		/* Notify checkpoints up until the first entry that has not
 +		been fully flushed to the redo log. Since we do not maintain
 +		the list ordered, in principle there could be more entries
 +		later that were also flushed. But there is no harm in
 +		delaying notifications for those a bit. And in practice, the
 +		list is unlikely to have more than one element anyway, as we
 +		flush the redo log at least once every second. */
 +		if (entry->lsn > flush_lsn)
 +			break;
 +		last_ready = entry;
 +	}
  
 -/*********************************************************************//**
 -Creates a temporary file.
 -@return	temporary file descriptor, or < 0 on error */
 -extern "C" UNIV_INTERN
 -int
 -innobase_mysql_tmpfile(void)
 -/*========================*/
 -{
 -#ifdef WITH_INNODB_DISALLOW_WRITES
 -	os_event_wait(srv_allow_writes_event);
 -#endif /* WITH_INNODB_DISALLOW_WRITES */
 -	int	fd2 = -1;
 -	File	fd;
 +	if (last_ready)
 +	{
 +		/* We found some pending checkpoints that are now flushed to
 +		disk. So remove them from the list. */
 +		pending_checkpoint_list = entry;
 +		if (!entry)
 +			pending_checkpoint_list_end = NULL;
 +	}
  
 -	DBUG_EXECUTE_IF(
 -		"innobase_tmpfile_creation_failure",
 -		return(-1);
 -	);
 +	mysql_mutex_unlock(&pending_checkpoint_mutex);
  
 -	fd = mysql_tmpfile("ib");
 +	if (!last_ready)
 +		return;
  
 -	if (fd >= 0) {
 -		/* Copy the file descriptor, so that the additional resources
 -		allocated by create_temp_file() can be freed by invoking
 -		my_close().
 +	/* Now that we have released the lock, notify upper layer about all
 +	commit checkpoints that have now completed. */
 +	for (;;) {
 +		entry = pending;
 +		pending = pending->next;
  
 -		Because the file descriptor returned by this function
 -		will be passed to fdopen(), it will be closed by invoking
 -		fclose(), which in turn will invoke close() instead of
 -		my_close(). */
 +		commit_checkpoint_notify_ha(entry->hton, entry->cookie);
  
 -#ifdef _WIN32
 -		/* Note that on Windows, the integer returned by mysql_tmpfile
 -		has no relation to C runtime file descriptor. Here, we need
 -		to call my_get_osfhandle to get the HANDLE and then convert it 
 -		to C runtime filedescriptor. */
 -		{
 -			HANDLE hFile = my_get_osfhandle(fd);
 -			HANDLE hDup;
 -			BOOL bOK = 
 -				DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(),
 -								&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
 -			if(bOK) {
 -				fd2 = _open_osfhandle((intptr_t)hDup,0);
 -			}
 -			else {
 -				my_osmaperr(GetLastError());
 -				fd2 = -1;
 -			}	
 -		}
 -#else
 -#ifdef F_DUPFD_CLOEXEC
 -		fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
 -#else
 -		fd2 = dup(fd);
 -#endif
 -#endif
 -		if (fd2 < 0) {
 -			DBUG_PRINT("error",("Got error %d on dup",fd2));
 -			my_errno=errno;
 -			my_error(EE_OUT_OF_FILERESOURCES,
 -				 MYF(ME_BELL+ME_WAITTANG),
 -				 "ib*", my_errno);
 -		}
 -		my_close(fd, MYF(MY_WME));
 +		my_free(entry);
 +		if (entry == last_ready)
 +			break;
  	}
 -	return(fd2);
 -}
 -
 -/*********************************************************************//**
 -Wrapper around MySQL's copy_and_convert function.
 -@return	number of bytes copied to 'to' */
 -extern "C" UNIV_INTERN
 -ulint
 -innobase_convert_string(
 -/*====================*/
 -	void*		to,		/*!< out: converted string */
 -	ulint		to_length,	/*!< in: number of bytes reserved
 -					for the converted string */
 -	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 -	const void*	from,		/*!< in: string to convert */
 -	ulint		from_length,	/*!< in: number of bytes to convert */
 -	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
 -	uint*		errors)		/*!< out: number of errors encountered
 -					during the conversion */
 -{
 -  return(copy_and_convert((char*)to, (uint32) to_length, to_cs,
 -                          (const char*)from, (uint32) from_length, from_cs,
 -                          errors));
  }
  
 -/*******************************************************************//**
 -Formats the raw data in "data" (in InnoDB on-disk format) that is of
 -type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
 -the result to "buf". The result is converted to "system_charset_info".
 -Not more than "buf_size" bytes are written to "buf".
 -The result is always NUL-terminated (provided buf_size > 0) and the
 -number of bytes that were written to "buf" is returned (including the
 -terminating NUL).
 -@return	number of bytes that were written */
 -extern "C" UNIV_INTERN
 -ulint
 -innobase_raw_format(
 -/*================*/
 -	const char*	data,		/*!< in: raw data */
 -	ulint		data_len,	/*!< in: raw data length
 -					in bytes */
 -	ulint		charset_coll,	/*!< in: charset collation */
 -	char*		buf,		/*!< out: output buffer */
 -	ulint		buf_size)	/*!< in: output buffer size
 -					in bytes */
 +/*****************************************************************//**
 +Rolls back a transaction to a savepoint.
 +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 +given name */
 +static
 +int
 +innobase_rollback_to_savepoint(
 +/*===========================*/
 +	handlerton*	hton,		/*!< in: Innodb handlerton */
 +	THD*		thd,		/*!< in: handle to the MySQL thread
 +					of the user whose transaction should
 +					be rolled back to savepoint */
 +	void*		savepoint)	/*!< in: savepoint data */
  {
 -	/* XXX we use a hard limit instead of allocating
 -	but_size bytes from the heap */
 -	CHARSET_INFO*	data_cs;
 -	char		buf_tmp[8192];
 -	ulint		buf_tmp_used;
 -	uint		num_errors;
 +	ib_int64_t	mysql_binlog_cache_pos;
 +	dberr_t		error;
 +	trx_t*		trx;
 +	char		name[64];
  
 -	data_cs = all_charsets[charset_coll];
 +	DBUG_ENTER("innobase_rollback_to_savepoint");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
  
 -	buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
 -					       system_charset_info,
 -					       data, data_len, data_cs,
 -					       &num_errors);
 +	trx = check_trx_exists(thd);
  
 -	return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
 +	/* Release a possible FIFO ticket and search latch. Since we will
 +	reserve the trx_sys->mutex, we have to release the search system
 +	latch first to obey the latching order. */
 +
 +	trx_search_latch_release_if_reserved(trx);
 +
 +	innobase_srv_conc_force_exit_innodb(trx);
 +
 +	/* TODO: use provided savepoint data area to store savepoint data */
 +
 +	longlong2str((ulint) savepoint, name, 36);
 +
 +	error = trx_rollback_to_savepoint_for_mysql(
 +		trx, name, &mysql_binlog_cache_pos);
 +
 +	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
 +		fts_savepoint_rollback(trx, name);
 +	}
 +
 +	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
  }
  
 -/*********************************************************************//**
 -Compute the next autoinc value.
 +/*****************************************************************//**
 +Check whether innodb state allows to safely release MDL locks after
 +rollback to savepoint.
 +When binlog is on, MDL locks acquired after savepoint unit are not
 +released if there are any locks held in InnoDB.
 +@return true if it is safe, false if it is not safe. */
 +static
 +bool
 +innobase_rollback_to_savepoint_can_release_mdl(
 +/*===========================================*/
 +	handlerton*	hton,		/*!< in: InnoDB handlerton */
 +	THD*		thd)		/*!< in: handle to the MySQL thread
 +					of the user whose transaction should
 +					be rolled back to savepoint */
 +{
 +	trx_t*		trx;
  
 -For MySQL replication the autoincrement values can be partitioned among
 -the nodes. The offset is the start or origin of the autoincrement value
 -for a particular node. For n nodes the increment will be n and the offset
 -will be in the interval [1, n]. The formula tries to allocate the next
 -value for a particular node.
 +	DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
  
 -Note: This function is also called with increment set to the number of
 -values we want to reserve for multi-value inserts e.g.,
 +	trx = check_trx_exists(thd);
 +	ut_ad(trx);
  
 -	INSERT INTO T VALUES(), (), ();
 +        /* If transaction has not acquired any locks then it is safe
 +	   to release MDL after rollback to savepoint */
 +	if (!(UT_LIST_GET_LEN(trx->lock.trx_locks))) {
 +		DBUG_RETURN(true);
 +	}
  
 -innobase_next_autoinc() will be called with increment set to 3 where
 -autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
 -the multi-value INSERT above.
 -@return	the next value */
 +	DBUG_RETURN(false);
 +}
 +
 +/*****************************************************************//**
 +Release transaction savepoint name.
 +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 +given name */
  static
 -ulonglong
 -innobase_next_autoinc(
 -/*==================*/
 -	ulonglong	current,	/*!< in: Current value */
 -	ulonglong	need,		/*!< in: count of values needed */
 -	ulonglong	step,		/*!< in: AUTOINC increment step */
 -	ulonglong	offset,		/*!< in: AUTOINC offset */
 -	ulonglong	max_value)	/*!< in: max value for type */
 +int
 +innobase_release_savepoint(
 +/*=======================*/
 +	handlerton*	hton,		/*!< in: handlerton for Innodb */
 +	THD*		thd,		/*!< in: handle to the MySQL thread
 +					of the user whose transaction's
 +					savepoint should be released */
 +	void*		savepoint)	/*!< in: savepoint data */
  {
 -	ulonglong	next_value;
 -	ulonglong	block = need * step;
 +	dberr_t		error;
 +	trx_t*		trx;
 +	char		name[64];
  
 -	/* Should never be 0. */
 -	ut_a(need > 0);
 -	ut_a(block > 0);
 -	ut_a(max_value > 0);
 +	DBUG_ENTER("innobase_release_savepoint");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
  
 -        /*
 -          Allow auto_increment to go over max_value up to max ulonglong.
 -          This allows us to detect that all values are exhausted.
 -          If we don't do this, we will return max_value several times
 -          and get duplicate key errors instead of auto increment value
 -          out of range.
 -        */
 -        max_value= (~(ulonglong) 0);
 +	trx = check_trx_exists(thd);
  
 -	/* According to MySQL documentation, if the offset is greater than
 -	the step then the offset is ignored. */
 -	if (offset > block) {
 -		offset = 0;
 +	if (trx->state == TRX_STATE_NOT_STARTED) {
 +		trx_start_if_not_started(trx);
  	}
  
 -	/* Check for overflow. Current can be > max_value if the value is
 -	in reality a negative value.The visual studio compilers converts
 -	large double values automatically into unsigned long long datatype
 -	maximum value */
 -	if (block >= max_value
 -	    || offset > max_value
 -	    || current >= max_value
 -	    || max_value - offset <= offset) {
 +	/* TODO: use provided savepoint data area to store savepoint data */
  
 -		next_value = max_value;
 -	} else {
 -		ut_a(max_value > current);
 +	longlong2str((ulint) savepoint, name, 36);
  
 -		ulonglong	free = max_value - current;
 +	error = trx_release_savepoint_for_mysql(trx, name);
  
 -		if (free < offset || free - offset <= block) {
 -			next_value = max_value;
 -		} else {
 -			next_value = 0;
 -		}
 +	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
 +		fts_savepoint_release(trx, name);
  	}
  
 -	if (next_value == 0) {
 -		ulonglong	next;
 +	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 +}
  
 -		if (current >= offset) {
 -			next = (current - offset) / step;
 -		} else {
 -			next = 0;
 -			block -= step;
 -		}
 +/*****************************************************************//**
 +Sets a transaction savepoint.
 +@return	always 0, that is, always succeeds */
 +static
 +int
 +innobase_savepoint(
 +/*===============*/
 +	handlerton*	hton,	/*!< in: handle to the Innodb handlerton */
 +	THD*	thd,		/*!< in: handle to the MySQL thread */
 +	void*	savepoint)	/*!< in: savepoint data */
 +{
 +	dberr_t	error;
 +	trx_t*	trx;
  
 -		ut_a(max_value > next);
 -		next_value = next * step;
 -		/* Check for multiplication overflow. */
 -		ut_a(next_value >= next);
 -		ut_a(max_value > next_value);
 +	DBUG_ENTER("innobase_savepoint");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
  
 -		/* Check for overflow */
 -		if (max_value - next_value >= block) {
 +	/* In the autocommit mode there is no sense to set a savepoint
 +	(unless we are in sub-statement), so SQL layer ensures that
 +	this method is never called in such situation.  */
  
 -			next_value += block;
 +	trx = check_trx_exists(thd);
  
 -			if (max_value - next_value >= offset) {
 -				next_value += offset;
 -			} else {
 -				next_value = max_value;
 -			}
 -		} else {
 -			next_value = max_value;
 -		}
 -	}
 +	/* Release a possible FIFO ticket and search latch. Since we will
 +	reserve the trx_sys->mutex, we have to release the search system
 +	latch first to obey the latching order. */
  
 -	ut_a(next_value != 0);
 -	ut_a(next_value <= max_value);
 +	trx_search_latch_release_if_reserved(trx);
  
 -	return(next_value);
 +	innobase_srv_conc_force_exit_innodb(trx);
 +
 +	/* Cannot happen outside of transaction */
 +	DBUG_ASSERT(trx_is_registered_for_2pc(trx));
 +
 +	/* TODO: use provided savepoint data area to store savepoint data */
 +	char name[64];
 +	longlong2str((ulint) savepoint,name,36);
 +
 +	error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
 +
 +	if (error == DB_SUCCESS && trx->fts_trx != NULL) {
 +		fts_savepoint_take(trx, trx->fts_trx, name);
 +	}
 +
 +	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
  }
  
 -/*********************************************************************//**
 -Initializes some fields in an InnoDB transaction object. */
 +/*****************************************************************//**
 +Frees a possible InnoDB trx object associated with the current THD.
 +@return	0 or error number */
  static
 -void
 -innobase_trx_init(
 -/*==============*/
 -	THD*	thd,	/*!< in: user thread handle */
 -	trx_t*	trx)	/*!< in/out: InnoDB transaction handle */
 +int
 +innobase_close_connection(
 +/*======================*/
 +	handlerton*	hton,	/*!< in: innobase handlerton */
 +	THD*		thd)	/*!< in: handle to the MySQL thread of the user
 +				whose resources should be free'd */
  {
 -	DBUG_ENTER("innobase_trx_init");
 -	DBUG_ASSERT(thd == trx->mysql_thd);
 +	trx_t*	trx;
  
 -	trx->check_foreigns = !thd_test_options(
 -		thd, OPTION_NO_FOREIGN_KEY_CHECKS);
 +	DBUG_ENTER("innobase_close_connection");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
 +	trx = thd_to_trx(thd);
  
 -	trx->check_unique_secondary = !thd_test_options(
 -		thd, OPTION_RELAXED_UNIQUE_CHECKS);
 +	ut_a(trx);
  
 -	DBUG_VOID_RETURN;
 +	if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
 +
 +		sql_print_error("Transaction not registered for MySQL 2PC, "
 +				"but transaction is active");
 +	}
 +
 +	if (trx_is_started(trx) && global_system_variables.log_warnings) {
 +
 +		sql_print_warning(
 +			"MySQL is closing a connection that has an active "
 +			"InnoDB transaction.  " TRX_ID_FMT " row modifications "
 +			"will roll back.",
 +			trx->undo_no);
 +	}
 +
 +	innobase_rollback_trx(trx);
 +
 +	trx_free_for_mysql(trx);
 +
 +	DBUG_RETURN(0);
  }
  
 -/*********************************************************************//**
 -Allocates an InnoDB transaction for a MySQL handler object.
 -@return	InnoDB transaction handle */
 -extern "C" UNIV_INTERN
 -trx_t*
 -innobase_trx_allocate(
 -/*==================*/
 -	THD*	thd)	/*!< in: user thread handle */
 +/*****************************************************************//**
 +Frees a possible InnoDB trx object associated with the current THD.
 +@return	0 or error number */
 +UNIV_INTERN
 +int
 +innobase_close_thd(
 +/*===============*/
 +	THD*		thd)	/*!< in: handle to the MySQL thread of the user
 +				whose resources should be free'd */
 +{
 +	trx_t*	trx = thd_to_trx(thd);
 +
 +	if (!trx) {
 +		return(0);
 +	}
 +
 +	return(innobase_close_connection(innodb_hton_ptr, thd));
 +}
 +
 +UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock);
 +
 +/*****************************************************************//**
 +Cancel any pending lock request associated with the current THD. */
 +static
 +void
 +innobase_kill_query(
 +/*======================*/
 +        handlerton*	hton,	    /*!< in: innobase handlerton */
 +	THD*	thd,	            /*!< in: MySQL thread being killed */
 +        enum thd_kill_levels level) /*!< in: kill level */
  {
  	trx_t*	trx;
  
@@@ -7149,995 -3773,862 +7153,993 @@@ build_template_field
  			}
  		}
  
 -		if (col_type != mtype) {
 -			/* Column Type mismatches */
 -			DBUG_RETURN(FALSE);
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"Looking for field %lu name %s from table %s",
 +			i,
 +			(tb_col_name ? tb_col_name : "NULL"),
 +			clust_index->table->name);
 +
 +
 +		for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
 +			dict_field_t* ifield = &(clust_index->fields[j]);
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"InnoDB Table %s field %lu name %s",
 +				clust_index->table->name,
 +				j,
 +				(ifield ? ifield->name : "NULL"));
  		}
  
 -		innodb_idx_fld++;
 +		for(ulint j=0; j < table->s->stored_fields; j++) {
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"MySQL table %s field %lu name %s",
 +				table->s->table_name.str,
 +				j,
 +				table->field[j]->field_name);
 +		}
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Clustered record field for column %lu"
 +			" not found table n_user_defined %d"
 +			" index n_user_defined %d"
 +			" InnoDB table %s field name %s"
 +			" MySQL table %s field name %s n_fields %d"
 +			" query %s",
 +			i,
 +			clust_index->n_user_defined_cols,
 +			clust_index->table->n_cols - DATA_N_SYS_COLS,
 +			clust_index->table->name,
 +			(field ? field->name : "NULL"),
 +			table->s->table_name.str,
 +			(tb_col_name ? tb_col_name : "NULL"),
 +			table->s->stored_fields,
 +			innobase_get_stmt(current_thd, &size));
 +
 +		ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
 +	}
 +
 +	if (dict_index_is_clust(index)) {
 +		templ->rec_field_no = templ->clust_rec_field_no;
 +	} else {
 +		templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
  	}
  
 -	DBUG_RETURN(TRUE);
 -}
 +	if (field->real_maybe_null()) {
 +		templ->mysql_null_byte_offset =
 +			field->null_offset();
  
 -/*******************************************************************//**
 -This function builds a translation table in INNOBASE_SHARE
 -structure for fast index location with mysql array number from its
 -table->key_info structure. This also provides the necessary translation
 -between the key order in mysql key_info and Innodb ib_table->indexes if
 -they are not fully matched with each other.
 -Note we do not have any mutex protecting the translation table
 -building based on the assumption that there is no concurrent
 -index creation/drop and DMLs that requires index lookup. All table
 -handle will be closed before the index creation/drop.
 -@return TRUE if index translation table built successfully */
 -static
 -ibool
 -innobase_build_index_translation(
 -/*=============================*/
 -	const TABLE*		table,	  /*!< in: table in MySQL data
 -					  dictionary */
 -	dict_table_t*		ib_table, /*!< in: table in Innodb data
 -					  dictionary */
 -	INNOBASE_SHARE*		share)	  /*!< in/out: share structure
 -					  where index translation table
 -					  will be constructed in. */
 -{
 -	ulint		mysql_num_index;
 -	ulint		ib_num_index;
 -	dict_index_t**	index_mapping;
 -	ibool		ret = TRUE;
 +		templ->mysql_null_bit_mask = (ulint) field->null_bit;
 +	} else {
 +		templ->mysql_null_bit_mask = 0;
 +	}
  
 -	DBUG_ENTER("innobase_build_index_translation");
 +	templ->mysql_col_offset = (ulint) get_field_offset(table, field);
  
 -	mutex_enter(&dict_sys->mutex);
 +	templ->mysql_col_len = (ulint) field->pack_length();
 +	templ->type = col->mtype;
 +	templ->mysql_type = (ulint) field->type();
  
 -	mysql_num_index = table->s->keys;
 -	ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
 +	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
 +		templ->mysql_length_bytes = (ulint)
 +			(((Field_varstring*) field)->length_bytes);
 +	}
  
 -	index_mapping = share->idx_trans_tbl.index_mapping;
 +	templ->charset = dtype_get_charset_coll(col->prtype);
 +	templ->mbminlen = dict_col_get_mbminlen(col);
 +	templ->mbmaxlen = dict_col_get_mbmaxlen(col);
 +	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
  
 -	/* If there exists inconsistency between MySQL and InnoDB dictionary
 -	(metadata) information, the number of index defined in MySQL
 -	could exceed that in InnoDB, do not build index translation
 -	table in such case */
 -	if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
 -		ret = FALSE;
 -		goto func_exit;
 +	if (!dict_index_is_clust(index)
 +	    && templ->rec_field_no == ULINT_UNDEFINED) {
 +		prebuilt->need_to_access_clustered = TRUE;
  	}
  
 -	/* If index entry count is non-zero, nothing has
 -	changed since last update, directly return TRUE */
 -	if (share->idx_trans_tbl.index_count) {
 -		/* Index entry count should still match mysql_num_index */
 -		ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
 -		goto func_exit;
 +	if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
 +	    + templ->mysql_col_len) {
 +		prebuilt->mysql_prefix_len = templ->mysql_col_offset
 +			+ templ->mysql_col_len;
  	}
  
 -	/* The number of index increased, rebuild the mapping table */
 -	if (mysql_num_index > share->idx_trans_tbl.array_size) {
 -		index_mapping = (dict_index_t**) my_realloc(index_mapping,
 -							mysql_num_index *
 -							sizeof(*index_mapping),
 -							MYF(MY_ALLOW_ZERO_PTR));
 +	if (templ->type == DATA_BLOB) {
 +		prebuilt->templ_contains_blob = TRUE;
 +	}
 +
 +	return(templ);
 +}
 +
 +/**************************************************************//**
 +Builds a 'template' to the prebuilt struct. The template is used in fast
 +retrieval of just those column values MySQL needs in its processing. */
 +UNIV_INTERN
 +void
 +ha_innobase::build_template(
 +/*========================*/
 +	bool		whole_row)	/*!< in: true=ROW_MYSQL_WHOLE_ROW,
 +					false=ROW_MYSQL_REC_FIELDS */
 +{
 +	dict_index_t*	index;
 +	dict_index_t*	clust_index;
 +	ulint		n_stored_fields;
 +	ibool		fetch_all_in_key	= FALSE;
 +	ibool		fetch_primary_key_cols	= FALSE;
 +	ulint		i, sql_idx;
 +
 +	if (prebuilt->select_lock_type == LOCK_X) {
 +		/* We always retrieve the whole clustered index record if we
 +		use exclusive row level locks, for example, if the read is
 +		done in an UPDATE statement. */
 +
 +		whole_row = true;
 +	} else if (!whole_row) {
 +		if (prebuilt->hint_need_to_fetch_extra_cols
 +			== ROW_RETRIEVE_ALL_COLS) {
 +
 +			/* We know we must at least fetch all columns in the
 +			key, or all columns in the table */
 +
 +			if (prebuilt->read_just_key) {
 +				/* MySQL has instructed us that it is enough
 +				to fetch the columns in the key; looks like
 +				MySQL can set this flag also when there is
 +				only a prefix of the column in the key: in
 +				that case we retrieve the whole column from
 +				the clustered index */
 +
 +				fetch_all_in_key = TRUE;
 +			} else {
 +				whole_row = true;
 +			}
 +		} else if (prebuilt->hint_need_to_fetch_extra_cols
 +			== ROW_RETRIEVE_PRIMARY_KEY) {
 +			/* We must at least fetch all primary key cols. Note
 +			that if the clustered index was internally generated
 +			by InnoDB on the row id (no primary key was
 +			defined), then row_search_for_mysql() will always
 +			retrieve the row id to a special buffer in the
 +			prebuilt struct. */
  
 -		if (!index_mapping) {
 -			/* Report an error if index_mapping continues to be
 -			NULL and mysql_num_index is a non-zero value */
 -			sql_print_error("InnoDB: fail to allocate memory for "
 -					"index translation table. Number of "
 -					"Index:%lu, array size:%lu",
 -					mysql_num_index,
 -					share->idx_trans_tbl.array_size);
 -			ret = FALSE;
 -			goto func_exit;
 +			fetch_primary_key_cols = TRUE;
  		}
 -
 -		share->idx_trans_tbl.array_size = mysql_num_index;
  	}
  
 -	/* For each index in the mysql key_info array, fetch its
 -	corresponding InnoDB index pointer into index_mapping
 -	array. */
 -	for (ulint count = 0; count < mysql_num_index; count++) {
 +	clust_index = dict_table_get_first_index(prebuilt->table);
  
 -		/* Fetch index pointers into index_mapping according to mysql
 -		index sequence */
 -		index_mapping[count] = dict_table_get_index_on_name(
 -			ib_table, table->key_info[count].name);
 +	index = whole_row ? clust_index : prebuilt->index;
  
 -		if (!index_mapping[count]) {
 -			sql_print_error("Cannot find index %s in InnoDB "
 -					"index dictionary.",
 -					table->key_info[count].name);
 -			ret = FALSE;
 -			goto func_exit;
 -		}
 +	prebuilt->need_to_access_clustered = (index == clust_index);
  
 -		/* Double check fetched index has the same
 -		column info as those in mysql key_info. */
 -		if (!innobase_match_index_columns(&table->key_info[count],
 -					          index_mapping[count])) {
 -			sql_print_error("Found index %s whose column info "
 -					"does not match that of MySQL.",
 -					table->key_info[count].name);
 -			ret = FALSE;
 -			goto func_exit;
 -		}
 -	}
 +	/* Either prebuilt->index should be a secondary index, or it
 +	should be the clustered index. */
 +	ut_ad(dict_index_is_clust(index) == (index == clust_index));
  
 -	/* Successfully built the translation table */
 -	share->idx_trans_tbl.index_count = mysql_num_index;
 +	/* Below we check column by column if we need to access
 +	the clustered index. */
  
 -func_exit:
 -	if (!ret) {
 -		/* Build translation table failed. */
 -		my_free(index_mapping);
 +        n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
  
 -		share->idx_trans_tbl.array_size = 0;
 -		share->idx_trans_tbl.index_count = 0;
 -		index_mapping = NULL;
 +	if (!prebuilt->mysql_template) {
 +		prebuilt->mysql_template = (mysql_row_templ_t*)
 +			mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
  	}
  
 -	share->idx_trans_tbl.index_mapping = index_mapping;
 +	prebuilt->template_type = whole_row
 +		? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
 +	prebuilt->null_bitmap_len = table->s->null_bytes;
  
 -	mutex_exit(&dict_sys->mutex);
 +	/* Prepare to build prebuilt->mysql_template[]. */
 +	prebuilt->templ_contains_blob = FALSE;
 +	prebuilt->mysql_prefix_len = 0;
 +	prebuilt->n_template = 0;
 +	prebuilt->idx_cond_n_cols = 0;
  
 -	DBUG_RETURN(ret);
 -}
 +	/* Note that in InnoDB, i is the column number in the table.
 +	MySQL calls columns 'fields'. */
  
 -/*******************************************************************//**
 -This function uses index translation table to quickly locate the
 -requested index structure.
 -Note we do not have mutex protection for the index translatoin table
 -access, it is based on the assumption that there is no concurrent
 -translation table rebuild (fter create/drop index) and DMLs that
 -require index lookup.
 -@return dict_index_t structure for requested index. NULL if
 -fail to locate the index structure. */
 -static
 -dict_index_t*
 -innobase_index_lookup(
 -/*==================*/
 -	INNOBASE_SHARE*	share,	/*!< in: share structure for index
 -				translation table. */
 -	uint		keynr)	/*!< in: index number for the requested
 -				index */
 -{
 -	if (!share->idx_trans_tbl.index_mapping
 -	    || keynr >= share->idx_trans_tbl.index_count) {
 -		return(NULL);
 -	}
 +	if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
 +		/* Push down an index condition or an end_range check. */
 +		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
  
 -	return(share->idx_trans_tbl.index_mapping[keynr]);
 -}
 +                        while (!table->field[sql_idx]->stored_in_db) {
 +			        sql_idx++;
 +                        }
  
 -/************************************************************************
 -Set the autoinc column max value. This should only be called once from
 -ha_innobase::open(). Therefore there's no need for a covering lock. */
 -UNIV_INTERN
 -void
 -ha_innobase::innobase_initialize_autoinc()
 -/*======================================*/
 -{
 -	ulonglong	auto_inc;
 -	const Field*	field = table->found_next_number_field;
 +			const ibool		index_contains
 +				= dict_index_contains_col_or_prefix(index, i);
 +
 +			/* Test if an end_range or an index condition
 +			refers to the field. Note that "index" and
 +			"index_contains" may refer to the clustered index.
 +			Index condition pushdown is relative to prebuilt->index
 +			(the index that is being looked up first). */
 +
 +			/* When join_read_always_key() invokes this
 +			code via handler::ha_index_init() and
 +			ha_innobase::index_init(), end_range is not
 +			yet initialized. Because of that, we must
 +			always check for index_contains, instead of
 +			the subset
 +			field->part_of_key.is_set(active_index)
 +			which would be acceptable if end_range==NULL. */
 +			if (build_template_needs_field_in_icp(
 +				    index, prebuilt, index_contains, i)) {
 +				/* Needed in ICP */
 +				const Field*		field;
 +				mysql_row_templ_t*	templ;
 +
 +				if (whole_row) {
 +					field = table->field[sql_idx];
 +				} else {
 +					field = build_template_needs_field(
 +						index_contains,
 +						prebuilt->read_just_key,
 +						fetch_all_in_key,
 +						fetch_primary_key_cols,
 +						index, table, i, sql_idx);
 +					if (!field) {
 +						continue;
 +					}
 +				}
  
 -	if (field != NULL) {
 -		auto_inc = innobase_get_int_col_max_value(field);
 -	} else {
 -		/* We have no idea what's been passed in to us as the
 -		autoinc column. We set it to the 0, effectively disabling
 -		updates to the table. */
 -		auto_inc = 0;
 +				templ = build_template_field(
 +					prebuilt, clust_index, index,
 +					table, field, i);
 +				prebuilt->idx_cond_n_cols++;
 +				ut_ad(prebuilt->idx_cond_n_cols
 +				      == prebuilt->n_template);
 +
 +				if (index == prebuilt->index) {
 +					templ->icp_rec_field_no
 +						= templ->rec_field_no;
 +				} else {
 +					templ->icp_rec_field_no
 +						= dict_index_get_nth_col_pos(
 +							prebuilt->index, i);
 +				}
  
 -		ut_print_timestamp(stderr);
 -		fprintf(stderr, "  InnoDB: Unable to determine the AUTOINC "
 -				"column name\n");
 -	}
 +				if (dict_index_is_clust(prebuilt->index)) {
 +					ut_ad(templ->icp_rec_field_no
 +					      != ULINT_UNDEFINED);
 +					/* If the primary key includes
 +					a column prefix, use it in
 +					index condition pushdown,
 +					because the condition is
 +					evaluated before fetching any
 +					off-page (externally stored)
 +					columns. */
 +					if (templ->icp_rec_field_no
 +					    < prebuilt->index->n_uniq) {
 +						/* This is a key column;
 +						all set. */
 +						continue;
 +					}
 +				} else if (templ->icp_rec_field_no
 +					   != ULINT_UNDEFINED) {
 +					continue;
 +				}
  
 -	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
 -		/* If the recovery level is set so high that writes
 -		are disabled we force the AUTOINC counter to 0
 -		value effectively disabling writes to the table.
 -		Secondly, we avoid reading the table in case the read
 -		results in failure due to a corrupted table/index.
 +				/* This is a column prefix index.
 +				The column prefix can be used in
 +				an end_range comparison. */
 +
 +				templ->icp_rec_field_no
 +					= dict_index_get_nth_col_or_prefix_pos(
 +						prebuilt->index, i, TRUE);
 +				ut_ad(templ->icp_rec_field_no
 +				      != ULINT_UNDEFINED);
 +
 +				/* Index condition pushdown can be used on
 +				all columns of a secondary index, and on
 +				the PRIMARY KEY columns. On the clustered
 +				index, it must never be used on other than
 +				PRIMARY KEY columns, because those columns
 +				may be stored off-page, and we will not
 +				fetch externally stored columns before
 +				checking the index condition. */
 +				/* TODO: test the above with an assertion
 +				like this. Note that index conditions are
 +				currently pushed down as part of the
 +				"optimizer phase" while end_range is done
 +				as part of the execution phase. Therefore,
 +				we were unable to use an accurate condition
 +				for end_range in the "if" condition above,
 +				and the following assertion would fail.
 +				ut_ad(!dict_index_is_clust(prebuilt->index)
 +				      || templ->rec_field_no
 +				      < prebuilt->index->n_uniq);
 +				*/
 +			}
 +		}
  
 -		We will not return an error to the client, so that the
 -		tables can be dumped with minimal hassle.  If an error
 -		were returned in this case, the first attempt to read
 -		the table would fail and subsequent SELECTs would succeed. */
 -		auto_inc = 0;
 -	} else if (field == NULL) {
 -		/* This is a far more serious error, best to avoid
 -		opening the table and return failure. */
 -		my_error(ER_AUTOINC_READ_FAILED, MYF(0));
 +		ut_ad(prebuilt->idx_cond_n_cols > 0);
 +		ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
 +
 +		/* Include the fields that are not needed in index condition
 +		pushdown. */
 +		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 +
 +                        while (!table->field[sql_idx]->stored_in_db) {
 +			        sql_idx++;
 +                        }
 +
 +			const ibool		index_contains
 +				= dict_index_contains_col_or_prefix(index, i);
 +
 +			if (!build_template_needs_field_in_icp(
 +				    index, prebuilt, index_contains, i)) {
 +				/* Not needed in ICP */
 +				const Field*	field;
 +
 +				if (whole_row) {
 +					field = table->field[sql_idx];
 +				} else {
 +					field = build_template_needs_field(
 +						index_contains,
 +						prebuilt->read_just_key,
 +						fetch_all_in_key,
 +						fetch_primary_key_cols,
 +						index, table, i, sql_idx);
 +					if (!field) {
 +						continue;
 +					}
 +				}
 +
 +				build_template_field(prebuilt,
 +						     clust_index, index,
 +						     table, field, i);
 +			}
 +		}
 +
 +		prebuilt->idx_cond = this;
  	} else {
 -		dict_index_t*	index;
 -		const char*	col_name;
 -		ulonglong	read_auto_inc;
 -		ulint		err;
 +		/* No index condition pushdown */
 +		prebuilt->idx_cond = NULL;
  
 -		update_thd(ha_thd());
 +		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 +			const Field*	field;
  
 -		ut_a(prebuilt->trx == thd_to_trx(user_thd));
 +                        while (!table->field[sql_idx]->stored_in_db) {
 +			        sql_idx++;
 +                        }
  
 -		col_name = field->field_name;
 -		index = innobase_get_index(table->s->next_number_index);
 +			if (whole_row) {
 +				field = table->field[sql_idx];
 +			} else {
 +				field = build_template_needs_field(
 +					dict_index_contains_col_or_prefix(
 +						index, i),
 +					prebuilt->read_just_key,
 +					fetch_all_in_key,
 +					fetch_primary_key_cols,
 +					index, table, i, sql_idx);
 +				if (!field) {
 +					continue;
 +				}
 +			}
  
 -		/* Execute SELECT MAX(col_name) FROM TABLE; */
 -		err = row_search_max_autoinc(index, col_name, &read_auto_inc);
 +			build_template_field(prebuilt, clust_index, index,
 +					     table, field, i);
 +		}
 +	}
  
 -		switch (err) {
 -		case DB_SUCCESS: {
 -			ulonglong	col_max_value;
 +	if (index != clust_index && prebuilt->need_to_access_clustered) {
 +		/* Change rec_field_no's to correspond to the clustered index
 +		record */
 +		for (i = 0; i < prebuilt->n_template; i++) {
  
 -			col_max_value = innobase_get_int_col_max_value(field);
 +			mysql_row_templ_t*	templ
 +				= &prebuilt->mysql_template[i];
  
 -			/* At the this stage we do not know the increment
 -			nor the offset, so use a default increment of 1. */
 +			templ->rec_field_no = templ->clust_rec_field_no;
 +		}
 +	}
 +}
  
 -			auto_inc = innobase_next_autoinc(
 -				read_auto_inc, 1, 1, 0, col_max_value);
 +/********************************************************************//**
 +This special handling is really to overcome the limitations of MySQL's
 +binlogging. We need to eliminate the non-determinism that will arise in
 +INSERT ... SELECT type of statements, since MySQL binlog only stores the
 +min value of the autoinc interval. Once that is fixed we can get rid of
 +the special lock handling.
 +@return	DB_SUCCESS if all OK else error code */
 +UNIV_INTERN
 +dberr_t
 +ha_innobase::innobase_lock_autoinc(void)
 +/*====================================*/
 +{
 +	DBUG_ENTER("ha_innobase::innobase_lock_autoinc");
 +	dberr_t		error = DB_SUCCESS;
  
 -			break;
 +	ut_ad(!srv_read_only_mode);
 +
 +	switch (innobase_autoinc_lock_mode) {
 +	case AUTOINC_NO_LOCKING:
 +		/* Acquire only the AUTOINC mutex. */
 +		dict_table_autoinc_lock(prebuilt->table);
 +		break;
 +
 +	case AUTOINC_NEW_STYLE_LOCKING:
 +		/* For simple (single/multi) row INSERTs/REPLACEs and RBR
 +		events, we fallback to the old style only if another
 +		transaction has already acquired the AUTOINC lock on
 +		behalf of a LOAD FILE or INSERT ... SELECT etc. type of
 +		statement. */
 +		if (thd_sql_command(user_thd) == SQLCOM_INSERT
 +		    || thd_sql_command(user_thd) == SQLCOM_REPLACE
 +		    || thd_sql_command(user_thd) == SQLCOM_END // RBR event
 +		) {
 +			dict_table_t*	ib_table = prebuilt->table;
 +
 +			/* Acquire the AUTOINC mutex. */
 +			dict_table_autoinc_lock(ib_table);
 +
 +			/* We need to check that another transaction isn't
 +			already holding the AUTOINC lock on the table. */
 +			if (ib_table->n_waiting_or_granted_auto_inc_locks) {
 +				/* Release the mutex to avoid deadlocks and
 +				fall back to old style locking. */
 +				dict_table_autoinc_unlock(ib_table);
 +			} else {
 +				/* Do not fall back to old style locking. */
 +				break;
 +			}
  		}
 -		case DB_RECORD_NOT_FOUND:
 -			ut_print_timestamp(stderr);
 -			fprintf(stderr, "  InnoDB: MySQL and InnoDB data "
 -				"dictionaries are out of sync.\n"
 -				"InnoDB: Unable to find the AUTOINC column "
 -				"%s in the InnoDB table %s.\n"
 -				"InnoDB: We set the next AUTOINC column "
 -				"value to 0,\n"
 -				"InnoDB: in effect disabling the AUTOINC "
 -				"next value generation.\n"
 -				"InnoDB: You can either set the next "
 -				"AUTOINC value explicitly using ALTER TABLE\n"
 -				"InnoDB: or fix the data dictionary by "
 -				"recreating the table.\n",
 -				col_name, index->table->name);
 +		/* Use old style locking. */
 +		/* fall through */
 +	case AUTOINC_OLD_STYLE_LOCKING:
 +		DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used",
 +				ut_ad(0););
 +		error = row_lock_table_autoinc_for_mysql(prebuilt);
  
 -			/* This will disable the AUTOINC generation. */
 -			auto_inc = 0;
 +		if (error == DB_SUCCESS) {
  
 -			/* We want the open to succeed, so that the user can
 -			take corrective action. ie. reads should succeed but
 -			updates should fail. */
 -			err = DB_SUCCESS;
 -			break;
 -		default:
 -			/* row_search_max_autoinc() should only return
 -			one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
 -			ut_error;
 +			/* Acquire the AUTOINC mutex. */
 +			dict_table_autoinc_lock(prebuilt->table);
  		}
 +		break;
 +
 +	default:
 +		ut_error;
  	}
  
 -	dict_table_autoinc_initialize(prebuilt->table, auto_inc);
 +	DBUG_RETURN(error);
  }
  
 -/*****************************************************************//**
 -Creates and opens a handle to a table which already exists in an InnoDB
 -database.
 -@return	1 if error, 0 if success */
 -UNIV_INTERN
 -int
 -ha_innobase::open(
 -/*==============*/
 -	const char*		name,		/*!< in: table name */
 -	int			mode,		/*!< in: not used */
 -	uint			test_if_locked)	/*!< in: not used */
 +/********************************************************************//**
 +Reset the autoinc value in the table.
 +@return	DB_SUCCESS if all went well else error code */
 +UNIV_INTERN
 +dberr_t
 +ha_innobase::innobase_reset_autoinc(
 +/*================================*/
 +	ulonglong	autoinc)	/*!< in: value to store */
  {
 -	dict_table_t*		ib_table;
 -	char			norm_name[1000];
 -	THD*			thd;
 -	char*			is_part = NULL;
 -	ibool			par_case_name_set = FALSE;
 -	char			par_case_name[MAX_FULL_NAME_LEN + 1];
 -	dict_err_ignore_t	ignore_err = DICT_ERR_IGNORE_NONE;
 +	dberr_t		error;
  
 -	DBUG_ENTER("ha_innobase::open");
 +	error = innobase_lock_autoinc();
  
 -	UT_NOT_USED(mode);
 -	UT_NOT_USED(test_if_locked);
 +	if (error == DB_SUCCESS) {
  
 -	thd = ha_thd();
 +		dict_table_autoinc_initialize(prebuilt->table, autoinc);
  
 -	/* Under some cases MySQL seems to call this function while
 -	holding btr_search_latch. This breaks the latching order as
 -	we acquire dict_sys->mutex below and leads to a deadlock. */
 -	if (thd != NULL) {
 -		innobase_release_temporary_latches(ht, thd);
 +		dict_table_autoinc_unlock(prebuilt->table);
  	}
  
 -	normalize_table_name(norm_name, name);
 -
 -	user_thd = NULL;
 +	return(error);
 +}
  
 -	if (!(share=get_share(name))) {
 +/********************************************************************//**
 +Store the autoinc value in the table. The autoinc value is only set if
 +it's greater than the existing autoinc value in the table.
 +@return	DB_SUCCESS if all went well else error code */
 +UNIV_INTERN
 +dberr_t
 +ha_innobase::innobase_set_max_autoinc(
 +/*==================================*/
 +	ulonglong	auto_inc)	/*!< in: value to store */
 +{
 +	dberr_t		error;
  
 -		DBUG_RETURN(1);
 -	}
 +	error = innobase_lock_autoinc();
  
 -	/* Will be allocated if it is needed in ::update_row() */
 -	upd_buf = NULL;
 -	upd_buf_size = 0;
 +	if (error == DB_SUCCESS) {
  
 -	/* We look for pattern #P# to see if the table is partitioned
 -	MySQL table. */
 -#ifdef __WIN__
 -	is_part = strstr(norm_name, "#p#");
 -#else
 -	is_part = strstr(norm_name, "#P#");
 -#endif /* __WIN__ */
 +		dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
  
 -	/* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table
 -	can be opened even if some FK indexes are missing. If not, the table
 -	can't be opened in the same situation */
 -	if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) {
 -		ignore_err = DICT_ERR_IGNORE_FK_NOKEY;
 +		dict_table_autoinc_unlock(prebuilt->table);
  	}
  
 -	/* Get pointer to a table object in InnoDB dictionary cache */
 -	ib_table = dict_table_get(norm_name, TRUE, ignore_err);
 +	return(error);
 +}
  
 -	if (NULL == ib_table) {
 -		if (is_part) {
 -			/* MySQL partition engine hard codes the file name
 -			separator as "#P#". The text case is fixed even if
 -			lower_case_table_names is set to 1 or 2. This is true
 -			for sub-partition names as well. InnoDB always
 -			normalises file names to lower case on Windows, this
 -			can potentially cause problems when copying/moving
 -			tables between platforms.
 +/********************************************************************//**
 +Stores a row in an InnoDB database, to the table specified in this
 +handle.
 +@return	error code */
 +UNIV_INTERN
 +int
 +ha_innobase::write_row(
 +/*===================*/
 +	uchar*	record)	/*!< in: a row in MySQL format */
 +{
 +	dberr_t		error;
 +	int		error_result= 0;
 +	ibool		auto_inc_used= FALSE;
 +#ifdef WITH_WSREP
 +	ibool           auto_inc_inserted= FALSE; /* if NULL was inserted */
 +#endif
 +	ulint		sql_command;
 +	trx_t*		trx = thd_to_trx(user_thd);
  
 -			1) If boot against an installation from Windows
 -			platform, then its partition table name could
 -			be all be in lower case in system tables. So we
 -			will need to check lower case name when load table.
 +	DBUG_ENTER("ha_innobase::write_row");
  
 -			2) If  we boot an installation from other case
 -			sensitive platform in Windows, we might need to
 -			check the existence of table name without lowering
 -			case them in the system table. */
 -			if (innobase_get_lower_case_table_names() == 1) {
 +	if (high_level_read_only) {
 +		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
 +		DBUG_RETURN(HA_ERR_TABLE_READONLY);
 +	} else if (prebuilt->trx != trx) {
 +		sql_print_error("The transaction object for the table handle "
 +				"is at %p, but for the current thread it is at "
 +				"%p",
 +				(const void*) prebuilt->trx, (const void*) trx);
  
 -				if (!par_case_name_set) {
 -#ifndef __WIN__
 -					/* Check for the table using lower
 -					case name, including the partition
 -					separator "P" */
 -					memcpy(par_case_name, norm_name,
 -					       strlen(norm_name));
 -					par_case_name[strlen(norm_name)] = 0;
 -					innobase_casedn_str(par_case_name);
 -#else
 -					/* On Windows platfrom, check
 -					whether there exists table name in
 -					system table whose name is
 -					not being normalized to lower case */
 -					normalize_table_name_low(
 -						par_case_name, name, FALSE);
 -#endif
 -					par_case_name_set = TRUE;
 -				}
 +		fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
 +		ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
 +		fputs("\n"
 +			"InnoDB: Dump of 200 bytes around ha_data: ",
 +			stderr);
 +		ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
 +		putc('\n', stderr);
 +		ut_error;
 +	} else if (!trx_is_started(trx)) {
 +		++trx->will_lock;
 +	}
  
 -				ib_table = dict_table_get(
 -					par_case_name, TRUE, ignore_err);
 -			}
 -			if (ib_table) {
 -#ifndef __WIN__
 -				sql_print_warning("Partition table %s opened "
 -						  "after converting to lower "
 -						  "case. The table may have "
 -						  "been moved from a case "
 -						  "in-sensitive file system. "
 -						  "Please recreate table in "
 -						  "the current file system\n",
 -						  norm_name);
 -#else
 -				sql_print_warning("Partition table %s opened "
 -						  "after skipping the step to "
 -						  "lower case the table name. "
 -						  "The table may have been "
 -						  "moved from a case sensitive "
 -						  "file system. Please "
 -						  "recreate table in the "
 -						  "current file system\n",
 -						  norm_name);
 -#endif
 -				goto table_opened;
 -			}
 -		}
 +	ha_statistic_increment(&SSV::ha_write_count);
  
 -		if (is_part) {
 -			sql_print_error("Failed to open table %s.\n",
 -					norm_name);
 -		}
 +	sql_command = thd_sql_command(user_thd);
  
 -		sql_print_error("Cannot find or open table %s from\n"
 -				"the internal data dictionary of InnoDB "
 -				"though the .frm file for the\n"
 -				"table exists. Maybe you have deleted and "
 -				"recreated InnoDB data\n"
 -				"files but have forgotten to delete the "
 -				"corresponding .frm files\n"
 -				"of InnoDB tables, or you have moved .frm "
 -				"files to another database?\n"
 -				"or, the table contains indexes that this "
 -				"version of the engine\n"
 -				"doesn't support.\n"
 -				"See " REFMAN "innodb-troubleshooting.html\n"
 -				"how you can resolve the problem.\n",
 -				norm_name);
 -		free_share(share);
 -		my_errno = ENOENT;
 +	if ((sql_command == SQLCOM_ALTER_TABLE
 +	     || sql_command == SQLCOM_OPTIMIZE
 +	     || sql_command == SQLCOM_CREATE_INDEX
 +#ifdef WITH_WSREP
 +	     || (wsrep_on(user_thd) && wsrep_load_data_splitting &&
 +		 sql_command == SQLCOM_LOAD                      &&
 +		 !thd_test_options(
 +			user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
 +#endif /* WITH_WSREP */
 +	     || sql_command == SQLCOM_DROP_INDEX)
 +	    && num_write_row >= 10000) {
 +#ifdef WITH_WSREP
 +		if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) {
 +			WSREP_DEBUG("forced trx split for LOAD: %s", 
 +				    wsrep_thd_query(user_thd));
 +		}
 +#endif /* WITH_WSREP */
 +		/* ALTER TABLE is COMMITted at every 10000 copied rows.
 +		The IX table lock for the original table has to be re-issued.
 +		As this method will be called on a temporary table where the
 +		contents of the original table is being copied to, it is
 +		a bit tricky to determine the source table.  The cursor
 +		position in the source table need not be adjusted after the
 +		intermediate COMMIT, since writes by other transactions are
 +		being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
  
 -		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
 -	}
 +		dict_table_t*	src_table;
 +		enum lock_mode	mode;
  
 -table_opened:
 +		num_write_row = 0;
  
 -	if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
 -		sql_print_error("MySQL is trying to open a table handle but "
 -				"the .ibd file for\ntable %s does not exist.\n"
 -				"Have you deleted the .ibd file from the "
 -				"database directory under\nthe MySQL datadir, "
 -				"or have you used DISCARD TABLESPACE?\n"
 -				"See " REFMAN "innodb-troubleshooting.html\n"
 -				"how you can resolve the problem.\n",
 -				norm_name);
 -		free_share(share);
 -		my_errno = ENOENT;
 +		/* Commit the transaction.  This will release the table
 +		locks, so they have to be acquired again. */
  
 -		dict_table_decrement_handle_count(ib_table, FALSE);
 -		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
 -	}
 +		/* Altering an InnoDB table */
 +		/* Get the source table. */
 +		src_table = lock_get_src_table(
 +				prebuilt->trx, prebuilt->table, &mode);
 +		if (!src_table) {
 +no_commit:
 +			/* Unknown situation: do not commit */
 +			/*
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				"  InnoDB: ALTER TABLE is holding lock"
 +				" on %lu tables!\n",
 +				prebuilt->trx->mysql_n_tables_locked);
 +			*/
 +			;
 +		} else if (src_table == prebuilt->table) {
 +#ifdef WITH_WSREP
 +			if (wsrep_on(user_thd)                              &&
 +			    wsrep_load_data_splitting                       &&
 +			    sql_command == SQLCOM_LOAD                      &&
 +			    !thd_test_options(user_thd,
 +			                      OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
 +			{
 +				switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
 +				{
 +				case WSREP_TRX_OK:
 +				  break;
 +				case WSREP_TRX_SIZE_EXCEEDED:
 +				case WSREP_TRX_CERT_FAIL:
 +				case WSREP_TRX_ERROR:
 +				  DBUG_RETURN(1);
 +				}
  
 -	prebuilt = row_create_prebuilt(ib_table, table->s->reclength);
 +				if (binlog_hton->commit(binlog_hton, user_thd, 1))
 +					DBUG_RETURN(1);
 +				wsrep_post_commit(user_thd, TRUE);
 +			}
 +#endif /* WITH_WSREP */
 +			/* Source table is not in InnoDB format:
 +			no need to re-acquire locks on it. */
  
 -	prebuilt->default_rec = table->s->default_values;
 -	ut_ad(prebuilt->default_rec);
 +			/* Altering to InnoDB format */
 +			innobase_commit(ht, user_thd, 1);
 +			/* Note that this transaction is still active. */
 +			trx_register_for_2pc(prebuilt->trx);
 +			/* We will need an IX lock on the destination table. */
 +			prebuilt->sql_stat_start = TRUE;
 +		} else {
 +#ifdef WITH_WSREP
 +			if (wsrep_on(user_thd)                              &&
 +			    wsrep_load_data_splitting                       &&
 +			    sql_command == SQLCOM_LOAD                      &&
 +			    !thd_test_options(user_thd,
 +			                      OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
 +			{
 +				switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
 +				{
 +				case WSREP_TRX_OK:
 +				  break;
 +				case WSREP_TRX_SIZE_EXCEEDED:
 +				case WSREP_TRX_CERT_FAIL:
 +				case WSREP_TRX_ERROR:
 +				  DBUG_RETURN(1);
 +				}
  
 -	/* Looks like MySQL-3.23 sometimes has primary key number != 0 */
 +				if (binlog_hton->commit(binlog_hton, user_thd, 1))
 +					DBUG_RETURN(1);
 +				wsrep_post_commit(user_thd, TRUE);
 +			}
 +#endif /* WITH_WSREP */
 +			/* Ensure that there are no other table locks than
 +			LOCK_IX and LOCK_AUTO_INC on the destination table. */
  
 -	primary_key = table->s->primary_key;
 -	key_used_on_scan = primary_key;
 +			if (!lock_is_table_exclusive(prebuilt->table,
 +							prebuilt->trx)) {
 +				goto no_commit;
 +			}
  
 -	if (!innobase_build_index_translation(table, ib_table, share)) {
 -		  sql_print_error("Build InnoDB index translation table for"
 -				  " Table %s failed", name);
 +			/* Commit the transaction.  This will release the table
 +			locks, so they have to be acquired again. */
 +			innobase_commit(ht, user_thd, 1);
 +			/* Note that this transaction is still active. */
 +			trx_register_for_2pc(prebuilt->trx);
 +			/* Re-acquire the table lock on the source table. */
 +			row_lock_table_for_mysql(prebuilt, src_table, mode);
 +			/* We will need an IX lock on the destination table. */
 +			prebuilt->sql_stat_start = TRUE;
 +		}
  	}
  
 -	/* Allocate a buffer for a 'row reference'. A row reference is
 -	a string of bytes of length ref_length which uniquely specifies
 -	a row in our table. Note that MySQL may also compare two row
 -	references for equality by doing a simple memcmp on the strings
 -	of length ref_length! */
 -
 -	if (!row_table_got_default_clust_index(ib_table)) {
 -
 -		prebuilt->clust_index_was_generated = FALSE;
 +	num_write_row++;
  
 -		if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
 -			sql_print_error("Table %s has a primary key in "
 -					"InnoDB data dictionary, but not "
 -					"in MySQL!", name);
 +	/* This is the case where the table has an auto-increment column */
 +	if (table->next_number_field && record == table->record[0]) {
  
 -			/* This mismatch could cause further problems
 -			if not attended, bring this to the user's attention
 -			by printing a warning in addition to log a message
 -			in the errorlog */
 -			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
 -					    ER_NO_SUCH_INDEX,
 -					    "InnoDB: Table %s has a "
 -					    "primary key in InnoDB data "
 -					    "dictionary, but not in "
 -					    "MySQL!", name);
 +		/* Reset the error code before calling
 +		innobase_get_auto_increment(). */
 +		prebuilt->autoinc_error = DB_SUCCESS;
  
 -			/* If primary_key >= MAX_KEY, its (primary_key)
 -			value could be out of bound if continue to index
 -			into key_info[] array. Find InnoDB primary index,
 -			and assign its key_length to ref_length.
 -			In addition, since MySQL indexes are sorted starting
 -			with primary index, unique index etc., initialize
 -			ref_length to the first index key length in
 -			case we fail to find InnoDB cluster index.
 +#ifdef WITH_WSREP
 +		auto_inc_inserted= (table->next_number_field->val_int() == 0);
 +#endif
  
 -			Please note, this will not resolve the primary
 -			index mismatch problem, other side effects are
 -			possible if users continue to use the table.
 -			However, we allow this table to be opened so
 -			that user can adopt necessary measures for the
 -			mismatch while still being accessible to the table
 -			date. */
 -			ref_length = table->key_info[0].key_length;
 +		if ((error_result = update_auto_increment())) {
 +			/* We don't want to mask autoinc overflow errors. */
  
 -			/* Find correspoinding cluster index
 -			key length in MySQL's key_info[] array */
 -			for (ulint i = 0; i < table->s->keys; i++) {
 -				dict_index_t*	index;
 -				index = innobase_get_index(i);
 -				if (dict_index_is_clust(index)) {
 -					ref_length =
 -						 table->key_info[i].key_length;
 -				}
 +			/* Handle the case where the AUTOINC sub-system
 +			failed during initialization. */
 +			if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
 +				error_result = ER_AUTOINC_READ_FAILED;
 +				/* Set the error message to report too. */
 +				my_error(ER_AUTOINC_READ_FAILED, MYF(0));
 +				goto func_exit;
 +			} else if (prebuilt->autoinc_error != DB_SUCCESS) {
 +				error = prebuilt->autoinc_error;
 +				goto report_error;
  			}
 -		} else {
 -			/* MySQL allocates the buffer for ref.
 -			key_info->key_length includes space for all key
 -			columns + one byte for each column that may be
 -			NULL. ref_length must be as exact as possible to
 -			save space, because all row reference buffers are
 -			allocated based on ref_length. */
 -
 -			ref_length = table->key_info[primary_key].key_length;
 -		}
 -	} else {
 -		if (primary_key != MAX_KEY) {
 -			sql_print_error(
 -				"Table %s has no primary key in InnoDB data "
 -				"dictionary, but has one in MySQL! If you "
 -				"created the table with a MySQL version < "
 -				"3.23.54 and did not define a primary key, "
 -				"but defined a unique key with all non-NULL "
 -				"columns, then MySQL internally treats that "
 -				"key as the primary key. You can fix this "
 -				"error by dump + DROP + CREATE + reimport "
 -				"of the table.", name);
  
 -			/* This mismatch could cause further problems
 -			if not attended, bring this to the user attention
 -			by printing a warning in addition to log a message
 -			in the errorlog */
 -			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
 -					    ER_NO_SUCH_INDEX,
 -					    "InnoDB: Table %s has no "
 -					    "primary key in InnoDB data "
 -					    "dictionary, but has one in "
 -					    "MySQL!", name);
 +			/* MySQL errors are passed straight back. */
 +			goto func_exit;
  		}
  
 -		prebuilt->clust_index_was_generated = TRUE;
 -
 -		ref_length = DATA_ROW_ID_LEN;
 -
 -		/* If we automatically created the clustered index, then
 -		MySQL does not know about it, and MySQL must NOT be aware
 -		of the index used on scan, to make it avoid checking if we
 -		update the column of the index. That is why we assert below
 -		that key_used_on_scan is the undefined value MAX_KEY.
 -		The column is the row id in the automatical generation case,
 -		and it will never be updated anyway. */
 -
 -		if (key_used_on_scan != MAX_KEY) {
 -			sql_print_warning(
 -				"Table %s key_used_on_scan is %lu even "
 -				"though there is no primary key inside "
 -				"InnoDB.", name, (ulong) key_used_on_scan);
 -		}
 +		auto_inc_used = TRUE;
  	}
  
 -	/* Index block size in InnoDB: used by MySQL in query optimization */
 -	stats.block_size = 16 * 1024;
 -
 -	/* Init table lock structure */
 -	thr_lock_data_init(&share->lock,&lock,(void*) 0);
 +	if (prebuilt->mysql_template == NULL
 +	    || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
  
 -	if (prebuilt->table) {
 -		/* We update the highest file format in the system table
 -		space, if this table has higher file format setting. */
 +		/* Build the template used in converting quickly between
 +		the two database formats */
  
 -		trx_sys_file_format_max_upgrade(
 -			(const char**) &innobase_file_format_max,
 -			dict_table_get_format(prebuilt->table));
 +		build_template(true);
  	}
  
 -	/* Only if the table has an AUTOINC column. */
 -	if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
 -		dict_table_autoinc_lock(prebuilt->table);
 -
 -		/* Since a table can already be "open" in InnoDB's internal
 -		data dictionary, we only init the autoinc counter once, the
 -		first time the table is loaded. We can safely reuse the
 -		autoinc value from a previous MySQL open. */
 -		if (dict_table_autoinc_read(prebuilt->table) == 0) {
 -
 -			innobase_initialize_autoinc();
 -		}
 +	innobase_srv_conc_enter_innodb(prebuilt->trx);
  
 -		dict_table_autoinc_unlock(prebuilt->table);
 -	}
 +	error = row_insert_for_mysql((byte*) record, prebuilt);
 +	DEBUG_SYNC(user_thd, "ib_after_row_insert");
  
 -	info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
 +	/* Handle duplicate key errors */
 +	if (auto_inc_used) {
 +		ulonglong	auto_inc;
 +		ulonglong	col_max_value;
  
 -	DBUG_RETURN(0);
 -}
 +		/* Note the number of rows processed for this statement, used
 +		by get_auto_increment() to determine the number of AUTO-INC
 +		values to reserve. This is only useful for a mult-value INSERT
 +		and is a statement level counter.*/
 +		if (trx->n_autoinc_rows > 0) {
 +			--trx->n_autoinc_rows;
 +		}
  
 -UNIV_INTERN
 -handler*
 -ha_innobase::clone(
 -/*===============*/
 -	const char*	name,		/*!< in: table name */
 -	MEM_ROOT*	mem_root)	/*!< in: memory context */
 -{
 -	ha_innobase* new_handler;
 +		/* We need the upper limit of the col type to check for
 +		whether we update the table autoinc counter or not. */
 +		col_max_value = innobase_get_int_col_max_value(
 +			table->next_number_field);
  
 -	DBUG_ENTER("ha_innobase::clone");
 +		/* Get the value that MySQL attempted to store in the table.*/
 +		auto_inc = table->next_number_field->val_uint();
  
 -	new_handler = static_cast<ha_innobase*>(handler::clone(name,
 -							       mem_root));
 -	if (new_handler) {
 -		DBUG_ASSERT(new_handler->prebuilt != NULL);
 -		DBUG_ASSERT(new_handler->user_thd == user_thd);
 -		DBUG_ASSERT(new_handler->prebuilt->trx == prebuilt->trx);
 +		switch (error) {
 +		case DB_DUPLICATE_KEY:
  
 -		new_handler->prebuilt->select_lock_type
 -			= prebuilt->select_lock_type;
 -	}
 +			/* A REPLACE command and LOAD DATA INFILE REPLACE
 +			handle a duplicate key error themselves, but we
 +			must update the autoinc counter if we are performing
 +			those statements. */
  
 -	DBUG_RETURN(new_handler);
 -}
 +			switch (sql_command) {
 +			case SQLCOM_LOAD:
 +				if (trx->duplicates) {
  
 -UNIV_INTERN
 -uint
 -ha_innobase::max_supported_key_part_length() const
 -{
 -	/* A table format specific index column length check will be performed
 -	at ha_innobase::add_index() and row_create_index_for_mysql() */
 -	return(innobase_large_prefix
 -		? REC_VERSION_56_MAX_INDEX_COL_LEN
 -		: REC_ANTELOPE_MAX_INDEX_COL_LEN - 1);
 -}
 +					goto set_max_autoinc;
 +				}
 +				break;
  
 -/******************************************************************//**
 -Closes a handle to an InnoDB table.
 -@return	0 */
 -UNIV_INTERN
 -int
 -ha_innobase::close(void)
 -/*====================*/
 -{
 -	THD*	thd;
 +			case SQLCOM_REPLACE:
 +			case SQLCOM_INSERT_SELECT:
 +			case SQLCOM_REPLACE_SELECT:
 +				goto set_max_autoinc;
  
 -	DBUG_ENTER("ha_innobase::close");
 +#ifdef WITH_WSREP
 +			/* workaround for LP bug #355000, retrying the insert */
 +			case SQLCOM_INSERT:
  
 -	thd = ha_thd();
 -	if (thd != NULL) {
 -		innobase_release_temporary_latches(ht, thd);
 -	}
 +				WSREP_DEBUG("DUPKEY error for autoinc\n"
 +				      "THD %ld, value %llu, off %llu inc %llu",
 +				      wsrep_thd_thread_id(current_thd),
 +				      auto_inc,
 +				      prebuilt->autoinc_offset,
 +				      prebuilt->autoinc_increment);
  
 -	row_prebuilt_free(prebuilt, FALSE);
 +                               if (wsrep_on(current_thd)                     &&
 +                                   auto_inc_inserted                         &&
 +                                   wsrep_drupal_282555_workaround            &&
 +                                   wsrep_thd_retry_counter(current_thd) == 0 &&
 +				    !thd_test_options(current_thd, 
 +						      OPTION_NOT_AUTOCOMMIT | 
 +						      OPTION_BEGIN)) {
 +					WSREP_DEBUG(
 +					    "retrying insert: %s",
 +					    (*wsrep_thd_query(current_thd)) ? 
 +						wsrep_thd_query(current_thd) : 
 +						(char *)"void");
 +					error= DB_SUCCESS;
 +					wsrep_thd_set_conflict_state(
 +						current_thd, MUST_ABORT);
 +                                        innobase_srv_conc_exit_innodb(prebuilt->trx);
 +                                        /* jump straight to func exit over
 +                                         * later wsrep hooks */
 +                                        goto func_exit;
 +				}
 +                                break;
 +#endif /* WITH_WSREP */
  
 -	if (upd_buf != NULL) {
 -		ut_ad(upd_buf_size != 0);
 -		my_free(upd_buf);
 -		upd_buf = NULL;
 -		upd_buf_size = 0;
 -	}
 +			default:
 +				break;
 +			}
  
 -	free_share(share);
 +			break;
  
 -	/* Tell InnoDB server that there might be work for
 -	utility threads: */
 +		case DB_SUCCESS:
 +			/* If the actual value inserted is greater than
 +			the upper limit of the interval, then we try and
 +			update the table upper limit. Note: last_value
 +			will be 0 if get_auto_increment() was not called.*/
  
 -	srv_active_wake_master_thread();
 +			if (auto_inc >= prebuilt->autoinc_last_value) {
 +set_max_autoinc:
 +				/* This should filter out the negative
 +				values set explicitly by the user. */
 +				if (auto_inc <= col_max_value) {
 +					ut_a(prebuilt->autoinc_increment > 0);
  
 -	DBUG_RETURN(0);
 -}
 +					ulonglong	offset;
 +					ulonglong	increment;
 +					dberr_t		err;
  
 -/* The following accessor functions should really be inside MySQL code! */
 +					offset = prebuilt->autoinc_offset;
 +					increment = prebuilt->autoinc_increment;
  
 -/**************************************************************//**
 -Gets field offset for a field in a table.
 -@return	offset */
 -static inline
 -uint
 -get_field_offset(
 -/*=============*/
 -	TABLE*	table,	/*!< in: MySQL table object */
 -	Field*	field)	/*!< in: MySQL field object */
 -{
 -	return((uint) (field->ptr - table->record[0]));
 -}
 +					auto_inc = innobase_next_autoinc(
 +						auto_inc,
 +						1, increment, offset,
 +						col_max_value);
  
 -/**************************************************************//**
 -Checks if a field in a record is SQL NULL. Uses the record format
 -information in table to track the null bit in record.
 -@return	1 if NULL, 0 otherwise */
 -static inline
 -uint
 -field_in_record_is_null(
 -/*====================*/
 -	TABLE*	table,	/*!< in: MySQL table object */
 -	Field*	field,	/*!< in: MySQL field object */
 -	char*	record)	/*!< in: a row in MySQL format */
 -{
 -	int	null_offset;
 +					err = innobase_set_max_autoinc(
 +						auto_inc);
 +
 +					if (err != DB_SUCCESS) {
 +						error = err;
 +					}
 +				}
 +			}
 +			break;
 +		default:
 +			break;
 +		}
 +	}
  
 -	if (!field->null_ptr) {
 +	innobase_srv_conc_exit_innodb(prebuilt->trx);
  
 -		return(0);
 +report_error:
 +	if (error == DB_TABLESPACE_DELETED) {
 +		ib_senderrf(
 +			trx->mysql_thd, IB_LOG_LEVEL_ERROR,
 +			ER_TABLESPACE_DISCARDED,
 +			table->s->table_name.str);
  	}
  
 -	null_offset = (uint) ((char*) field->null_ptr
 -					- (char*) table->record[0]);
 +	error_result = convert_error_code_to_mysql(error,
 +						   prebuilt->table->flags,
 +						   user_thd);
  
 -	if (record[null_offset] & field->null_bit) {
 +#ifdef WITH_WSREP
- 	if (!error_result                                &&
- 	    wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
- 	    wsrep_on(user_thd)                           &&
- 	    !wsrep_consistency_check(user_thd)           &&
- 	    !wsrep_thd_skip_append_keys(user_thd))
- 	{
- 		if (wsrep_append_keys(user_thd, false, record, NULL))
- 		{
++	if (!error_result
++	    && wsrep_on(user_thd)
++	    && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE
++	    && !wsrep_consistency_check(user_thd)
++	    && !wsrep_thd_skip_append_keys(user_thd)) {
++		if (wsrep_append_keys(user_thd, false, record, NULL)) {
 +			DBUG_PRINT("wsrep", ("row key failed"));
 +			error_result = HA_ERR_INTERNAL_ERROR;
 +			goto wsrep_error;
 +		}
 +	}
 +wsrep_error:
 +#endif /* WITH_WSREP */
  
 -		return(1);
 +	if (error_result == HA_FTS_INVALID_DOCID) {
 +		my_error(HA_FTS_INVALID_DOCID, MYF(0));
  	}
  
 -	return(0);
 +func_exit:
 +	innobase_active_small();
 +
 +	DBUG_RETURN(error_result);
  }
  
 -/*************************************************************//**
 -InnoDB uses this function to compare two data fields for which the data type
 -is such that we must use MySQL code to compare them. NOTE that the prototype
 -of this function is in rem0cmp.c in InnoDB source code! If you change this
 -function, remember to update the prototype there!
 -@return	1, 0, -1, if a is greater, equal, less than b, respectively */
 -extern "C" UNIV_INTERN
 -int
 -innobase_mysql_cmp(
 -/*===============*/
 -	int		mysql_type,	/*!< in: MySQL type */
 -	uint		charset_number,	/*!< in: number of the charset */
 -	const unsigned char* a,		/*!< in: data field */
 -	unsigned int	a_length,	/*!< in: data field length,
 -					not UNIV_SQL_NULL */
 -	const unsigned char* b,		/*!< in: data field */
 -	unsigned int	b_length)	/*!< in: data field length,
 -					not UNIV_SQL_NULL */
 +/**********************************************************************//**
 +Checks which fields have changed in a row and stores information
 +of them to an update vector.
 +@return	DB_SUCCESS or error code */
 +static
 +dberr_t
 +calc_row_difference(
 +/*================*/
 +	upd_t*		uvect,		/*!< in/out: update vector */
 +	uchar*		old_row,	/*!< in: old row in MySQL format */
 +	uchar*		new_row,	/*!< in: new row in MySQL format */
 +	TABLE*		table,		/*!< in: table in MySQL data
 +					dictionary */
 +	uchar*		upd_buff,	/*!< in: buffer to use */
 +	ulint		buff_len,	/*!< in: buffer length */
 +	row_prebuilt_t*	prebuilt,	/*!< in: InnoDB prebuilt struct */
 +	THD*		thd)		/*!< in: user thread */
  {
 -	CHARSET_INFO*		charset;
 -	enum_field_types	mysql_tp;
 -	int			ret;
 +	uchar*		original_upd_buff = upd_buff;
 +	Field*		field;
 +	enum_field_types field_mysql_type;
 +	uint		n_fields;
 +	ulint		o_len;
 +	ulint		n_len;
 +	ulint		col_pack_len;
 +	const byte*	new_mysql_row_col;
 +	const byte*	o_ptr;
 +	const byte*	n_ptr;
 +	byte*		buf;
 +	upd_field_t*	ufield;
 +	ulint		col_type;
 +	ulint		n_changed = 0;
 +	dfield_t	dfield;
 +	dict_index_t*	clust_index;
 +        uint		sql_idx, innodb_idx= 0;
 +	ibool		changes_fts_column = FALSE;
 +	ibool		changes_fts_doc_col = FALSE;
 +	trx_t*          trx = thd_to_trx(thd);
 +	doc_id_t	doc_id = FTS_NULL_DOC_ID;
  
 -	DBUG_ASSERT(a_length != UNIV_SQL_NULL);
 -	DBUG_ASSERT(b_length != UNIV_SQL_NULL);
 +	ut_ad(!srv_read_only_mode);
  
 -	mysql_tp = (enum_field_types) mysql_type;
 +	n_fields = table->s->fields;
 +	clust_index = dict_table_get_first_index(prebuilt->table);
  
 -	switch (mysql_tp) {
 +	/* We use upd_buff to convert changed fields */
 +	buf = (byte*) upd_buff;
  
 -	case MYSQL_TYPE_BIT:
 -	case MYSQL_TYPE_STRING:
 -	case MYSQL_TYPE_VAR_STRING:
 -	case MYSQL_TYPE_TINY_BLOB:
 -	case MYSQL_TYPE_MEDIUM_BLOB:
 -	case MYSQL_TYPE_BLOB:
 -	case MYSQL_TYPE_LONG_BLOB:
 -	case MYSQL_TYPE_VARCHAR:
 -		/* Use the charset number to pick the right charset struct for
 -		the comparison. Since the MySQL function get_charset may be
 -		slow before Bar removes the mutex operation there, we first
 -		look at 2 common charsets directly. */
 +	for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
 +		field = table->field[sql_idx];
 +                if (!field->stored_in_db)
 +		  continue;
  
 -		if (charset_number == default_charset_info->number) {
 -			charset = default_charset_info;
 -		} else if (charset_number == my_charset_latin1.number) {
 -			charset = &my_charset_latin1;
 -		} else {
 -			charset = get_charset(charset_number, MYF(MY_WME));
 +		o_ptr = (const byte*) old_row + get_field_offset(table, field);
 +		n_ptr = (const byte*) new_row + get_field_offset(table, field);
  
 -			if (charset == NULL) {
 -			  sql_print_error("InnoDB needs charset %lu for doing "
 -					  "a comparison, but MySQL cannot "
 -					  "find that charset.",
 -					  (ulong) charset_number);
 -				ut_a(0);
 -			}
 -		}
 +		/* Use new_mysql_row_col and col_pack_len save the values */
  
 -		/* Starting from 4.1.3, we use strnncollsp() in comparisons of
 -		non-latin1_swedish_ci strings. NOTE that the collation order
 -		changes then: 'b\0\0...' is ordered BEFORE 'b  ...'. Users
 -		having indexes on such data need to rebuild their tables! */
 +		new_mysql_row_col = n_ptr;
 +		col_pack_len = field->pack_length();
  
 -		ret = charset->coll->strnncollsp(charset,
 -				  a, a_length,
 -						 b, b_length, 0);
 -		if (ret < 0) {
 -			return(-1);
 -		} else if (ret > 0) {
 -			return(1);
 -		} else {
 -			return(0);
 -		}
 -	default:
 -		ut_error;
 -	}
 +		o_len = col_pack_len;
 +		n_len = col_pack_len;
  
 -	return(0);
 -}
 -#ifdef WITH_WSREP
 -extern "C" UNIV_INTERN
 -int
 -wsrep_innobase_mysql_sort(
 -/*===============*/
 -					/* out: str contains sort string */
 -	int		mysql_type,	/* in: MySQL type */
 -	uint		charset_number,	/* in: number of the charset */
 -	unsigned char*	str,		/* in: data field */
 -	unsigned int	str_length,	/* in: data field length,
 -					not UNIV_SQL_NULL */
 -	unsigned int	buf_length)	/* in: total str buffer length */
 +		/* We use o_ptr and n_ptr to dig up the actual data for
 +		comparison. */
  
 -{
 -	CHARSET_INFO*		charset;
 -	enum_field_types	mysql_tp;
 -	int ret_length =	str_length;
 +		field_mysql_type = field->type();
  
 -	DBUG_ASSERT(str_length != UNIV_SQL_NULL);
 +		col_type = prebuilt->table->cols[innodb_idx].mtype;
  
 -	mysql_tp = (enum_field_types) mysql_type;
 +		switch (col_type) {
  
 -	switch (mysql_tp) {
 +		case DATA_BLOB:
 +			o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
 +			n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
  
 -	case MYSQL_TYPE_BIT:
 -	case MYSQL_TYPE_STRING:
 -	case MYSQL_TYPE_VAR_STRING:
 -	case MYSQL_TYPE_TINY_BLOB:
 -	case MYSQL_TYPE_MEDIUM_BLOB:
 -	case MYSQL_TYPE_BLOB:
 -	case MYSQL_TYPE_LONG_BLOB:
 -	case MYSQL_TYPE_VARCHAR:
 -	{
 -		uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN];
 -		uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
 +			break;
  
 -		/* Use the charset number to pick the right charset struct for
 -		the comparison. Since the MySQL function get_charset may be
 -		slow before Bar removes the mutex operation there, we first
 -		look at 2 common charsets directly. */
 +		case DATA_VARCHAR:
 +		case DATA_BINARY:
 +		case DATA_VARMYSQL:
 +			if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
 +				/* This is a >= 5.0.3 type true VARCHAR where
 +				the real payload data length is stored in
 +				1 or 2 bytes */
  
 -		if (charset_number == default_charset_info->number) {
 -			charset = default_charset_info;
 -		} else if (charset_number == my_charset_latin1.number) {
 -			charset = &my_charset_latin1;
 -		} else {
 -			charset = get_charset(charset_number, MYF(MY_WME));
 +				o_ptr = row_mysql_read_true_varchar(
 +					&o_len, o_ptr,
 +					(ulint)
 +					(((Field_varstring*) field)->length_bytes));
 +
 +				n_ptr = row_mysql_read_true_varchar(
 +					&n_len, n_ptr,
 +					(ulint)
 +					(((Field_varstring*) field)->length_bytes));
 +			}
 +
 +			break;
 +		default:
 +			;
 +		}
  
 -			if (charset == NULL) {
 -			  sql_print_error("InnoDB needs charset %lu for doing "
 -					  "a comparison, but MySQL cannot "
 -					  "find that charset.",
 -					  (ulong) charset_number);
 -				ut_a(0);
 +		if (field_mysql_type == MYSQL_TYPE_LONGLONG
 +		    && prebuilt->table->fts
 +		    && innobase_strcasecmp(
 +			field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
 +			doc_id = (doc_id_t) mach_read_from_n_little_endian(
 +				n_ptr, 8);
 +			if (doc_id == 0) {
 +				return(DB_FTS_INVALID_DOCID);
  			}
  		}
  
@@@ -13725,277 -9359,313 +13727,278 @@@ ha_innobase::get_parent_foreign_key_lis
  
  	trx_search_latch_release_if_reserved(prebuilt->trx);
  
 -	ib_table = prebuilt->table;
 -
 -	if (flag & HA_STATUS_TIME) {
 -		if (called_from_analyze || innobase_stats_on_metadata) {
 -			/* In sql_show we call with this flag: update
 -			then statistics so that they are up-to-date */
 -
 -			prebuilt->trx->op_info = "updating table statistics";
 +	mutex_enter(&(dict_sys->mutex));
  
 -			DEBUG_SYNC_C("info_before_stats_update");
 +	for (dict_foreign_set::iterator it
 +		= prebuilt->table->referenced_set.begin();
 +	     it != prebuilt->table->referenced_set.end();
 +	     ++it) {
  
 -			dict_update_statistics(
 -				ib_table,
 -				FALSE, /* update even if initialized */
 -				FALSE /* update even if not changed too much */);
 +		foreign = *it;
  
 -			prebuilt->trx->op_info = "returning various info to MySQL";
 +		pf_key_info = get_foreign_key_info(thd, foreign);
 +		if (pf_key_info) {
 +			f_key_list->push_back(pf_key_info);
  		}
 -
  	}
  
 -	if (flag & HA_STATUS_VARIABLE) {
 -
 -		ulint	page_size;
 -
 -		dict_table_stats_lock(ib_table, RW_S_LATCH);
 +	mutex_exit(&(dict_sys->mutex));
  
 -		n_rows = ib_table->stat_n_rows;
 +	prebuilt->trx->op_info = "";
  
 -		/* Because we do not protect stat_n_rows by any mutex in a
 -		delete, it is theoretically possible that the value can be
 -		smaller than zero! TODO: fix this race.
 +	return(0);
 +}
  
 -		The MySQL optimizer seems to assume in a left join that n_rows
 -		is an accurate estimate if it is zero. Of course, it is not,
 -		since we do not have any locks on the rows yet at this phase.
 -		Since SHOW TABLE STATUS seems to call this function with the
 -		HA_STATUS_TIME flag set, while the left join optimizer does not
 -		set that flag, we add one to a zero value if the flag is not
 -		set. That way SHOW TABLE STATUS will show the best estimate,
 -		while the optimizer never sees the table empty. */
 +/*****************************************************************//**
 +Checks if ALTER TABLE may change the storage engine of the table.
 +Changing storage engines is not allowed for tables for which there
 +are foreign key constraints (parent or child tables).
 +@return	TRUE if can switch engines */
 +UNIV_INTERN
 +bool
 +ha_innobase::can_switch_engines(void)
 +/*=================================*/
 +{
 +	bool	can_switch;
  
 -		if (n_rows < 0) {
 -			n_rows = 0;
 -		}
 +	DBUG_ENTER("ha_innobase::can_switch_engines");
 +	update_thd();
  
 -		if (n_rows == 0 && !(flag & HA_STATUS_TIME)) {
 -			n_rows++;
 -		}
 +	prebuilt->trx->op_info =
 +			"determining if there are foreign key constraints";
 +	row_mysql_freeze_data_dictionary(prebuilt->trx);
  
 -		/* Fix bug#40386: Not flushing query cache after truncate.
 -		n_rows can not be 0 unless the table is empty, set to 1
 -		instead. The original problem of bug#29507 is actually
 -		fixed in the server code. */
 -		if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) {
 +	can_switch = prebuilt->table->referenced_set.empty()
 +		&& prebuilt->table->foreign_set.empty();
  
 -			n_rows = 1;
 +	row_mysql_unfreeze_data_dictionary(prebuilt->trx);
 +	prebuilt->trx->op_info = "";
  
 -			/* We need to reset the prebuilt value too, otherwise
 -			checks for values greater than the last value written
 -			to the table will fail and the autoinc counter will
 -			not be updated. This will force write_row() into
 -			attempting an update of the table's AUTOINC counter. */
 +	DBUG_RETURN(can_switch);
 +}
  
 -			prebuilt->autoinc_last_value = 0;
 -		}
 +/*******************************************************************//**
 +Checks if a table is referenced by a foreign key. The MySQL manual states that
 +a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a
 +delete is then allowed internally to resolve a duplicate key conflict in
 +REPLACE, not an update.
 +@return	> 0 if referenced by a FOREIGN KEY */
 +UNIV_INTERN
 +uint
 +ha_innobase::referenced_by_foreign_key(void)
 +/*========================================*/
 +{
 +	if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) {
  
 -		page_size = dict_table_zip_size(ib_table);
 -		if (page_size == 0) {
 -			page_size = UNIV_PAGE_SIZE;
 -		}
 +		return(1);
 +	}
  
 -		stats.records = (ha_rows)n_rows;
 -		stats.deleted = 0;
 -		stats.data_file_length
 -			= ((ulonglong) ib_table->stat_clustered_index_size)
 -			* page_size;
 -		stats.index_file_length =
 -			((ulonglong) ib_table->stat_sum_of_other_index_sizes)
 -			* page_size;
 +	return(0);
 +}
  
 -		dict_table_stats_unlock(ib_table, RW_S_LATCH);
 +/*******************************************************************//**
 +Frees the foreign key create info for a table stored in InnoDB, if it is
 +non-NULL. */
 +UNIV_INTERN
 +void
 +ha_innobase::free_foreign_key_create_info(
 +/*======================================*/
 +	char*	str)	/*!< in, own: create info string to free */
 +{
 +	if (str) {
 +		my_free(str);
 +	}
 +}
  
 -		/* Since fsp_get_available_space_in_free_extents() is
 -		acquiring latches inside InnoDB, we do not call it if we
 -		are asked by MySQL to avoid locking. Another reason to
 -		avoid the call is that it uses quite a lot of CPU.
 -		See Bug#38185. */
 -		if (flag & HA_STATUS_NO_LOCK
 -		    || !(flag & HA_STATUS_VARIABLE_EXTRA)) {
 -			/* We do not update delete_length if no
 -			locking is requested so the "old" value can
 -			remain. delete_length is initialized to 0 in
 -			the ha_statistics' constructor. Also we only
 -			need delete_length to be set when
 -			HA_STATUS_VARIABLE_EXTRA is set */
 -		} else if (UNIV_UNLIKELY
 -			   (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) {
 -			/* Avoid accessing the tablespace if
 -			innodb_crash_recovery is set to a high value. */
 -			stats.delete_length = 0;
 -		} else {
 -			ullint	avail_space;
 +/*******************************************************************//**
 +Tells something additional to the handler about how to do things.
 +@return	0 or error number */
 +UNIV_INTERN
 +int
 +ha_innobase::extra(
 +/*===============*/
 +	enum ha_extra_function operation)
 +			   /*!< in: HA_EXTRA_FLUSH or some other flag */
 +{
 +	check_trx_exists(ha_thd());
  
 -			avail_space = fsp_get_available_space_in_free_extents(
 -				ib_table->space);
 +	/* Warning: since it is not sure that MySQL calls external_lock
 +	before calling this function, the trx field in prebuilt can be
 +	obsolete! */
  
 -			if (avail_space == ULLINT_UNDEFINED) {
 -				THD*	thd;
 +	switch (operation) {
 +	case HA_EXTRA_FLUSH:
 +		if (prebuilt->blob_heap) {
 +			row_mysql_prebuilt_free_blob_heap(prebuilt);
 +		}
 +		break;
 +	case HA_EXTRA_RESET_STATE:
 +		reset_template();
 +		thd_to_trx(ha_thd())->duplicates = 0;
 +		break;
 +	case HA_EXTRA_NO_KEYREAD:
 +		prebuilt->read_just_key = 0;
 +		break;
 +	case HA_EXTRA_KEYREAD:
 +		prebuilt->read_just_key = 1;
 +		break;
 +	case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
 +		prebuilt->keep_other_fields_on_keyread = 1;
 +		break;
  
 -				thd = ha_thd();
 +		/* IMPORTANT: prebuilt->trx can be obsolete in
 +		this method, because it is not sure that MySQL
 +		calls external_lock before this method with the
 +		parameters below.  We must not invoke update_thd()
 +		either, because the calling threads may change.
 +		CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
 +	case HA_EXTRA_INSERT_WITH_UPDATE:
 +		thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
 +		break;
 +	case HA_EXTRA_NO_IGNORE_DUP_KEY:
 +		thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
 +		break;
 +	case HA_EXTRA_WRITE_CAN_REPLACE:
 +		thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
 +		break;
 +	case HA_EXTRA_WRITE_CANNOT_REPLACE:
 +		thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
 +		break;
 +	default:/* Do nothing */
 +		;
 +	}
  
 -				push_warning_printf(
 -					thd,
 -					MYSQL_ERROR::WARN_LEVEL_WARN,
 -					ER_CANT_GET_STAT,
 -					"InnoDB: Trying to get the free "
 -					"space for table %s but its "
 -					"tablespace has been discarded or "
 -					"the .ibd file is missing. Setting "
 -					"the free space to zero.",
 -					ib_table->name);
 +	return(0);
 +}
  
 -				stats.delete_length = 0;
 -			} else {
 -				stats.delete_length = avail_space * 1024;
 -			}
 -		}
 +/******************************************************************//**
 +*/
 +UNIV_INTERN
 +int
 +ha_innobase::reset()
 +/*================*/
 +{
 +	if (prebuilt->blob_heap) {
 +		row_mysql_prebuilt_free_blob_heap(prebuilt);
 +	}
  
 -		stats.check_time = 0;
 -                stats.mrr_length_per_rec= ref_length +  8; // 8 = max(sizeof(void *));
 +	reset_template();
 +	ds_mrr.dsmrr_close();
  
 +	/* TODO: This should really be reset in reset_template() but for now
 +	it's safer to do it explicitly here. */
  
 -		if (stats.records == 0) {
 -			stats.mean_rec_length = 0;
 -		} else {
 -			stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records);
 -		}
 -	}
 +	/* This is a statement level counter. */
 +	prebuilt->autoinc_last_value = 0;
  
 -	if (flag & HA_STATUS_CONST) {
 -		ulong	i;
 -		/* Verify the number of index in InnoDB and MySQL
 -		matches up. If prebuilt->clust_index_was_generated
 -		holds, InnoDB defines GEN_CLUST_INDEX internally */
 -		ulint	num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
 -					- prebuilt->clust_index_was_generated;
 +	return(0);
 +}
  
 -		if (table->s->keys != num_innodb_index) {
 -			sql_print_error("Table %s contains %lu "
 -					"indexes inside InnoDB, which "
 -					"is different from the number of "
 -					"indexes %u defined in the MySQL ",
 -					ib_table->name, num_innodb_index,
 -					table->s->keys);
 -		}
 +/******************************************************************//**
 +MySQL calls this function at the start of each SQL statement inside LOCK
 +TABLES. Inside LOCK TABLES the ::external_lock method does not work to
 +mark SQL statement borders. Note also a special case: if a temporary table
 +is created inside LOCK TABLES, MySQL has not called external_lock() at all
 +on that table.
 +MySQL-5.0 also calls this before each statement in an execution of a stored
 +procedure. To make the execution more deterministic for binlogging, MySQL-5.0
 +locks all tables involved in a stored procedure with full explicit table
 +locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
 +procedure.
 +@return	0 or error code */
 +UNIV_INTERN
 +int
 +ha_innobase::start_stmt(
 +/*====================*/
 +	THD*		thd,	/*!< in: handle to the user thread */
 +	thr_lock_type	lock_type)
 +{
 +	trx_t*		trx;
 +	DBUG_ENTER("ha_innobase::start_stmt");
  
 -		dict_table_stats_lock(ib_table, RW_S_LATCH);
 +	update_thd(thd);
  
 -		for (i = 0; i < table->s->keys; i++) {
 -			ulong	j;
 -                        rec_per_key = 1;
 -			/* We could get index quickly through internal
 -			index mapping with the index translation table.
 -			The identity of index (match up index name with
 -			that of table->key_info[i]) is already verified in
 -			innobase_get_index().  */
 -			index = innobase_get_index(i);
 +	trx = prebuilt->trx;
  
 -			if (index == NULL) {
 -				sql_print_error("Table %s contains fewer "
 -						"indexes inside InnoDB than "
 -						"are defined in the MySQL "
 -						".frm file. Have you mixed up "
 -						".frm files from different "
 -						"installations? See "
 -						REFMAN
 -						"innodb-troubleshooting.html\n",
 -						ib_table->name);
 -				break;
 -			}
 +	/* Here we release the search latch and the InnoDB thread FIFO ticket
 +	if they were reserved. They should have been released already at the
 +	end of the previous statement, but because inside LOCK TABLES the
 +	lock count method does not work to mark the end of a SELECT statement,
 +	that may not be the case. We MUST release the search latch before an
 +	INSERT, for example. */
  
 -			for (j = 0; j < table->key_info[i].key_parts; j++) {
 +	trx_search_latch_release_if_reserved(trx);
  
 -				if (j + 1 > index->n_uniq) {
 -					sql_print_error(
 -"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking "
 -"statistics for %lu columns. Have you mixed up .frm files from different "
 -"installations? "
 -"See " REFMAN "innodb-troubleshooting.html\n",
 -							index->name,
 -							ib_table->name,
 -							(unsigned long)
 -							index->n_uniq, j + 1);
 -					break;
 -				}
 +	innobase_srv_conc_force_exit_innodb(trx);
  
 -				rec_per_key = innodb_rec_per_key(
 -					index, j, stats.records);
 +	/* Reset the AUTOINC statement level counter for multi-row INSERTs. */
 +	trx->n_autoinc_rows = 0;
  
 -				/* Since MySQL seems to favor table scans
 -				too much over index searches, we pretend
 -				index selectivity is 2 times better than
 -				our estimate: */
 +	prebuilt->sql_stat_start = TRUE;
 +	prebuilt->hint_need_to_fetch_extra_cols = 0;
 +	reset_template();
  
 -				rec_per_key = rec_per_key / 2;
 +	if (dict_table_is_temporary(prebuilt->table)
 +	    && prebuilt->mysql_has_locked
 +	    && prebuilt->select_lock_type == LOCK_NONE) {
 +		dberr_t error;
  
 -				if (rec_per_key == 0) {
 -					rec_per_key = 1;
 -				}
 +		switch (thd_sql_command(thd)) {
 +		case SQLCOM_INSERT:
 +		case SQLCOM_UPDATE:
 +		case SQLCOM_DELETE:
++		case SQLCOM_REPLACE:
 +			init_table_handle_for_HANDLER();
 +			prebuilt->select_lock_type = LOCK_X;
 +			prebuilt->stored_select_lock_type = LOCK_X;
 +			error = row_lock_table_for_mysql(prebuilt, NULL, 1);
  
 -				table->key_info[i].rec_per_key[j]=
 -				  rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
 -				  (ulong) rec_per_key;
 +			if (error != DB_SUCCESS) {
 +				int st = convert_error_code_to_mysql(
 +					error, 0, thd);
 +				DBUG_RETURN(st);
  			}
 -
 -                        KEY *key_info= table->key_info+i; 
 -                        key_part_map ext_key_part_map=
 -                                             key_info->ext_key_part_map;                               
 -
 -                        if (key_info->key_parts != key_info->ext_key_parts) {
 -
 -                                KEY *pk_key_info= key_info+
 -                                                  table->s->primary_key;
 -                                uint k = key_info->key_parts;
 -                                ha_rows k_rec_per_key = rec_per_key;
 -                                uint pk_parts = pk_key_info->key_parts;
 -                          
 -		                index= innobase_get_index(
 -                                        table->s->primary_key);
 -                                
 -                                n_rows= ib_table->stat_n_rows;
 -    
 -                                for (j = 0; j < pk_parts; j++) {
 - 
 -				         if (ext_key_part_map & 1<<j) {
 -
 -                                                rec_per_key =
 -						innodb_rec_per_key(index,
 -                                                        j, stats.records);
 -                               
 -				                if (rec_per_key == 0) {
 -					                rec_per_key = 1;
 -				                }
 -                                                else if (rec_per_key > 1) {
 -                                                        rec_per_key =
 -                                                        (ha_rows)
 -                                                          (k_rec_per_key *
 -						          (double)rec_per_key /
 -                                                           n_rows);
 -						}
 -                                                
 -				                key_info->rec_per_key[k++]=
 -				                rec_per_key >= ~(ulong) 0 ?
 -                                                ~(ulong) 0 :
 -                                                (ulong) rec_per_key;
 -
 -					} 
 -				}
 -			}                                         
 +			break;
  		}
 +	}
  
 -		dict_table_stats_unlock(ib_table, RW_S_LATCH);
 -
 -		my_snprintf(path, sizeof(path), "%s/%s%s",
 -			    mysql_data_home,
 -			    table->s->normalized_path.str,
 -			    reg_ext);
 +	if (!prebuilt->mysql_has_locked) {
 +		/* This handle is for a temporary table created inside
 +		this same LOCK TABLES; since MySQL does NOT call external_lock
 +		in this case, we must use x-row locks inside InnoDB to be
 +		prepared for an update of a row */
  
 -		unpack_filename(path,path);
 +		prebuilt->select_lock_type = LOCK_X;
  
 -		/* Note that we do not know the access time of the table,
 -		nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
 +	} else if (trx->isolation_level != TRX_ISO_SERIALIZABLE
 +		   && thd_sql_command(thd) == SQLCOM_SELECT
 +		   && lock_type == TL_READ) {
  
 -		if (os_file_get_status(path,&stat_info)) {
 -			stats.create_time = (ulong) stat_info.ctime;
 -		}
 -	}
 +		/* For other than temporary tables, we obtain
 +		no lock for consistent read (plain SELECT). */
  
 -	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
 +		prebuilt->select_lock_type = LOCK_NONE;
 +	} else {
 +		/* Not a consistent read: restore the
 +		select_lock_type value. The value of
 +		stored_select_lock_type was decided in:
 +		1) ::store_lock(),
 +		2) ::external_lock(),
 +		3) ::init_table_handle_for_HANDLER(), and
 +		4) ::transactional_table_lock(). */
  
 -		goto func_exit;
 +		ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET);
 +		prebuilt->select_lock_type = prebuilt->stored_select_lock_type;
  	}
  
 -	if (flag & HA_STATUS_ERRKEY) {
 -		const dict_index_t*	err_index;
 -
 -		ut_a(prebuilt->trx);
 -		ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
 -
 -		err_index = trx_get_error_info(prebuilt->trx);
 +	*trx->detailed_error = 0;
  
 -		if (err_index) {
 -			errkey = innobase_get_mysql_key_number_for_index(
 -					share, table, ib_table, err_index);
 -		} else {
 -			errkey = (unsigned int) prebuilt->trx->error_key_num;
 -		}
 -	}
 +	innobase_register_trx(ht, thd, trx);
  
 -	if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
 -		stats.auto_increment_value = innobase_peek_autoinc();
 +	if (!trx_is_started(trx)) {
 +		++trx->will_lock;
  	}
  
 -func_exit:
 -	prebuilt->trx->op_info = (char*)"";
 -
  	DBUG_RETURN(0);
  }
  
@@@ -17689,22 -12815,15 +17692,24 @@@ wsrep_innobase_kill_one_trx
   		    wsrep_thd_thread_id(thd),
  		    victim_trx->id);
  
- 	WSREP_DEBUG("Aborting query: %s",
- 		  (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void");
+ 	WSREP_DEBUG("Aborting query: %s conf %d trx: %lu",
+ 		    (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void",
+ 		    wsrep_thd_conflict_state(thd),
+ 		    wsrep_thd_ws_handle(thd)->trx_id);
  
  	wsrep_thd_LOCK(thd);
 +        DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock",
 +                 {
 +                   const char act[]=
 +                     "now "
 +                     "wait_for signal.wsrep_after_BF_victim_lock";
 +                   DBUG_ASSERT(!debug_sync_set_action(bf_thd,
 +                                                      STRING_WITH_LEN(act)));
 +                 };);
 +
  
  	if (wsrep_thd_query_state(thd) == QUERY_EXITING) {
 -		WSREP_DEBUG("kill trx EXITING for %llu", victim_trx->id);
 +		WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id);
  		wsrep_thd_UNLOCK(thd);
  		DBUG_RETURN(0);
  	}
@@@ -17751,15 -12870,13 +17756,15 @@@
  		} else {
  			rcode = wsrep->abort_pre_commit(
  				wsrep, bf_seqno,
- 				(wsrep_trx_id_t)victim_trx->id
+ 				(wsrep_trx_id_t)wsrep_thd_ws_handle(thd)->trx_id
  			);
 +
  			switch (rcode) {
  			case WSREP_WARNING:
 -				WSREP_DEBUG("cancel commit warning: %llu",
 +				WSREP_DEBUG("cancel commit warning: %lu",
  					    victim_trx->id);
  				wsrep_thd_UNLOCK(thd);
 +				wsrep_thd_awake(thd, signal);
  				DBUG_RETURN(1);
  				break;
  			case WSREP_OK:
@@@ -17871,40 -12988,34 +17876,42 @@@
  		break;
  	}
  	default:
 -		WSREP_WARN("bad wsrep query state: %d", 
 +		WSREP_WARN("bad wsrep query state: %d",
  			  wsrep_thd_query_state(thd));
 +		wsrep_thd_UNLOCK(thd);
  		break;
  	}
 -	wsrep_thd_UNLOCK(thd);
 -     
 +
  	DBUG_RETURN(0);
  }
 -static int 
 -wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, 
 -			my_bool signal)
 +
 +static
 +int
 +wsrep_abort_transaction(
 +	handlerton* hton,
 +	THD *bf_thd,
 +	THD *victim_thd,
 +	my_bool signal)
  {
  	DBUG_ENTER("wsrep_innobase_abort_thd");
- 	trx_t* victim_trx = thd_to_trx(victim_thd);
- 	trx_t* bf_trx     = (bf_thd) ? thd_to_trx(bf_thd) : NULL;
+ 	
+ 	trx_t* victim_trx	= thd_to_trx(victim_thd);
+ 	trx_t* bf_trx		= (bf_thd) ? thd_to_trx(bf_thd) : NULL;
  
- 	WSREP_DEBUG("abort transaction: BF: %s victim: %s",
- 		    wsrep_thd_query(bf_thd),
- 		    wsrep_thd_query(victim_thd));
+ 	WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d",
+ 			wsrep_thd_query(bf_thd),
+ 			wsrep_thd_query(victim_thd),
+ 			wsrep_thd_conflict_state(victim_thd));
  
 -	ut_ad(!mutex_own(&kernel_mutex));
 -
 -	if (victim_trx)
 -	{
 -		int rcode = wsrep_innobase_kill_one_trx(
 -			bf_thd, bf_trx, victim_trx, signal, FALSE);
 +	if (victim_trx) {
 +		lock_mutex_enter();
 +		trx_mutex_enter(victim_trx);
 +		victim_trx->abort_type = TRX_WSREP_ABORT;
 +		int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx,
 +                                                        victim_trx, signal);
 +		trx_mutex_exit(victim_trx);
 +		lock_mutex_exit();
 +		victim_trx->abort_type = TRX_SERVER_ABORT;
  		wsrep_srv_conc_cancel_wait(victim_trx);
  		DBUG_RETURN(rcode);
  	} else {
diff --cc storage/innobase/os/os0file.cc
index df096dcc6fd,00000000000..d4b8e82b0d8
mode 100644,000000..100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@@ -1,5785 -1,0 +1,5785 @@@
 +/***********************************************************************
 +
 +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
 +Copyright (c) 2009, Percona Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2018, MariaDB Corporation.
 +
 +Portions of this file contain modifications contributed and copyrighted
 +by Percona Inc.. Those modifications are
 +gratefully acknowledged and are described briefly in the InnoDB
 +documentation. The contributions by Percona Inc. are incorporated with
 +their permission, and subject to the conditions contained in the file
 +COPYING.Percona.
 +
 +This program is free software; you can redistribute it and/or modify it
 +under the terms of the GNU General Public License as published by the
 +Free Software Foundation; version 2 of the License.
 +
 +This program is distributed in the hope that it will be useful, but
 +WITHOUT ANY WARRANTY; without even the implied warranty of
 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 +Public License for more details.
 +
 +You should have received a copy of the GNU General Public License along with
 +this program; if not, write to the Free Software Foundation, Inc.,
 +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 +
 +***********************************************************************/
 +
 +/**************************************************//**
 +@file os/os0file.cc
 +The interface to the operating system file i/o primitives
 +
 +Created 10/21/1995 Heikki Tuuri
 +*******************************************************/
 +
 +#include "os0file.h"
 +
 +#ifdef UNIV_NONINL
 +#include "os0file.ic"
 +#endif
 +
 +#include "ut0mem.h"
 +#include "srv0srv.h"
 +#include "srv0start.h"
 +#include "fil0fil.h"
 +#include "buf0buf.h"
 +#include "srv0mon.h"
 +#ifndef UNIV_HOTBACKUP
 +# include "os0sync.h"
 +# include "os0thread.h"
 +#else /* !UNIV_HOTBACKUP */
 +# ifdef __WIN__
 +/* Add includes for the _stat() call to compile on Windows */
 +#  include <sys/types.h>
 +#  include <sys/stat.h>
 +#  include <errno.h>
 +# endif /* __WIN__ */
 +#endif /* !UNIV_HOTBACKUP */
 +
 +#if defined(LINUX_NATIVE_AIO)
 +#include <libaio.h>
 +#endif
 +
 +/** Insert buffer segment id */
 +static const ulint IO_IBUF_SEGMENT = 0;
 +
 +/** Log segment id */
 +static const ulint IO_LOG_SEGMENT = 1;
 +
 +/* This specifies the file permissions InnoDB uses when it creates files in
 +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
 +my_umask */
 +
 +#ifndef __WIN__
 +/** Umask for creating files */
 +UNIV_INTERN ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 +#else
 +/** Umask for creating files */
 +UNIV_INTERN ulint	os_innodb_umask	= 0;
 +#endif /* __WIN__ */
 +
 +#ifndef UNIV_HOTBACKUP
 +/* We use these mutexes to protect lseek + file i/o operation, if the
 +OS does not provide an atomic pread or pwrite, or similar */
 +#define OS_FILE_N_SEEK_MUTEXES	16
 +UNIV_INTERN os_ib_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
 +
 +/* In simulated aio, merge at most this many consecutive i/os */
 +#define OS_AIO_MERGE_N_CONSECUTIVE	64
 +
 +#ifdef WITH_INNODB_DISALLOW_WRITES
 +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
 +#else
 +#define WAIT_ALLOW_WRITES() do { } while (0)
 +#endif /* WITH_INNODB_DISALLOW_WRITES */
 +
 +/**********************************************************************
 +
 +InnoDB AIO Implementation:
 +=========================
 +
 +We support native AIO for Windows and Linux. For the rest of the platforms
 +we simulate AIO by special io-threads servicing the IO-requests.
 +
 +Simulated AIO:
 +==============
 +
 +On platforms where we 'simulate' AIO, the following is a rough explanation
 +of the high-level design.
 +There are four io-threads (for ibuf, log, read, write).
 +All synchronous IO requests are serviced by the calling thread using
 +os_file_write/os_file_read. The Asynchronous requests are queued up
 +in an array (there are four such arrays) by the calling thread.
 +Later these requests are picked up by the io-thread and are serviced
 +synchronously.
 +
 +Windows native AIO:
 +==================
 +
 +If srv_use_native_aio is not set then Windows follows the same
 +code as simulated AIO. If the flag is set then the native AIO interface
 +is used. On Windows, one of the limitations is that if a file is opened
 +for AIO, no synchronous IO can be done on it. Therefore we have an
 +extra fifth array to queue up synchronous IO requests.
 +There are innodb_file_io_threads helper threads. These threads work
 +on the four arrays mentioned above in Simulated AIO. No thread is
 +required for the sync array.
 +If a synchronous IO request is made, it is first queued in the sync
 +array. Then the calling thread itself waits on the request, thus
 +making the call synchronous.
 +If an AIO request is made the calling thread not only queues it in the
 +array but also submits the requests. The helper thread then collects
 +the completed IO request and calls completion routine on it.
 +
 +Linux native AIO:
 +=================
 +
 +If we have libaio installed on the system and innodb_use_native_aio
 +is set to TRUE we follow the code path of native AIO, otherwise we
 +do simulated AIO.
 +There are innodb_file_io_threads helper threads. These threads work
 +on the four arrays mentioned above in Simulated AIO.
 +If a synchronous IO request is made, it is handled by calling
 +os_file_write/os_file_read.
 +If an AIO request is made the calling thread not only queues it in the
 +array but also submits the requests. The helper thread then collects
 +the completed IO request and calls completion routine on it.
 +
 +**********************************************************************/
 +
 +/** Flag: enable debug printout for asynchronous i/o */
 +UNIV_INTERN ibool	os_aio_print_debug	= FALSE;
 +
 +#ifdef UNIV_PFS_IO
 +/* Keys to register InnoDB I/O with performance schema */
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
 +#endif /* UNIV_PFS_IO */
 +
 +/** The asynchronous i/o array slot structure */
 +struct os_aio_slot_t{
 +	ibool		is_read;	/*!< TRUE if a read operation */
 +	ulint		pos;		/*!< index of the slot in the aio
 +					array */
 +	ibool		reserved;	/*!< TRUE if this slot is reserved */
 +	time_t		reservation_time;/*!< time when reserved */
 +	ulint		len;		/*!< length of the block to read or
 +					write */
 +	byte*		buf;		/*!< buffer used in i/o */
 +	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
 +	os_offset_t	offset;		/*!< file offset in bytes */
 +	pfs_os_file_t	file;		/*!< file where to read or write */
 +	const char*	name;		/*!< file name or path */
 +	ibool		io_already_done;/*!< used only in simulated aio:
 +					TRUE if the physical i/o already
 +					made and only the slot message
 +					needs to be passed to the caller
 +					of os_aio_simulated_handle */
 +	fil_node_t*	message1;	/*!< message which is given by the */
 +	void*		message2;	/*!< the requester of an aio operation
 +					and which can be used to identify
 +					which pending aio operation was
 +					completed */
 +#ifdef WIN_ASYNC_IO
 +	HANDLE		handle;		/*!< handle object we need in the
 +					OVERLAPPED struct */
 +	OVERLAPPED	control;	/*!< Windows control block for the
 +					aio request */
 +#elif defined(LINUX_NATIVE_AIO)
 +	struct iocb	control;	/* Linux control block for aio */
 +	int		n_bytes;	/* bytes written/read. */
 +	int		ret;		/* AIO return code */
 +#endif /* WIN_ASYNC_IO */
 +};
 +
 +/** The asynchronous i/o array structure */
 +struct os_aio_array_t{
 +	os_ib_mutex_t	mutex;	/*!< the mutex protecting the aio array */
 +	os_event_t	not_full;
 +				/*!< The event which is set to the
 +				signaled state when there is space in
 +				the aio outside the ibuf segment;
 +				os_event_set() and os_event_reset()
 +				are protected by os_aio_array_t::mutex */
 +	os_event_t	is_empty;
 +				/*!< The event which is set to the
 +				signaled state when there are no
 +				pending i/os in this array;
 +				os_event_set() and os_event_reset()
 +				are protected by os_aio_array_t::mutex */
 +	ulint		n_slots;/*!< Total number of slots in the aio
 +				array.  This must be divisible by
 +				n_threads. */
 +	ulint		n_segments;
 +				/*!< Number of segments in the aio
 +				array of pending aio requests. A
 +				thread can wait separately for any one
 +				of the segments. */
 +	ulint		cur_seg;/*!< We reserve IO requests in round
 +				robin fashion to different segments.
 +				This points to the segment that is to
 +				be used to service next IO request. */
 +	ulint		n_reserved;
 +				/*!< Number of reserved slots in the
 +				aio array outside the ibuf segment */
 +	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
 +#ifdef __WIN__
 +	HANDLE*		handles;
 +				/*!< Pointer to an array of OS native
 +				event handles where we copied the
 +				handles from slots, in the same
 +				order. This can be used in
 +				WaitForMultipleObjects; used only in
 +				Windows */
 +#endif /* __WIN__ */
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	io_context_t*		aio_ctx;
 +				/* completion queue for IO. There is
 +				one such queue per segment. Each thread
 +				will work on one ctx exclusively. */
 +	struct io_event*	aio_events;
 +				/* The array to collect completed IOs.
 +				There is one such event for each
 +				possible pending IO. The size of the
 +				array is equal to n_slots. */
 +#endif /* LINUX_NATIVE_AIO */
 +};
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/** timeout for each io_getevents() call = 500ms (the value below is
 +that interval expressed in nanoseconds). */
 +#define OS_AIO_REAP_TIMEOUT	(500000000UL)
 +
 +/** time to sleep, in microseconds, if io_setup() returns EAGAIN
 +(500000 microseconds = 500ms). */
 +#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)
 +
 +/** number of attempts before giving up on io_setup(). */
 +#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
 +#endif /* LINUX_NATIVE_AIO */
 +
 +/** Array of events used in simulated aio. */
 +static os_event_t*	os_aio_segment_wait_events;
 +
 +/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
 +are NULL when the module has not yet been initialized. @{ */
 +static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
 +static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
 +static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
 +static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
 +static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
 +/* @} */
 +
 +/** Number of asynchronous I/O segments.  Set by os_aio_init(). */
 +static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
 +
 +/** If the following is TRUE, read i/o handler threads try to
 +wait until a batch of new read requests have been posted */
 +static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +/** File I/O statistics. NOTE(review): judging by the names these count
 +file reads, bytes read, file writes and fsyncs, and the *_old variables
 +presumably hold the values as of the previous printout (os_last_printout);
 +confirm against the code that updates them, which is outside this section. */
 +UNIV_INTERN ulint	os_n_file_reads		= 0;
 +UNIV_INTERN ulint	os_bytes_read_since_printout = 0;
 +UNIV_INTERN ulint	os_n_file_writes	= 0;
 +UNIV_INTERN ulint	os_n_fsyncs		= 0;
 +UNIV_INTERN ulint	os_n_file_reads_old	= 0;
 +UNIV_INTERN ulint	os_n_file_writes_old	= 0;
 +UNIV_INTERN ulint	os_n_fsyncs_old		= 0;
 +UNIV_INTERN time_t	os_last_printout;
 +
 +/** Set to TRUE by os_file_handle_error_cond_exit() once the "Disk is full"
 +warning has been printed, so that the warning is printed only once. */
 +UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;
 +
 +#ifdef UNIV_DEBUG
 +# ifndef UNIV_HOTBACKUP
 +/**********************************************************************//**
 +Throttled wrapper around os_aio_validate(): the real consistency check of
 +the aio system is run only on every OS_AIO_VALIDATE_SKIP-th call, all other
 +calls return immediately.
 +@return	TRUE if ok or the check was skipped */
 +UNIV_INTERN
 +ibool
 +os_aio_validate_skip(void)
 +/*======================*/
 +{
 +/** Try os_aio_validate() every this many times */
 +# define OS_AIO_VALIDATE_SKIP	13
 +
 +	/** Calls left until the next real validation. The type is signed
 +	on purpose: the unsynchronized decrement below can race. */
 +	static int calls_until_validate = OS_AIO_VALIDATE_SKIP;
 +
 +	/* The counter is updated without any synchronization. This is
 +	harmless, because it only serves as a heuristic that reduces
 +	how often the costly os_aio_validate() runs in debug builds. */
 +	calls_until_validate--;
 +
 +	if (calls_until_validate <= 0) {
 +		calls_until_validate = OS_AIO_VALIDATE_SKIP;
 +
 +		return(os_aio_validate());
 +	}
 +
 +	return(TRUE);
 +}
 +# endif /* !UNIV_HOTBACKUP */
 +#endif /* UNIV_DEBUG */
 +
 +#ifdef __WIN__
 +/***********************************************************************//**
 +Determines which Windows version family the process is running on, using
 +GetVersionEx(). Only available on Windows.
 +@return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
 +OS_WIN7. */
 +UNIV_INTERN
 +ulint
 +os_get_os_version(void)
 +/*===================*/
 +{
 +	OSVERSIONINFO	version;
 +
 +	version.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
 +
 +	ut_a(GetVersionEx(&version));
 +
 +	/* First sort out the non-NT platforms. */
 +	switch (version.dwPlatformId) {
 +	case VER_PLATFORM_WIN32s:
 +		return(OS_WIN31);
 +	case VER_PLATFORM_WIN32_WINDOWS:
 +		return(OS_WIN95);
 +	case VER_PLATFORM_WIN32_NT:
 +		break;
 +	default:
 +		ut_error;
 +		return(0);
 +	}
 +
 +	/* NT family: the major version selects the generation, and a
 +	minor version of 0 selects the first release of that generation. */
 +	const DWORD	major		= version.dwMajorVersion;
 +	const bool	first_minor	= (version.dwMinorVersion == 0);
 +
 +	if (major == 3 || major == 4) {
 +		return(OS_WINNT);
 +	}
 +
 +	if (major == 5) {
 +		return(first_minor ? OS_WIN2000 : OS_WINXP);
 +	}
 +
 +	if (major == 6) {
 +		return(first_minor ? OS_WINVISTA : OS_WIN7);
 +	}
 +
 +	/* Any other major version is reported as OS_WIN7. */
 +	return(OS_WIN7);
 +}
 +#endif /* __WIN__ */
 +
 +/***********************************************************************//**
 +Retrieves the last error number if an error occurs in a file io function.
 +The number should be retrieved before any other OS calls (because they may
 +overwrite the error number). Known OS errors are translated to OS_FILE_*
 +codes. If the number is not known to this program, OS_FILE_ERROR_MAX + the
 +OS error number is returned (historically described as "OS error number
 ++ 100"). If the OS reports no error at all, 0 is returned.
 +Unless suppressed, a diagnostic is printed to stderr; disk-full and
 +file-exists errors are printed only when report_all_errors is set.
 +@return	0, an OS_FILE_* error number, or OS_FILE_ERROR_MAX + OS error
 +number */
 +static
 +ulint
 +os_file_get_last_error_low(
 +/*=======================*/
 +	bool	report_all_errors,	/*!< in: TRUE if we want an error
 +					message printed of all errors */
 +	bool	on_error_silent)	/*!< in: TRUE then don't print any
 +					diagnostic to the log */
 +{
 +#ifdef __WIN__
 +
 +	/* Read the error code first, before any call that could replace it. */
 +	ulint	err = (ulint) GetLastError();
 +	if (err == ERROR_SUCCESS) {
 +		return(0);
 +	}
 +
 +	/* report_all_errors overrides on_error_silent. */
 +	if (report_all_errors
 +	    || (!on_error_silent
 +		&& err != ERROR_DISK_FULL
 +		&& err != ERROR_FILE_EXISTS)) {
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Operating system error number %lu"
 +			" in a file operation.\n", (ulong) err);
 +
 +		if (err == ERROR_PATH_NOT_FOUND) {
 +			fprintf(stderr,
 +				"InnoDB: The error means the system"
 +				" cannot find the path specified.\n");
 +
 +			if (srv_is_being_started) {
 +				fprintf(stderr,
 +					"InnoDB: If you are installing InnoDB,"
 +					" remember that you must create\n"
 +					"InnoDB: directories yourself, InnoDB"
 +					" does not create them.\n");
 +			}
 +		} else if (err == ERROR_ACCESS_DENIED) {
 +			fprintf(stderr,
 +				"InnoDB: The error means mysqld does not have"
 +				" the access rights to\n"
 +				"InnoDB: the directory. It may also be"
 +				" you have created a subdirectory\n"
 +				"InnoDB: of the same name as a data file.\n");
 +		} else if (err == ERROR_SHARING_VIOLATION
 +			   || err == ERROR_LOCK_VIOLATION) {
 +			fprintf(stderr,
 +				"InnoDB: The error means that another program"
 +				" is using InnoDB's files.\n"
 +				"InnoDB: This might be a backup or antivirus"
 +				" software or another instance\n"
 +				"InnoDB: of MySQL."
 +				" Please close it to get rid of this error.\n");
 +		} else if (err == ERROR_WORKING_SET_QUOTA
 +			   || err == ERROR_NO_SYSTEM_RESOURCES) {
 +			fprintf(stderr,
 +				"InnoDB: The error means that there are no"
 +				" sufficient system resources or quota to"
 +				" complete the operation.\n");
 +		} else if (err == ERROR_OPERATION_ABORTED) {
 +			fprintf(stderr,
 +				"InnoDB: The error means that the I/O"
 +				" operation has been aborted\n"
 +				"InnoDB: because of either a thread exit"
 +				" or an application request.\n"
 +				"InnoDB: Retry attempt is made.\n");
 +		} else {
 +			fprintf(stderr,
 +				"InnoDB: Some operating system error numbers"
 +				" are described at\n"
 +				"InnoDB: "
 +				REFMAN
 +				"operating-system-error-codes.html\n");
 +		}
 +	}
 +
 +	fflush(stderr);
 +
 +	/* Translate the Windows error code to the portable OS_FILE_* code. */
 +	if (err == ERROR_FILE_NOT_FOUND) {
 +		return(OS_FILE_NOT_FOUND);
 +	} else if (err == ERROR_DISK_FULL) {
 +		return(OS_FILE_DISK_FULL);
 +	} else if (err == ERROR_FILE_EXISTS) {
 +		return(OS_FILE_ALREADY_EXISTS);
 +	} else if (err == ERROR_SHARING_VIOLATION
 +		   || err == ERROR_LOCK_VIOLATION) {
 +		return(OS_FILE_SHARING_VIOLATION);
 +	} else if (err == ERROR_WORKING_SET_QUOTA
 +		   || err == ERROR_NO_SYSTEM_RESOURCES) {
 +		return(OS_FILE_INSUFFICIENT_RESOURCE);
 +	} else if (err == ERROR_OPERATION_ABORTED) {
 +		return(OS_FILE_OPERATION_ABORTED);
 +	} else if (err == ERROR_ACCESS_DENIED) {
 +		return(OS_FILE_ACCESS_VIOLATION);
 +	} else if (err == ERROR_BUFFER_OVERFLOW) {
 +		return(OS_FILE_NAME_TOO_LONG);
 +	} else {
 +		return(OS_FILE_ERROR_MAX + err);
 +	}
 +#else
 +	/* Copy errno first: the printing below may overwrite it. */
 +	int err = errno;
 +	if (err == 0) {
 +		return(0);
 +	}
 +
 +	/* report_all_errors overrides on_error_silent. */
 +	if (report_all_errors
 +	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Operating system error number %d"
 +			" in a file operation.\n", err);
 +
 +		if (err == ENOENT) {
 +			fprintf(stderr,
 +				"InnoDB: The error means the system"
 +				" cannot find the path specified.\n");
 +
 +			if (srv_is_being_started) {
 +				fprintf(stderr,
 +					"InnoDB: If you are installing InnoDB,"
 +					" remember that you must create\n"
 +					"InnoDB: directories yourself, InnoDB"
 +					" does not create them.\n");
 +			}
 +		} else if (err == EACCES) {
 +			fprintf(stderr,
 +				"InnoDB: The error means mysqld does not have"
 +				" the access rights to\n"
 +				"InnoDB: the directory.\n");
 +		} else {
 +			if (strerror(err) != NULL) {
 +				fprintf(stderr,
 +					"InnoDB: Error number %d"
 +					" means '%s'.\n",
 +					err, strerror(err));
 +			}
 +
 +
 +			fprintf(stderr,
 +				"InnoDB: Some operating system"
 +				" error numbers are described at\n"
 +				"InnoDB: "
 +				REFMAN
 +				"operating-system-error-codes.html\n");
 +		}
 +	}
 +
 +	fflush(stderr);
 +
 +	/* Translate errno to the portable OS_FILE_* code. EAGAIN and EINTR
 +	get a dedicated code only when srv_use_native_aio is set; otherwise
 +	they fall through to the generic OS_FILE_ERROR_MAX + err result. */
 +	switch (err) {
 +	case ENOSPC:
 +		return(OS_FILE_DISK_FULL);
 +	case ENOENT:
 +		return(OS_FILE_NOT_FOUND);
 +	case EEXIST:
 +		return(OS_FILE_ALREADY_EXISTS);
 +	case ENAMETOOLONG:
 +		return(OS_FILE_NAME_TOO_LONG);
 +	case EXDEV:
 +	case ENOTDIR:
 +	case EISDIR:
 +		return(OS_FILE_PATH_ERROR);
 +	case EAGAIN:
 +		if (srv_use_native_aio) {
 +			return(OS_FILE_AIO_RESOURCES_RESERVED);
 +		}
 +		break;
 +	case EINTR:
 +		if (srv_use_native_aio) {
 +			return(OS_FILE_AIO_INTERRUPTED);
 +		}
 +		break;
 +	case EACCES:
 +		return(OS_FILE_ACCESS_VIOLATION);
 +	}
 +	return(OS_FILE_ERROR_MAX + err);
 +#endif
 +}
 +
 +/***********************************************************************//**
 +Public entry point for retrieving the last file io error. Forwards to
 +os_file_get_last_error_low() with diagnostics enabled (not silent).
 +The number should be retrieved before any other OS calls (because they may
 +overwrite the error number). If the number is not known to this program,
 +the OS error number + 100 is returned.
 +@return	error number, or OS error number + 100 */
 +UNIV_INTERN
 +ulint
 +os_file_get_last_error(
 +/*===================*/
 +	bool	report_all_errors)	/*!< in: TRUE if we want an error
 +					message printed of all errors */
 +{
 +	/* Diagnostics are never suppressed through this entry point. */
 +	const bool	silent = false;
 +
 +	return(os_file_get_last_error_low(report_all_errors, silent));
 +}
 +
 +/****************************************************************//**
 +Decides how to react to a failed file operation, based on the error
 +reported by os_file_get_last_error_low().
 +Conditionally exits (calling exit(3)) based on should_exit value and the
 +error type, if should_exit is TRUE then on_error_silent is ignored.
 +@return	TRUE if we should retry the operation */
 +static
 +ibool
 +os_file_handle_error_cond_exit(
 +/*===========================*/
 +	const char*	name,		/*!< in: name of a file or NULL */
 +	const char*	operation,	/*!< in: operation */
 +	ibool		should_exit,	/*!< in: call exit(3) if unknown error
 +					and this parameter is TRUE */
 +	ibool		on_error_silent)/*!< in: if TRUE then don't print
 +					any message to the log iff it is
 +					an unknown non-fatal error */
 +{
 +	const ulint	os_err = os_file_get_last_error_low(
 +		false, on_error_silent);
 +
 +	switch (os_err) {
 +	case OS_FILE_AIO_RESOURCES_RESERVED:
 +	case OS_FILE_AIO_INTERRUPTED:
 +		/* The caller should simply retry. */
 +		return(TRUE);
 +
 +	case OS_FILE_SHARING_VIOLATION:
 +		/* Wait before asking the caller to retry. */
 +		os_thread_sleep(10000000);  /* 10 sec */
 +		return(TRUE);
 +
 +	case OS_FILE_OPERATION_ABORTED:
 +	case OS_FILE_INSUFFICIENT_RESOURCE:
 +		/* Back off briefly, then retry. */
 +		os_thread_sleep(100000);	/* 100 ms */
 +		return(TRUE);
 +
 +	case OS_FILE_PATH_ERROR:
 +	case OS_FILE_ALREADY_EXISTS:
 +	case OS_FILE_ACCESS_VIOLATION:
 +		/* Retrying cannot help; nothing is printed here. */
 +		return(FALSE);
 +
 +	case OS_FILE_DISK_FULL:
 +		/* The disk full warning is printed only once, and it is
 +		printed irrespective of the on_error_silent setting. */
 +		if (!os_has_said_disk_full) {
 +
 +			if (name) {
 +				ut_print_timestamp(stderr);
 +				fprintf(stderr,
 +					"  InnoDB: Encountered a problem with"
 +					" file %s\n", name);
 +			}
 +
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				"  InnoDB: Disk is full. Try to clean the disk"
 +				" to free space.\n");
 +
 +			os_has_said_disk_full = TRUE;
 +
 +			fflush(stderr);
 +			ut_error;
 +		}
 +
 +		return(FALSE);
 +
 +	default:
 +		break;
 +	}
 +
 +	/* Unknown error. If the operation can bring the server down on
 +	error, on_error_silent is ignored and the message is always
 +	written to the log. */
 +	if (should_exit || !on_error_silent) {
 +		ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
 +			"error " ULINTPF ".%s", name ? name : "(unknown)",
 +			operation, os_err, should_exit
 +			? " Cannot continue operation" : "");
 +	}
 +
 +	if (should_exit) {
 +		exit(1);
 +	}
 +
 +	return(FALSE);
 +}
 +
 +/****************************************************************//**
 +Handles a failed file operation; an unknown error terminates the process
 +and diagnostics are never suppressed.
 +@return	TRUE if we should retry the operation */
 +static
 +ibool
 +os_file_handle_error(
 +/*=================*/
 +	const char*	name,		/*!< in: name of a file or NULL */
 +	const char*	operation)	/*!< in: operation */
 +{
 +	const ibool	exit_on_unknown_error	= TRUE;
 +	const ibool	silent			= FALSE;
 +
 +	return(os_file_handle_error_cond_exit(
 +			name, operation, exit_on_unknown_error, silent));
 +}
 +
 +/****************************************************************//**
 +Handles a failed file operation without ever terminating the process,
 +even if the error is unknown.
 +@return	TRUE if we should retry the operation */
 +ibool
 +os_file_handle_error_no_exit(
 +/*=========================*/
 +	const char*	name,		/*!< in: name of a file or NULL */
 +	const char*	operation,	/*!< in: operation */
 +	ibool		on_error_silent)/*!< in: if TRUE then don't print
 +					any message to the log. */
 +{
 +	const ibool	exit_on_unknown_error = FALSE;
 +
 +	return(os_file_handle_error_cond_exit(
 +			name, operation, exit_on_unknown_error,
 +			on_error_silent));
 +}
 +
 +#undef USE_FILE_LOCK
 +#define USE_FILE_LOCK
 +#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
 +/* InnoDB Hot Backup does not lock the data files.
 + * On Windows, mandatory locking is used.
 + */
 +# undef USE_FILE_LOCK
 +#endif
 +#ifdef USE_FILE_LOCK
 +/****************************************************************//**
 +Obtain an exclusive lock on a file. The lock is a non-blocking fcntl()
 +write lock (F_SETLK with F_WRLCK) covering the whole file. On failure an
 +error is logged, followed by a hint when the error is EAGAIN or EACCES.
 +@return	0 on success, -1 on failure */
 +static
 +int
 +os_file_lock(
 +/*=========*/
 +	int		fd,	/*!< in: file descriptor */
 +	const char*	name)	/*!< in: file name */
 +{
 +	struct flock lk;
 +
 +	ut_ad(!srv_read_only_mode);
 +
 +	lk.l_type = F_WRLCK;
 +	lk.l_whence = SEEK_SET;
 +	/* l_start = 0 with l_len = 0 locks from the start to end of file. */
 +	lk.l_start = lk.l_len = 0;
 +
 +	if (fcntl(fd, F_SETLK, &lk) == -1) {
 +
 +		/* Save errno at once: the logging call below may
 +		overwrite it before it is examined. */
 +		const int	lock_errno = errno;
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unable to lock %s, error: %d", name, lock_errno);
 +
 +		if (lock_errno == EAGAIN || lock_errno == EACCES) {
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"Check that you do not already have "
 +				"another mysqld process using the "
 +				"same InnoDB data or log files.");
 +		}
 +
 +		return(-1);
 +	}
 +
 +	return(0);
 +}
 +#endif /* USE_FILE_LOCK */
 +
 +#ifndef UNIV_HOTBACKUP
 +/****************************************************************//**
 +Creates the seek mutexes used in positioned reads and writes: one mutex
 +for each of the OS_FILE_N_SEEK_MUTEXES entries of os_file_seek_mutexes. */
 +UNIV_INTERN
 +void
 +os_io_init_simple(void)
 +/*===================*/
 +{
 +	ulint	slot = 0;
 +
 +	while (slot < OS_FILE_N_SEEK_MUTEXES) {
 +		os_file_seek_mutexes[slot] = os_mutex_create();
 +		slot++;
 +	}
 +}
 +
 +/** Create a temporary file. This function is like tmpfile(3), but
 +the temporary file is created in the given parameter path. If the path
 +is null then it will create the file in the mysql server configuration
 +parameter (--tmpdir).
 +On failure an error message containing errno is printed to stderr, and a
 +descriptor that was obtained but could not be wrapped in a stream is closed.
 +@param[in]	path	location for creating temporary file
 +@return temporary file handle, or NULL on error */
 +UNIV_INTERN
 +FILE*
 +os_file_create_tmpfile(
 +	const char*	path)
 +{
 +	FILE*	file	= NULL;
 +	WAIT_ALLOW_WRITES();
 +	int	fd	= innobase_mysql_tmpfile(path);
 +
 +	ut_ad(!srv_read_only_mode);
 +
 +	if (fd >= 0) {
 +		file = fdopen(fd, "w+b");
 +	}
 +
 +	if (!file) {
 +		/* Save errno before ut_print_timestamp() writes to stderr:
 +		that output may change errno, and the message would then
 +		report the wrong error number. */
 +		const int	saved_errno = errno;
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Error: unable to create temporary file;"
 +			" errno: %d\n", saved_errno);
 +		if (fd >= 0) {
 +			/* fdopen() failed, so the descriptor is still
 +			ours to close. */
 +			close(fd);
 +		}
 +	}
 +
 +	return(file);
 +}
 +#endif /* !UNIV_HOTBACKUP */
 +
 +/***********************************************************************//**
 +Opens a directory stream for the directory named by dirname. The stream
 +is positioned at the first entry. In both Unix and Windows we automatically
 +skip the '.' and '..' items at the start of the directory listing.
 +@return	directory stream, NULL if error */
 +UNIV_INTERN
 +os_file_dir_t
 +os_file_opendir(
 +/*============*/
 +	const char*	dirname,	/*!< in: directory name; it must not
 +					contain a trailing '\' or '/' */
 +	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
 +					error as a fatal error; if we try to
 +					open symlinks then we do not wish a
 +					fatal error if it happens not to be
 +					a directory */
 +{
 +	os_file_dir_t		dir;
 +#ifdef __WIN__
 +	char			pattern[OS_FILE_MAX_PATH + 3];
 +	const size_t		dirname_len = strlen(dirname);
 +
 +	ut_a(dirname_len < OS_FILE_MAX_PATH);
 +
 +	/* Build the search pattern "<dirname>\*". */
 +	strcpy(pattern, dirname);
 +	strcpy(pattern + dirname_len, "\\*");
 +
 +	/* On Windows, opening the 'directory stream' already fetches
 +	the first directory entry. That entry is '.', so discarding it
 +	is harmless: '.' and '..' are skipped over anyway. */
 +
 +	LPWIN32_FIND_DATA	first_entry = static_cast<LPWIN32_FIND_DATA>(
 +		ut_malloc(sizeof(WIN32_FIND_DATA)));
 +
 +	dir = FindFirstFile((LPCTSTR) pattern, first_entry);
 +
 +	ut_free(first_entry);
 +
 +	if (dir != INVALID_HANDLE_VALUE) {
 +
 +		return(dir);
 +	}
 +
 +	if (error_is_fatal) {
 +		os_file_handle_error(dirname, "opendir");
 +	}
 +
 +	return(NULL);
 +#else
 +	dir = opendir(dirname);
 +
 +	if (dir != NULL) {
 +
 +		return(dir);
 +	}
 +
 +	if (error_is_fatal) {
 +		os_file_handle_error(dirname, "opendir");
 +	}
 +
 +	return(dir);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Closes a directory stream. A failure is reported through
 +os_file_handle_error_no_exit(), so the process is not terminated.
 +@return	0 if success, -1 if failure */
 +UNIV_INTERN
 +int
 +os_file_closedir(
 +/*=============*/
 +	os_file_dir_t	dir)	/*!< in: directory stream */
 +{
 +#ifdef __WIN__
 +	if (FindClose(dir)) {
 +
 +		return(0);
 +	}
 +
 +	os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 +
 +	return(-1);
 +#else
 +	const int	rc = closedir(dir);
 +
 +	if (rc != 0) {
 +		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 +	}
 +
 +	/* The return value of closedir() is passed on unchanged. */
 +	return(rc);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +This function returns information of the next file in the directory. We jump
 +over the '.' and '..' entries in the directory. On success info->name,
 +info->size and info->type are filled in. On Unix, an entry that disappears
 +between the directory read and the stat() call is silently skipped.
 +@return	0 if ok, -1 if error, 1 if at the end of the directory */
 +UNIV_INTERN
 +int
 +os_file_readdir_next_file(
 +/*======================*/
 +	const char*	dirname,/*!< in: directory name or path */
 +	os_file_dir_t	dir,	/*!< in: directory stream */
 +	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
 +{
 +#ifdef __WIN__
 +	LPWIN32_FIND_DATA	lpFindFileData;
 +	BOOL			ret;
 +
 +	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
 +		ut_malloc(sizeof(WIN32_FIND_DATA)));
 +next_file:
 +	ret = FindNextFile(dir, lpFindFileData);
 +
 +	if (ret) {
 +		ut_a(strlen((char*) lpFindFileData->cFileName)
 +		     < OS_FILE_MAX_PATH);
 +
 +		if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
 +		    || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
 +
 +			goto next_file;
 +		}
 +
 +		strcpy(info->name, (char*) lpFindFileData->cFileName);
 +
 +		/* Combine the two 32-bit halves into the 64-bit file size. */
 +		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
 +			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
 +			   << 32);
 +
 +		if (lpFindFileData->dwFileAttributes
 +		    & FILE_ATTRIBUTE_REPARSE_POINT) {
 +			/* TODO: test Windows symlinks */
 +			/* TODO: MySQL has apparently its own symlink
 +			implementation in Windows, dbname.sym can
 +			redirect a database directory:
 +			REFMAN "windows-symbolic-links.html" */
 +			info->type = OS_FILE_TYPE_LINK;
 +		} else if (lpFindFileData->dwFileAttributes
 +			   & FILE_ATTRIBUTE_DIRECTORY) {
 +			info->type = OS_FILE_TYPE_DIR;
 +		} else {
 +			/* It is probably safest to assume that all other
 +			file types are normal. Better to check them rather
 +			than blindly skip them. */
 +
 +			info->type = OS_FILE_TYPE_FILE;
 +		}
 +	}
 +
 +	/* NOTE(review): GetLastError() below is read after ut_free(); if
 +	ut_free() can change the thread's last-error value, the
 +	end-of-directory test may see the wrong code -- confirm. */
 +	ut_free(lpFindFileData);
 +
 +	if (ret) {
 +		return(0);
 +	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
 +
 +		return(1);
 +	} else {
 +		os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
 +		return(-1);
 +	}
 +#else
 +	struct dirent*	ent;
 +	char*		full_path;
 +	int		ret;
 +	struct stat	statinfo;
 +#ifdef HAVE_READDIR_R
 +	char		dirent_buf[sizeof(struct dirent)
 +				   + _POSIX_PATH_MAX + 100];
 +	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
 +	the max file name len; but in most standards, the
 +	length is NAME_MAX; we add 100 to be even safer */
 +#endif
 +
 +next_file:
 +
 +#ifdef HAVE_READDIR_R
 +	ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
 +
 +	if (ret != 0
 +#ifdef UNIV_AIX
 +	    /* On AIX, only if we got non-NULL 'ent' (result) value and
 +	    a non-zero 'ret' (return) value, it indicates a failed
 +	    readdir_r() call. An NULL 'ent' with an non-zero 'ret'
 +	    would indicate the "end of the directory" is reached. */
 +	    && ent != NULL
 +#endif
 +	   ) {
 +		fprintf(stderr,
 +			"InnoDB: cannot read directory %s, error %lu\n",
 +			dirname, (ulong) ret);
 +
 +		return(-1);
 +	}
 +
 +	if (ent == NULL) {
 +		/* End of directory */
 +
 +		return(1);
 +	}
 +
 +	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
 +#else
 +	ent = readdir(dir);
 +
 +	/* Without readdir_r(), a NULL result is always reported as end of
 +	directory; a read error is not distinguished from it here. */
 +	if (ent == NULL) {
 +
 +		return(1);
 +	}
 +#endif
 +	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
 +
 +	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
 +
 +		goto next_file;
 +	}
 +
 +	strcpy(info->name, ent->d_name);
 +
 +	/* Room for "dirname/d_name" plus the terminating NUL, with slack. */
 +	full_path = static_cast<char*>(
 +		ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
 +
 +	sprintf(full_path, "%s/%s", dirname, ent->d_name);
 +
 +	ret = stat(full_path, &statinfo);
 +
 +	if (ret) {
 +
 +		if (errno == ENOENT) {
 +			/* readdir() returned a file that does not exist,
 +			it must have been deleted in the meantime. Do what
 +			would have happened if the file was deleted before
 +			readdir() - ignore and go to the next entry.
 +			If this is the last entry then info->name will still
 +			contain the name of the deleted file when this
 +			function returns, but this is not an issue since the
 +			caller shouldn't be looking at info when end of
 +			directory is returned. */
 +
 +			ut_free(full_path);
 +
 +			goto next_file;
 +		}
 +
 +		os_file_handle_error_no_exit(full_path, "stat", FALSE);
 +
 +		ut_free(full_path);
 +
 +		return(-1);
 +	}
 +
 +	info->size = (ib_int64_t) statinfo.st_size;
 +
 +	/* NOTE(review): stat() follows symbolic links, so the S_ISLNK()
 +	branch below cannot be taken for its result; a symlink is reported
 +	with the type of its target. lstat() would be needed to detect
 +	links -- confirm whether the current behaviour is intended. */
 +	if (S_ISDIR(statinfo.st_mode)) {
 +		info->type = OS_FILE_TYPE_DIR;
 +	} else if (S_ISLNK(statinfo.st_mode)) {
 +		info->type = OS_FILE_TYPE_LINK;
 +	} else if (S_ISREG(statinfo.st_mode)) {
 +		info->type = OS_FILE_TYPE_FILE;
 +	} else {
 +		info->type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +
 +	ut_free(full_path);
 +
 +	return(0);
 +#endif
 +}
 +
 +/*****************************************************************//**
 +Attempts to create a directory named pathname. The new directory gets
 +default permissions. On Unix the permissions are (0770 & ~umask).
 +A directory that already exists counts as success, unless the
 +fail_if_exists argument is true. Any other failure, such as a permission
 +error, does not crash: the error is reported and FALSE is returned.
 +@return	TRUE if call succeeds, FALSE on error */
 +UNIV_INTERN
 +ibool
 +os_file_create_directory(
 +/*=====================*/
 +	const char*	pathname,	/*!< in: directory name as
 +					null-terminated string */
 +	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
 +					is treated as an error. */
 +{
 +#ifdef __WIN__
 +	const BOOL	created = CreateDirectory((LPCTSTR) pathname, NULL);
 +
 +	/* Success: either the directory was created, or it was already
 +	there and the caller tolerates that. */
 +	if (created != 0
 +	    || (GetLastError() == ERROR_ALREADY_EXISTS
 +		&& !fail_if_exists)) {
 +
 +		return(TRUE);
 +	}
 +
 +	os_file_handle_error_no_exit(
 +		pathname, "CreateDirectory", FALSE);
 +
 +	return(FALSE);
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	const int	mkdir_rc = mkdir(pathname, 0770);
 +
 +	/* Success: either the directory was created, or it was already
 +	there and the caller tolerates that. */
 +	if (mkdir_rc == 0 || (errno == EEXIST && !fail_if_exists)) {
 +
 +		return (TRUE);
 +	}
 +
 +	/* failure */
 +	os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
 +
 +	return(FALSE);
 +#endif /* __WIN__ */
 +}
 +
 +/****************************************************************//**
 +NOTE! Use the corresponding macro os_file_create_simple(), not directly
 +this function!
 +A simple function to open or create a file. The open is retried for as
 +long as os_file_handle_error() asks for a retry. With create mode
 +OS_FILE_CREATE_PATH the missing subdirectories along the path are created
 +first. Where USE_FILE_LOCK is defined, a file opened with
 +OS_FILE_READ_WRITE is also locked exclusively; if locking fails the file is
 +closed again and the call fails.
 +@return own: handle to the file, not defined if error, error number
 +can be retrieved with os_file_get_last_error */
 +UNIV_INTERN
 +os_file_t
 +os_file_create_simple_func(
 +/*=======================*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	ulint		create_mode,/*!< in: create mode */
 +	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 +				OS_FILE_READ_WRITE */
 +	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 +{
 +	os_file_t	file;
 +	ibool		retry;
 +
 +	*success = FALSE;
 +#ifdef __WIN__
 +	DWORD		access;
 +	DWORD		create_flag;
 +	DWORD		attributes	= 0;
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	/* Map the create mode to a CreateFile() disposition. In read-only
 +	mode every mode other than OS_FILE_OPEN is also opened as
 +	OPEN_EXISTING, because that test comes before the other modes. */
 +	if (create_mode == OS_FILE_OPEN) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (srv_read_only_mode) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		create_flag = CREATE_NEW;
 +
 +	} else if (create_mode == OS_FILE_CREATE_PATH) {
 +
 +		ut_a(!srv_read_only_mode);
 +
 +		/* Create subdirs along the path if needed  */
 +		*success = os_file_create_subdirs_if_needed(name);
 +
 +		if (!*success) {
 +
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"Unable to create subdirectories '%s'",
 +				name);
 +
 +			return((os_file_t) -1);
 +		}
 +
 +		create_flag = CREATE_NEW;
 +		/* From here on the call is treated as a plain create. */
 +		create_mode = OS_FILE_CREATE;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		return((os_file_t) -1);
 +	}
 +
 +	/* Map the access type; read-only mode downgrades it to read. */
 +	if (access_type == OS_FILE_READ_ONLY) {
 +		access = GENERIC_READ;
 +	} else if (srv_read_only_mode) {
 +
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"read only mode set. Unable to "
 +			"open file '%s' in RW mode, trying RO mode", name);
 +
 +		access = GENERIC_READ;
 +
 +	} else if (access_type == OS_FILE_READ_WRITE) {
 +		access = GENERIC_READ | GENERIC_WRITE;
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file access type (%lu) for file '%s'",
 +			access_type, name);
 +
 +		return((os_file_t) -1);
 +	}
 +
 +	do {
 +		/* Use default security attributes and no template file. */
 +
 +		file = CreateFile(
 +			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
 +			create_flag, attributes, NULL);
 +
 +		if (file == INVALID_HANDLE_VALUE) {
 +
 +			*success = FALSE;
 +
 +			retry = os_file_handle_error(
 +				name, create_mode == OS_FILE_OPEN ?
 +				"open" : "create");
 +
 +		} else {
 +			*success = TRUE;
 +			retry = false;
 +		}
 +
 +	} while (retry);
 +
 +#else /* __WIN__ */
 +	int		create_flag;
 +	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
 +		WAIT_ALLOW_WRITES();
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	/* Map the create mode and access type to open(2) flags. In
 +	read-only mode every mode other than OS_FILE_OPEN becomes a plain
 +	O_RDONLY open, because that test comes before the other modes. */
 +	if (create_mode == OS_FILE_OPEN) {
 +
 +		if (access_type == OS_FILE_READ_ONLY) {
 +			create_flag = O_RDONLY;
 +		} else if (srv_read_only_mode) {
 +			create_flag = O_RDONLY;
 +		} else {
 +			create_flag = O_RDWR;
 +		}
 +
 +	} else if (srv_read_only_mode) {
 +
 +		create_flag = O_RDONLY;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +
 +	} else if (create_mode == OS_FILE_CREATE_PATH) {
 +
 +		/* Create subdirs along the path if needed  */
 +
 +		*success = os_file_create_subdirs_if_needed(name);
 +
 +		if (!*success) {
 +
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"Unable to create subdirectories '%s'",
 +				name);
 +
 +			return((os_file_t) -1);
 +		}
 +
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +		/* From here on the call is treated as a plain create. */
 +		create_mode = OS_FILE_CREATE;
 +	} else {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		return((os_file_t) -1);
 +	}
 +
 +	do {
- 		file = ::open(name, create_flag, os_innodb_umask);
++		file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
 +
 +		if (file == -1) {
 +			*success = FALSE;
 +
 +			retry = os_file_handle_error(
 +				name,
 +				create_mode == OS_FILE_OPEN
 +				?  "open" : "create");
 +		} else {
 +			*success = TRUE;
 +			retry = false;
 +		}
 +
 +	} while (retry);
 +
 +#ifdef USE_FILE_LOCK
 +	/* Lock files opened for writing; a file that cannot be locked is
 +	closed again and reported as a failure. */
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && access_type == OS_FILE_READ_WRITE
 +	    && os_file_lock(file, name)) {
 +
 +		*success = FALSE;
 +		close(file);
 +		file = -1;
 +	}
 +#endif /* USE_FILE_LOCK */
 +
 +#endif /* __WIN__ */
 +
 +	return(file);
 +}
 +
 +/****************************************************************//**
 +NOTE! Use the corresponding macro
 +os_file_create_simple_no_error_handling(), not directly this function!
 +A simple function to open or create a file. Unlike
 +os_file_create_simple_func(), a failed open is neither retried nor passed
 +to os_file_handle_error(); it is only reported through *success.
 +@return own: handle to the file, not defined if error, error number
 +can be retrieved with os_file_get_last_error */
 +UNIV_INTERN
 +pfs_os_file_t
 +os_file_create_simple_no_error_handling_func(
 +/*=========================================*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	ulint		create_mode,/*!< in: create mode */
 +	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 +				OS_FILE_READ_WRITE, or
 +				OS_FILE_READ_ALLOW_DELETE; the last option is
 +				used by a backup program reading the file */
 +	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 +{
 +	pfs_os_file_t	file;
 +
 +	*success = FALSE;
 +#ifdef __WIN__
 +	DWORD		access;
 +	DWORD		create_flag;
 +	DWORD		attributes	= 0;
 +	DWORD		share_mode	= FILE_SHARE_READ;
 +	ut_a(name);
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	/* In read-only mode every create mode other than OS_FILE_OPEN is
 +	also opened as OPEN_EXISTING. */
 +	if (create_mode == OS_FILE_OPEN) {
 +		create_flag = OPEN_EXISTING;
 +	} else if (srv_read_only_mode) {
 +		create_flag = OPEN_EXISTING;
 +	} else if (create_mode == OS_FILE_CREATE) {
 +		create_flag = CREATE_NEW;
 +	} else {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +	if (access_type == OS_FILE_READ_ONLY) {
 +		access = GENERIC_READ;
 +	} else if (srv_read_only_mode) {
 +		access = GENERIC_READ;
 +	} else if (access_type == OS_FILE_READ_WRITE) {
 +		access = GENERIC_READ | GENERIC_WRITE;
 +	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
 +
 +		ut_a(!srv_read_only_mode);
 +
 +		access = GENERIC_READ;
 +
 +		/* A backup program has to give mysqld the maximum
 +		freedom to do what it likes with the file */
 +
 +		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file access type (%lu) for file '%s'",
 +			access_type, name);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +	file.m_file = CreateFile((LPCTSTR) name,
 +			  access,
 +			  share_mode,
 +			  NULL,			// Security attributes
 +			  create_flag,
 +			  attributes,
 +			  NULL);		// No template file
 +
 +	*success = (file.m_file != INVALID_HANDLE_VALUE);
 +#else /* __WIN__ */
 +	int		create_flag;
 +	/* Operation name handed to os_file_set_nocache() for its
 +	diagnostic message. */
 +	const char*	mode_str	= NULL;
 +	ut_a(name);
 +	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
 +		WAIT_ALLOW_WRITES();
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	/* Map the create mode and access type to open(2) flags. In
 +	read-only mode every mode other than OS_FILE_OPEN becomes a plain
 +	O_RDONLY open. */
 +	if (create_mode == OS_FILE_OPEN) {
 +
 +		mode_str = "OPEN";
 +
 +		if (access_type == OS_FILE_READ_ONLY) {
 +
 +			create_flag = O_RDONLY;
 +
 +		} else if (srv_read_only_mode) {
 +
 +			create_flag = O_RDONLY;
 +
 +		} else {
 +
 +			ut_a(access_type == OS_FILE_READ_WRITE
 +			     || access_type == OS_FILE_READ_ALLOW_DELETE);
 +
 +			create_flag = O_RDWR;
 +		}
 +
 +	} else if (srv_read_only_mode) {
 +
 +		mode_str = "OPEN";
 +
 +		create_flag = O_RDONLY;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		mode_str = "CREATE";
 +
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +		file.m_file = -1;
 +		return(file);
 +	}
 +
- 	file.m_file = ::open(name, create_flag, os_innodb_umask);
++	file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
 +
 +	*success = file.m_file == -1 ? FALSE : TRUE;
 +
 +	/* This function is always called for data files, we should disable
 +	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
 +	we open the same file in the same mode, see man page of open(2). */
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
 +		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
 +
 +		os_file_set_nocache(file.m_file, name, mode_str);
 +	}
 +
 +#ifdef USE_FILE_LOCK
 +	/* Lock files opened for writing; a file that cannot be locked is
 +	closed again and reported as a failure. */
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && access_type == OS_FILE_READ_WRITE
 +	    && os_file_lock(file.m_file, name)) {
 +
 +		*success = FALSE;
 +		close(file.m_file);
 +		file.m_file = -1;
 +
 +	}
 +#endif /* USE_FILE_LOCK */
 +
 +#endif /* __WIN__ */
 +
 +	return(file);
 +}
 +
 +/****************************************************************//**
 +Tries to disable OS caching on an opened file descriptor.
 +On Solaris builds that define DIRECTIO_ON this calls directio(); otherwise,
 +where O_DIRECT is defined, it calls fcntl(F_SETFL, O_DIRECT). When neither
 +is available the body is empty. A failure is only logged; nothing is
 +reported back to the caller. */
 +UNIV_INTERN
 +void
 +os_file_set_nocache(
 +/*================*/
 +	int		fd		/*!< in: file descriptor to alter */
 +					MY_ATTRIBUTE((unused)),
 +	const char*	file_name	/*!< in: used in the diagnostic
 +					message */
 +					MY_ATTRIBUTE((unused)),
 +	const char*	operation_name MY_ATTRIBUTE((unused)))
 +					/*!< in: label of the operation that
 +					opened the file (the callers visible
 +					next to this function pass "OPEN",
 +					"CREATE" or "OVERWRITE"); used in the
 +					diagnostic message */
 +{
 +	/* some versions of Solaris may not have DIRECTIO_ON.
 +	NOTE(review): in the O_DIRECT branch below, F_SETFL is given O_DIRECT
 +	alone instead of the F_GETFL result with O_DIRECT OR-ed in; confirm
 +	that resetting the other settable status flags is intended. */
 +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
 +	if (directio(fd, DIRECTIO_ON) == -1) {
 +		int	errno_save = errno;
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
 +			"continuing anyway.",
 +			file_name, operation_name, strerror(errno_save));
 +	}
 +#elif defined(O_DIRECT)
 +	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
 +		int		errno_save = errno;
 +		static bool	warning_message_printed = false; /* EINVAL is reported once only */
 +		if (errno_save == EINVAL) {
 +			if (!warning_message_printed) {
 +				warning_message_printed = true;
 +# ifdef UNIV_LINUX
 +				ib_logf(IB_LOG_LEVEL_WARN,
 +					"Failed to set O_DIRECT on file "
 +					"%s: %s: %s, continuing anyway. "
 +					"O_DIRECT is known to result "
 +					"in 'Invalid argument' on Linux on "
 +					"tmpfs, see MySQL Bug#26662.",
 +					file_name, operation_name,
 +					strerror(errno_save));
 +# else /* UNIV_LINUX */
 +				goto short_warning;
 +# endif /* UNIV_LINUX */
 +			}
 +		} else {
 +# ifndef UNIV_LINUX
 +short_warning:
 +# endif
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Failed to set O_DIRECT on file %s: %s: %s, "
 +				"continuing anyway.",
 +				file_name, operation_name, strerror(errno_save));
 +		}
 +	}
 +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
 +}
 +
 +/****************************************************************//**
 +NOTE! Use the corresponding macro os_file_create(), not directly
 +this function!
 +Opens an existing file or creates a new.
 +The OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT bits may be OR-ed
 +into create_mode; they are stripped before the mode is interpreted and make
 +a failed open go through os_file_handle_error_no_exit() instead of
 +os_file_handle_error(). The open is repeated for as long as the error
 +handler asks for a retry. In srv_read_only_mode the creating modes fall back
 +to opening the existing file read-only.
 +On POSIX the descriptor is opened with O_CLOEXEC, O_DIRECT is requested for
 +non-log files when the flush method asks for it, and under USE_FILE_LOCK the
 +file is locked unless the mode is OS_FILE_OPEN_RAW (with up to 100 one-second
 +lock retries for OS_FILE_OPEN_RETRY).
 +*success is assigned on every return path, including the early returns taken
 +for an unknown create mode or purpose.
 +@return own: handle to the file, not defined if error, error number
 +can be retrieved with os_file_get_last_error */
 +UNIV_INTERN
 +pfs_os_file_t
 +os_file_create_func(
 +/*================*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	ulint		create_mode,/*!< in: create mode */
 +	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 +				non-buffered i/o is desired,
 +				OS_FILE_NORMAL, if any normal file;
 +				NOTE that it also depends on type, os_aio_..
 +				and srv_.. variables whether we really use
 +				async i/o or unbuffered i/o: look in the
 +				function source code for the exact rules */
 +	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
 +	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 +{
 +	pfs_os_file_t	file;
 +	ibool		retry;
 +	ibool		on_error_no_exit;
 +	ibool		on_error_silent;
 +#ifdef __WIN__
 +	DBUG_EXECUTE_IF(
 +		"ib_create_table_fail_disk_full",
 +		*success = FALSE;
 +		SetLastError(ERROR_DISK_FULL);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	);
 +#else /* __WIN__ */
 +	DBUG_EXECUTE_IF(
 +		"ib_create_table_fail_disk_full",
 +		*success = FALSE;
 +		errno = ENOSPC;
 +		file.m_file = -1;
 +		return(file);
 +	);
 +#endif /* __WIN__ */
 +
 +#ifdef __WIN__
 +	DWORD		create_flag;
 +	DWORD		share_mode	= FILE_SHARE_READ;
 +
 +	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
 +		? TRUE : FALSE;
 +
 +	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
 +		? TRUE : FALSE;
 +
 +	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
 +	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
 +
 +	if (create_mode == OS_FILE_OPEN_RAW) {
 +
 +		ut_a(!srv_read_only_mode);
 +
 +		create_flag = OPEN_EXISTING;
 +
 +		/* On Windows Physical devices require admin privileges and
 +		have to have the write-share mode set. See the remarks
 +		section for the CreateFile() function documentation in MSDN. */
 +
 +		share_mode |= FILE_SHARE_WRITE;
 +
 +	} else if (create_mode == OS_FILE_OPEN
 +		   || create_mode == OS_FILE_OPEN_RETRY) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (srv_read_only_mode) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		create_flag = CREATE_NEW;
 +
 +	} else if (create_mode == OS_FILE_OVERWRITE) {
 +
 +		create_flag = CREATE_ALWAYS;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		/* Do not leave the out parameter indeterminate */
 +		*success = FALSE;
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +	DWORD		attributes = 0;
 +
 +#ifdef UNIV_HOTBACKUP
 +	attributes |= FILE_FLAG_NO_BUFFERING;
 +#else
 +	if (purpose == OS_FILE_AIO) {
 +
 +#ifdef WIN_ASYNC_IO
 +		/* If specified, use asynchronous (overlapped) io and no
 +		buffering of writes in the OS */
 +
 +		if (srv_use_native_aio) {
 +			attributes |= FILE_FLAG_OVERLAPPED;
 +		}
 +#endif /* WIN_ASYNC_IO */
 +
 +	} else if (purpose == OS_FILE_NORMAL) {
 +		/* Use default setting. */
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown purpose flag (%lu) while opening file '%s'",
 +			purpose, name);
 +		/* Do not leave the out parameter indeterminate */
 +		*success = FALSE;
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +#ifdef UNIV_NON_BUFFERED_IO
 +	// TODO: Create a bug, this looks wrong. The flush log
 +	// parameter is dynamic.
 +	if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
 +
 +		/* Do not use unbuffered i/o for the log files because
 +		value 2 denotes that we do not flush the log at every
 +		commit, but only once per second */
 +
 +	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
 +
 +		attributes |= FILE_FLAG_NO_BUFFERING;
 +	}
 +#endif /* UNIV_NON_BUFFERED_IO */
 +
 +#endif /* UNIV_HOTBACKUP */
 +	DWORD	access = GENERIC_READ;
 +
 +	if (!srv_read_only_mode) {
 +		access |= GENERIC_WRITE;
 +	}
 +
 +	do {
 +		/* Use default security attributes and no template file. */
 +		file.m_file = CreateFile(
 +			(LPCTSTR) name, access, share_mode, NULL,
 +			create_flag, attributes, NULL);
 +
 +		if (file.m_file == INVALID_HANDLE_VALUE) {
 +			const char*	operation;
 +
 +			operation = (create_mode == OS_FILE_CREATE
 +				     && !srv_read_only_mode)
 +				? "create" : "open";
 +
 +			*success = FALSE;
 +
 +			if (on_error_no_exit) {
 +				retry = os_file_handle_error_no_exit(
 +					name, operation, on_error_silent);
 +			} else {
 +				retry = os_file_handle_error(name, operation);
 +			}
 +		} else {
 +			*success = TRUE;
 +			retry = FALSE;
 +		}
 +
 +	} while (retry);
 +
 +#else /* __WIN__ */
 +	int		create_flag;
 +	const char*	mode_str	= NULL;
 +	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
 +		WAIT_ALLOW_WRITES();
 +
 +	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
 +		? TRUE : FALSE;
 +	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
 +		? TRUE : FALSE;
 +
 +	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
 +	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
 +
 +	if (create_mode == OS_FILE_OPEN
 +	    || create_mode == OS_FILE_OPEN_RAW
 +	    || create_mode == OS_FILE_OPEN_RETRY) {
 +
 +		mode_str = "OPEN";
 +
 +		create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
 +
 +	} else if (srv_read_only_mode) {
 +
 +		mode_str = "OPEN";
 +
 +		create_flag = O_RDONLY;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		mode_str = "CREATE";
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +
 +	} else if (create_mode == OS_FILE_OVERWRITE) {
 +
 +		mode_str = "OVERWRITE";
 +		create_flag = O_RDWR | O_CREAT | O_TRUNC;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		/* Do not leave the out parameter indeterminate */
 +		*success = FALSE;
 +		file.m_file = -1;
 +		return(file);
 +	}
 +
 +	ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
 +	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
 +
 +#ifdef O_SYNC
 +	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
 +	O_SYNC because the datasync options seemed to corrupt files in 2001
 +	in both Linux and Solaris */
 +
 +	if (!srv_read_only_mode
 +	    && type == OS_LOG_FILE
 +	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
 +
 +		create_flag |= O_SYNC;
 +	}
 +#endif /* O_SYNC */
 +
 +	do {
- 		file.m_file = ::open(name, create_flag, os_innodb_umask);
++		file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
 +
 +		if (file.m_file == -1) {
 +			const char*	operation;
 +
 +			operation = (create_mode == OS_FILE_CREATE
 +				     && !srv_read_only_mode)
 +				? "create" : "open";
 +
 +			*success = FALSE;
 +
 +			if (on_error_no_exit) {
 +				retry = os_file_handle_error_no_exit(
 +					name, operation, on_error_silent);
 +			} else {
 +				retry = os_file_handle_error(name, operation);
 +			}
 +		} else {
 +			*success = TRUE;
 +			retry = FALSE;
 +		}
 +
 +	} while (retry);
 +
 +	/* We disable OS caching (O_DIRECT) only on data files */
 +
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && type != OS_LOG_FILE
 +	    && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
 +		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
 +
 +		os_file_set_nocache(file.m_file, name, mode_str);
 +	}
 +
 +#ifdef USE_FILE_LOCK
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && create_mode != OS_FILE_OPEN_RAW
 +	    && os_file_lock(file.m_file, name)) {
 +
 +		/* os_file_lock() returned nonzero: the lock was not taken */
 +
 +		if (create_mode == OS_FILE_OPEN_RETRY) {
 +
 +			ut_a(!srv_read_only_mode);
 +
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"Retrying to lock the first data file");
 +
 +			for (int i = 0; i < 100; i++) {
 +				os_thread_sleep(1000000);
 +
 +				if (!os_file_lock(file.m_file, name)) {
 +					*success = TRUE;
 +					return(file);
 +				}
 +			}
 +
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"Unable to open the first data file");
 +		}
 +
 +		*success = FALSE;
 +		close(file.m_file);
 +		file.m_file = -1;
 +	}
 +#endif /* USE_FILE_LOCK */
 +
 +#endif /* __WIN__ */
 +
 +	return(file);
 +}
 +
 +/***********************************************************************//**
 +Removes a file, treating an already missing file as success. The caller
 +must have closed the file beforehand.
 +@return	true if the file was removed or was not there, false otherwise */
 +UNIV_INTERN
 +bool
 +os_file_delete_if_exists_func(
 +/*==========================*/
 +	const char*	name)	/*!< in: file path as a null-terminated
 +				string */
 +{
 +#ifdef __WIN__
 +	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
 +	it, hence the retry loop; n_failed counts the failed attempts */
 +
 +	for (ulint n_failed = 1;; n_failed++) {
 +
 +		if (DeleteFile((LPCTSTR) name)) {
 +			return(true);
 +		}
 +
 +		const DWORD	win_err = GetLastError();
 +
 +		if (win_err == ERROR_FILE_NOT_FOUND
 +		    || win_err == ERROR_PATH_NOT_FOUND) {
 +			/* nothing to delete: this is not an error */
 +
 +			return(true);
 +		}
 +
 +		if (n_failed > 100 && n_failed % 10 == 0) {
 +			/* print error information */
 +			os_file_get_last_error(true);
 +
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Delete of file %s failed.", name);
 +		}
 +
 +		/* sleep for 0.5 second */
 +		os_thread_sleep(500000);
 +
 +		if (n_failed > 2000) {
 +
 +			return(false);
 +		}
 +	}
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	if (unlink(name) == 0 || errno == ENOENT) {
 +
 +		return(true);
 +	}
 +
 +	os_file_handle_error_no_exit(name, "delete", FALSE);
 +
 +	return(false);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Removes a file. The caller must have closed the file beforehand. Unlike
 +os_file_delete_if_exists_func(), a missing file makes this return false.
 +@return	true if the file was removed, false otherwise */
 +UNIV_INTERN
 +bool
 +os_file_delete_func(
 +/*================*/
 +	const char*	name)	/*!< in: file path as a null-terminated
 +				string */
 +{
 +#ifdef __WIN__
 +	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
 +	it, hence the retry loop; n_failed counts the failed attempts */
 +
 +	for (ulint n_failed = 1;; n_failed++) {
 +
 +		if (DeleteFile((LPCTSTR) name)) {
 +			return(true);
 +		}
 +
 +		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
 +			/* A file that does not exist is classified as a
 +			'mild' error: give up at once instead of retrying */
 +
 +			return(false);
 +		}
 +
 +		if (n_failed > 100 && n_failed % 10 == 0) {
 +			/* print error information */
 +			os_file_get_last_error(true);
 +
 +			fprintf(stderr,
 +				"InnoDB: Warning: cannot delete file %s\n"
 +				"InnoDB: Are you running mysqlbackup"
 +				" to back up the file?\n", name);
 +		}
 +
 +		/* sleep for a second */
 +		os_thread_sleep(1000000);
 +
 +		if (n_failed > 2000) {
 +
 +			return(false);
 +		}
 +	}
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	if (unlink(name) == 0) {
 +
 +		return(true);
 +	}
 +
 +	os_file_handle_error_no_exit(name, "delete", FALSE);
 +
 +	return(false);
 +#endif
 +}
 +
 +/***********************************************************************//**
 +NOTE! Use the corresponding macro os_file_rename(), not directly this function!
 +Gives a file a new path, which may be in another directory. It is safest
 +to close the file before calling this. Debug builds assert that the new path
 +does not exist yet and that the old path does.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_rename_func(
 +/*================*/
 +	const char*	oldpath,/*!< in: old file path as a null-terminated
 +				string */
 +	const char*	newpath)/*!< in: new file path */
 +{
 +#ifdef UNIV_DEBUG
 +	os_file_type_t	file_type;
 +	ibool		is_present;
 +
 +	/* The destination must be free ... */
 +	ut_ad(os_file_status(newpath, &is_present, &file_type));
 +	ut_ad(!is_present);
 +
 +	/* ... and the origin must be there. */
 +	ut_ad(os_file_status(oldpath, &is_present, &file_type));
 +	ut_ad(is_present);
 +#endif /* UNIV_DEBUG */
 +
 +#ifdef __WIN__
 +	if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
 +		return(TRUE);
 +	}
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	if (rename(oldpath, newpath) == 0) {
 +		return(TRUE);
 +	}
 +#endif /* __WIN__ */
 +
 +	/* Both platforms report a failure the same way */
 +	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 +
 +	return(FALSE);
 +}
 +
 +/***********************************************************************//**
 +NOTE! Use the corresponding macro os_file_close(), not directly this function!
 +Closes a file handle. A failure is passed to os_file_handle_error(); the
 +error number can be retrieved with os_file_get_last_error.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_close_func(
 +/*===============*/
 +	os_file_t	file)	/*!< in, own: handle to a file */
 +{
 +#ifdef __WIN__
 +	const bool	closed = CloseHandle(file) != 0;
 +#else
 +	const bool	closed = close(file) != -1;
 +#endif /* __WIN__ */
 +
 +	if (!closed) {
 +		os_file_handle_error(NULL, "close");
 +
 +		return(FALSE);
 +	}
 +
 +	return(TRUE);
 +}
 +
 +#ifdef UNIV_HOTBACKUP
 +/***********************************************************************//**
 +Closes a file handle without invoking any error handler on failure.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_close_no_error_handling(
 +/*============================*/
 +	os_file_t	file)	/*!< in, own: handle to a file */
 +{
 +#ifdef __WIN__
 +	return(CloseHandle(file) ? TRUE : FALSE);
 +#else
 +	return(close(file) == -1 ? FALSE : TRUE);
 +#endif /* __WIN__ */
 +}
 +#endif /* UNIV_HOTBACKUP */
 +
 +/***********************************************************************//**
 +Gets a file size. On Windows the size comes from GetFileSize(); elsewhere it
 +is the offset returned by lseek(SEEK_END), so that call moves the file
 +position to the end of the file.
 +@return	file size, or (os_offset_t) -1 on failure */
 +UNIV_INTERN
 +os_offset_t
 +os_file_get_size(
 +/*=============*/
 +	pfs_os_file_t	file)	/*!< in: handle to a file */
 +{
 +#ifdef __WIN__
 +	DWORD		size_high;
 +	const DWORD	size_low = GetFileSize(file.m_file, &size_high);
 +
 +	/* 0xFFFFFFFF is also a valid low word, so it only means failure
 +	together with a nonzero last-error code */
 +	if (size_low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 +		return((os_offset_t) -1);
 +	}
 +
 +	return(((os_offset_t) size_high << 32) | (os_offset_t) size_low);
 +#else
 +	const off_t	end_pos = lseek(file.m_file, 0, SEEK_END);
 +
 +	return((os_offset_t) end_pos);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Write the specified number of zeros to a newly created file.
 +When HAVE_POSIX_FALLOCATE is defined and srv_use_posix_fallocate is set, the
 +space is reserved with posix_fallocate() instead (retried on EINTR while no
 +shutdown is in progress) and nothing is written. Otherwise zero-filled
 +buffers are written with os_file_write() and the file is flushed: on Windows
 +a single page at the desired end, elsewhere the whole range in chunks of up
 +to 64 pages. A size smaller than one page is written as one short chunk and
 +a size of zero writes nothing.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_set_size(
 +/*=============*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	pfs_os_file_t	file,	/*!< in: handle to a file */
 +	os_offset_t	size)	/*!< in: file size */
 +{
 +	ibool		ret = TRUE;
 +	byte*		buf;
 +	byte*		buf2;
 +	ulint		buf_size;
 +
 +#ifdef HAVE_POSIX_FALLOCATE
 +	if (srv_use_posix_fallocate) {
 +		int err;
 +		do {
 +			/* posix_fallocate() returns the error number
 +			directly; it does not use errno */
 +			err = posix_fallocate(file.m_file, 0, size);
 +		} while (err == EINTR
 +			 && srv_shutdown_state == SRV_SHUTDOWN_NONE);
 +
 +		if (err) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"preallocating " INT64PF " bytes for "
 +				"file %s failed with error %d",
 +				size, name, err);
 +		}
 +		return(!err);
 +	}
 +#endif
 +
 +#ifdef _WIN32
 +	/* Write 1 page of zeroes at the desired end. Clamp the start offset
 +	to 0 so that a size below one page cannot wrap around. */
 +	buf_size = UNIV_PAGE_SIZE;
 +	os_offset_t	current_size = size > (os_offset_t) buf_size
 +		? size - buf_size : 0;
 +#else
 +	/* Write up to 1 megabyte at a time. */
 +	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
 +		* UNIV_PAGE_SIZE;
 +
 +	if (buf_size == 0) {
 +		/* size is below one page: without a nonzero chunk size the
 +		loop below would write 0 bytes per round and never end */
 +		buf_size = UNIV_PAGE_SIZE;
 +	}
 +
 +	os_offset_t	current_size = 0;
 +#endif
 +	/* One extra page so that the buffer can be aligned below */
 +	buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE));
 +
 +	if (!buf2) {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Cannot allocate " ULINTPF " bytes to extend file\n",
 +			buf_size + UNIV_PAGE_SIZE);
 +		return(FALSE);
 +	}
 +
 +	/* Align the buffer for possible raw i/o */
 +	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 +
 +	while (current_size < size) {
 +		ulint	n_bytes;
 +
 +		if (size - current_size < (os_offset_t) buf_size) {
 +			/* last, partial chunk */
 +			n_bytes = (ulint) (size - current_size);
 +		} else {
 +			n_bytes = buf_size;
 +		}
 +
 +		ret = os_file_write(name, file, buf, current_size, n_bytes);
 +		if (!ret) {
 +			break;
 +		}
 +
 +		current_size += n_bytes;
 +	}
 +
 +	free(buf2);
 +
 +	return(ret && os_file_flush(file));
 +}
 +
 +/***********************************************************************//**
 +Cuts a stdio stream's file off at the stream's current position.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_set_eof(
 +/*============*/
 +	FILE*		file)	/*!< in: file to be truncated */
 +{
 +#ifdef __WIN__
 +	return(SetEndOfFile((HANDLE) _get_osfhandle(fileno(file))));
 +#else /* __WIN__ */
 +	WAIT_ALLOW_WRITES();
 +
 +	/* ftruncate() yields 0 on success */
 +	return(ftruncate(fileno(file), ftell(file)) == 0);
 +#endif /* __WIN__ */
 +}
 +
 +#ifndef __WIN__
 +/***********************************************************************//**
 +Wrapper to fsync(2) that keeps retrying for as long as the call fails with
 +ENOLCK, sleeping 0.2 seconds between attempts and printing a message on the
 +first and then every 100th such failure. Every attempt is counted in
 +os_n_fsyncs. For any other outcome the result of fsync() is handed back
 +unchanged, with errno as fsync() left it.
 +@return	0 if success, -1 otherwise */
 +
 +static
 +int
 +os_file_fsync(
 +/*==========*/
 +	os_file_t	file)	/*!< in: handle to a file */
 +{
 +	for (int n_enolck = 0;; n_enolck++) {
 +		const int	rc = fsync(file);
 +
 +		os_n_fsyncs++;
 +
 +		if (rc != -1 || errno != ENOLCK) {
 +			/* success, or an error we do not retry on */
 +			return(rc);
 +		}
 +
 +		if (n_enolck % 100 == 0) {
 +
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				" InnoDB: fsync(): "
 +				"No locks available; retrying\n");
 +		}
 +
 +		os_thread_sleep(200000 /* 0.2 sec */);
 +	}
 +}
 +#endif /* !__WIN__ */
 +
 +/***********************************************************************//**
 +NOTE! Use the corresponding macro os_file_flush(), not directly this function!
 +Flushes the write buffers of a given file to the disk.
 +Uses FlushFileBuffers() on Windows and os_file_fsync() elsewhere; on builds
 +with HAVE_DARWIN_THREADS, fcntl(F_FULLFSYNC) is tried first when
 +srv_have_fullfsync is set. A failure is tolerated (TRUE is returned) only
 +when srv_start_raw_disk_in_use is set and the OS reports
 +ERROR_INVALID_FUNCTION (Windows) or EINVAL (elsewhere); every other failure
 +goes to os_file_handle_error() and then to ut_error.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_flush_func(
 +/*===============*/
 +	os_file_t	file)	/*!< in: handle to a file; it is not closed
 +				here */
 +{
 +#ifdef __WIN__
 +	BOOL	ret;
 +
 +	os_n_fsyncs++;
 +
 +	ret = FlushFileBuffers(file);
 +
 +	if (ret) {
 +		return(TRUE);
 +	}
 +
 +	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
 +	actually a raw device, we choose to ignore that error if we are using
 +	raw disks */
 +
 +	if (srv_start_raw_disk_in_use && GetLastError()
 +	    == ERROR_INVALID_FUNCTION) {
 +		return(TRUE);
 +	}
 +
 +	os_file_handle_error(NULL, "flush");
 +
 +	/* It is a fatal error if a file flush does not succeed, because then
 +	the database can get corrupt on disk */
 +	ut_error;
 +
 +	return(FALSE);
 +#else
 +	int	ret;
 +	WAIT_ALLOW_WRITES();
 +
 +#if defined(HAVE_DARWIN_THREADS)
 +# ifndef F_FULLFSYNC
 +	/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
 +#  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
 +# elif F_FULLFSYNC != 51
 +#  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
 +# endif
 +	/* Apple has disabled fsync() for internal disk drives in OS X. That
 +	caused corruption for a user when he tested a power outage. Let us in
 +	OS X use a nonstandard flush method recommended by an Apple
 +	engineer. */
 +
 +	if (!srv_have_fullfsync) {
 +		/* If we are not on an operating system that supports this,
 +		then fall back to a plain fsync. */
 +
 +		ret = os_file_fsync(file);
 +	} else {
 +		ret = fcntl(file, F_FULLFSYNC, NULL);
 +
 +		if (ret) {
 +			/* If we are not on a file system that supports this,
 +			then fall back to a plain fsync. */
 +			ret = os_file_fsync(file);
 +		}
 +	}
 +#else
 +	ret = os_file_fsync(file);
 +#endif
 +
 +	if (ret == 0) {
 +		return(TRUE);
 +	}
 +
 +	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
 +	we choose to ignore that error if we are using raw disks */
 +
 +	if (srv_start_raw_disk_in_use && errno == EINVAL) {
 +
 +		return(TRUE);
 +	}
 +
 +	ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
 +
 +	os_file_handle_error(NULL, "flush");
 +
 +	/* It is a fatal error if a file flush does not succeed, because then
 +	the database can get corrupt on disk */
 +	ut_error;
 +
 +	return(FALSE);
 +#endif
 +}
 +
 +#ifndef __WIN__
 +/*******************************************************************//**
 +Does a synchronous read operation in Posix.
 +Increments os_n_file_reads, and brackets the system call with
 +MONITOR_ATOMIC_INC_LOW / MONITOR_ATOMIC_DEC_LOW on MONITOR_OS_PENDING_READS,
 +using a monitor-on flag sampled once up front. Uses pread() when HAVE_PREAD
 +is defined; otherwise lseek() followed by read(), protected by one of the
 +os_file_seek_mutexes except in UNIV_HOTBACKUP builds.
 +@return	number of bytes read, -1 if error */
 +static MY_ATTRIBUTE((nonnull, warn_unused_result))
 +ssize_t
 +os_file_pread(
 +/*==========*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read */
 +	ulint		n,	/*!< in: number of bytes to read */
 +	os_offset_t	offset)	/*!< in: file offset from where to read */
 +{
 +	off_t	offs;
 +
 +	ut_ad(n);
 +
 +	/* If off_t is > 4 bytes in size, then we assume we can pass a
 +	64-bit address. With a smaller off_t, an offset that does not survive
 +	the cast is only logged below; the read is still issued with the
 +	truncated value. */
 +	offs = (off_t) offset;
 +
 +	if (sizeof(off_t) <= 4) {
 +		if (offset != (os_offset_t) offs) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"File read at offset > 4 GB");
 +		}
 +	}
 +
 +	os_n_file_reads++;
 +
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
 +
 +#ifdef HAVE_PREAD
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +	ssize_t n_bytes = pread(file, buf, n, offs);
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +	return(n_bytes);
 +#else
 +	{
 +		off_t	ret_offset;
 +		ssize_t	ret;
 +#ifndef UNIV_HOTBACKUP
 +		ulint	i;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +#ifndef UNIV_HOTBACKUP
 +		/* Protect the seek / read operation with a mutex; the mutex
 +		is picked by the handle value modulo the number of seek
 +		mutexes */
 +		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +		os_mutex_enter(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		ret_offset = lseek(file, offs, SEEK_SET);
 +
 +		if (ret_offset < 0) {
 +			ret = -1;
 +		} else {
 +			ret = read(file, buf, (ssize_t) n);
 +		}
 +
 +#ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +		return(ret);
 +	}
 +#endif
 +}
 +
 +/*******************************************************************//**
 +Does a synchronous write operation in Posix.
 +Increments os_n_file_writes, and brackets the system call with
 +MONITOR_ATOMIC_INC_LOW / MONITOR_ATOMIC_DEC_LOW on MONITOR_OS_PENDING_WRITES,
 +using a monitor-on flag sampled once up front. Uses pwrite() when
 +HAVE_PWRITE is defined; otherwise lseek() followed by write(), protected by
 +one of the os_file_seek_mutexes except in UNIV_HOTBACKUP builds. Debug
 +builds assert that srv_read_only_mode is off.
 +@return	number of bytes written, -1 if error */
 +static MY_ATTRIBUTE((nonnull, warn_unused_result))
 +ssize_t
 +os_file_pwrite(
 +/*===========*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	const void*	buf,	/*!< in: buffer from where to write */
 +	ulint		n,	/*!< in: number of bytes to write */
 +	os_offset_t	offset)	/*!< in: file offset where to write */
 +{
 +	ssize_t	ret;
 +	off_t	offs;
 +
 +	ut_ad(n);
 +	ut_ad(!srv_read_only_mode);
 +
 +	/* If off_t is > 4 bytes in size, then we assume we can pass a
 +	64-bit address. With a smaller off_t, an offset that does not survive
 +	the cast is only logged below; the write is still issued with the
 +	truncated value. */
 +	offs = (off_t) offset;
 +
 +	if (sizeof(off_t) <= 4) {
 +		if (offset != (os_offset_t) offs) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"File write at offset > 4 GB.");
 +		}
 +	}
 +
 +	os_n_file_writes++;
 +
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
 +#ifdef HAVE_PWRITE
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +	ret = pwrite(file, buf, (ssize_t) n, offs);
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +	return(ret);
 +#else
 +	{
 +		off_t	ret_offset;
 +# ifndef UNIV_HOTBACKUP
 +		ulint	i;
 +# endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +# ifndef UNIV_HOTBACKUP
 +		/* Protect the seek / write operation with a mutex; the mutex
 +		is picked by the handle value modulo the number of seek
 +		mutexes */
 +		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +		os_mutex_enter(os_file_seek_mutexes[i]);
 +# endif /* UNIV_HOTBACKUP */
 +
 +		ret_offset = lseek(file, offs, SEEK_SET);
 +
 +		if (ret_offset < 0) {
 +			ret = -1;
 +
 +			goto func_exit;
 +		}
 +
 +		ret = write(file, buf, (ssize_t) n);
 +
 +func_exit:
 +# ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +# endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +		return(ret);
 +	}
 +#endif /* HAVE_PWRITE */
 +}
 +#endif
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_file_read(), not directly this
 +function!
 +Requests a synchronous positioned read operation.
 +Only a read of exactly n bytes counts as success. A failed or short read is
 +handed to os_file_handle_error(); if that asks for a retry the whole read is
 +repeated, otherwise a message is printed to stderr and ut_error is invoked
 +ahead of the trailing return(FALSE). On Windows the read is a
 +SetFilePointer() + ReadFile() pair under one of the os_file_seek_mutexes
 +(no mutex in UNIV_HOTBACKUP builds); elsewhere it goes through
 +os_file_pread().
 +@return	TRUE if request was successful, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_file_read_func(
 +/*==============*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read */
 +	os_offset_t	offset,	/*!< in: file offset where to read */
 +	ulint		n)	/*!< in: number of bytes to read */
 +{
 +#ifdef __WIN__
 +	BOOL		ret;
 +	DWORD		len;
 +	DWORD		ret2;
 +	DWORD		low;
 +	DWORD		high;
 +	ibool		retry;
 +#ifndef UNIV_HOTBACKUP
 +	ulint		i;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 +	no more than 32 bits. */
 +	ut_a((n & 0xFFFFFFFFUL) == n);
 +
 +	os_n_file_reads++;
 +	os_bytes_read_since_printout += n;
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
 +
 +try_again:
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +
 +	low = (DWORD) offset & 0xFFFFFFFF;
 +	high = (DWORD) (offset >> 32);
 +
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +#ifndef UNIV_HOTBACKUP
 +	/* Protect the seek / read operation with a mutex */
 +	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +	os_mutex_enter(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	ret2 = SetFilePointer(
 +		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
 +
 +	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 +
 +#ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +		goto error_handling;
 +	}
 +
 +	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
 +
 +#ifndef UNIV_HOTBACKUP
 +	os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	if (ret && len == n) {
 +		return(TRUE);
 +	}
 +#else /* __WIN__ */
 +	ibool	retry;
 +	ssize_t	ret;
 +
 +	os_bytes_read_since_printout += n;
 +
 +try_again:
 +	ret = os_file_pread(file, buf, n, offset);
 +
 +	if ((ulint) ret == n) {
 +		return(TRUE);
 +	} else if (ret == -1) {
 +                ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Error in system call pread(). The operating"
 +			" system error number is %lu.",(ulint) errno);
 +        } else {
 +		/* Partial read occurred; it is treated as an error just
 +		like a failed read */
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Tried to read " ULINTPF " bytes at offset "
 +			UINT64PF ". Was only able to read %ld.",
 +			n, offset, (lint) ret);
 +	}
 +#endif /* __WIN__ */
 +#ifdef __WIN__
 +error_handling:
 +#endif
 +	retry = os_file_handle_error(NULL, "read");
 +
 +	if (retry) {
 +		goto try_again;
 +	}
 +
 +	fprintf(stderr,
 +		"InnoDB: Fatal error: cannot read from file."
 +		" OS error number %lu.\n",
 +#ifdef __WIN__
 +		(ulong) GetLastError()
 +#else
 +		(ulong) errno
 +#endif /* __WIN__ */
 +		);
 +	fflush(stderr);
 +
 +	ut_error;
 +
 +	return(FALSE);
 +}
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_file_read_no_error_handling(),
 +not directly this function!
 +Requests a synchronous positioned read operation. Unlike
 +os_file_read_func(), this function does not end in ut_error: a failed or
 +short read is passed to os_file_handle_error_no_exit(), the read is repeated
 +if that asks for a retry, and otherwise FALSE is returned. On the non-Windows
 +path the failure is still written to the log first.
 +@return	TRUE if request was successful, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_file_read_no_error_handling_func(
 +/*================================*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read */
 +	os_offset_t	offset,	/*!< in: file offset where to read */
 +	ulint		n)	/*!< in: number of bytes to read */
 +{
 +#ifdef __WIN__
 +	BOOL		ret;
 +	DWORD		len;
 +	DWORD		ret2;
 +	DWORD		low;
 +	DWORD		high;
 +	ibool		retry;
 +#ifndef UNIV_HOTBACKUP
 +	ulint		i;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 +	no more than 32 bits. */
 +	ut_a((n & 0xFFFFFFFFUL) == n);
 +
 +	os_n_file_reads++;
 +	os_bytes_read_since_printout += n;
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
 +
 +try_again:
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +
 +	low = (DWORD) offset & 0xFFFFFFFF;
 +	high = (DWORD) (offset >> 32);
 +
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +#ifndef UNIV_HOTBACKUP
 +	/* Protect the seek / read operation with a mutex */
 +	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +	os_mutex_enter(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	ret2 = SetFilePointer(
 +		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
 +
 +	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 +
 +#ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +		goto error_handling;
 +	}
 +
 +	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
 +
 +#ifndef UNIV_HOTBACKUP
 +	os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	if (ret && len == n) {
 +		return(TRUE);
 +	}
 +#else /* __WIN__ */
 +	ibool	retry;
 +	ssize_t	ret;
 +
 +	os_bytes_read_since_printout += n;
 +
 +try_again:
 +	ret = os_file_pread(file, buf, n, offset);
 +
 +	if ((ulint) ret == n) {
 +		return(TRUE);
 +	} else if (ret == -1) {
 +                ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Error in system call pread(). The operating"
 +			" system error number is %lu.",(ulint) errno);
 +        } else {
 +		/* Partial read occurred; it is treated as an error just
 +		like a failed read */
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Tried to read " ULINTPF " bytes at offset "
 +			UINT64PF ". Was only able to read %ld.",
 +			n, offset, (lint) ret);
 +	}
 +#endif /* __WIN__ */
 +#ifdef __WIN__
 +error_handling:
 +#endif
 +	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
 +
 +	if (retry) {
 +		goto try_again;
 +	}
 +
 +	return(FALSE);
 +}
 +
 +/*******************************************************************//**
 +Reads the beginning of a stdio stream into a caller-supplied buffer.
 +The stream is rewound first, then up to size - 1 bytes are read into str
 +and str is NUL-terminated. Nothing is done when size is 0. No error is
 +reported to the caller. */
 +UNIV_INTERN
 +void
 +os_file_read_string(
 +/*================*/
 +	FILE*	file,	/*!< in: file to read from */
 +	char*	str,	/*!< in: buffer where to read */
 +	ulint	size)	/*!< in: size of buffer */
 +{
 +	if (size > 0) {
 +		rewind(file);
 +
 +		/* Keep the last byte of the buffer for the terminator. */
 +		const size_t	n_read = fread(str, 1, size - 1, file);
 +
 +		str[n_read] = '\0';
 +	}
 +}
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_file_write(), not directly
 +this function!
 +Requests a synchronous write operation.
 +
 +On failure a diagnostic is printed to stderr the first time only
 +(guarded by os_has_said_disk_full). The operating system error code is
 +captured immediately after the failing call so that later calls (mutex
 +release, timestamp printing, logging) cannot overwrite it before it is
 +reported.
 +@return	TRUE if request was successful, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_file_write_func(
 +/*===============*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	os_file_t	file,	/*!< in: handle to a file */
 +	const void*	buf,	/*!< in: buffer from which to write */
 +	os_offset_t	offset,	/*!< in: file offset where to write */
 +	ulint		n)	/*!< in: number of bytes to write */
 +{
 +	ut_ad(!srv_read_only_mode);
 +#ifdef __WIN__
 +	BOOL		ret;
 +	DWORD		len;
 +	DWORD		ret2;
 +	DWORD		low;
 +	DWORD		high;
 +	ulint		n_retries	= 0;
 +	ulint		err;
 +	DWORD		saved_error = 0;
 +#ifndef UNIV_HOTBACKUP
 +	ulint		i;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 +	no more than 32 bits. */
 +	ut_a((n & 0xFFFFFFFFUL) == n);
 +
 +	os_n_file_writes++;
 +
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
 +retry:
 +	low = (DWORD) offset & 0xFFFFFFFF;
 +	high = (DWORD) (offset >> 32);
 +
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +#ifndef UNIV_HOTBACKUP
 +	/* Protect the seek / write operation with a mutex */
 +	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +	os_mutex_enter(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	ret2 = SetFilePointer(
 +		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
 +
 +	/* Remember the error code right away: the calls made below before
 +	the message is printed could change the thread's last-error value. */
 +	if (ret2 == 0xFFFFFFFF
 +	    && (saved_error = GetLastError()) != NO_ERROR) {
 +
 +#ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +		ut_print_timestamp(stderr);
 +
 +		fprintf(stderr,
 +			" InnoDB: Error: File pointer positioning to"
 +			" file %s failed at\n"
 +			"InnoDB: offset %llu. Operating system"
 +			" error number %lu.\n"
 +			"InnoDB: Some operating system error numbers"
 +			" are described at\n"
 +			"InnoDB: "
 +			REFMAN "operating-system-error-codes.html\n",
 +			name, offset, (ulong) saved_error);
 +
 +		return(FALSE);
 +	}
 +
 +	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
 +
 +	/* Capture the error code before releasing the mutex and updating
 +	the monitor counter, either of which may overwrite it. */
 +	saved_error = GetLastError();
 +
 +#ifndef UNIV_HOTBACKUP
 +	os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +	if (ret && len == n) {
 +
 +		return(TRUE);
 +	}
 +
 +	/* If some background file system backup tool is running, then, at
 +	least in Windows 2000, we may get here a specific error. Let us
 +	retry the operation 100 times, with 1 second waits. */
 +
 +	if (saved_error == ERROR_LOCK_VIOLATION && n_retries < 100) {
 +
 +		os_thread_sleep(1000000);
 +
 +		n_retries++;
 +
 +		goto retry;
 +	}
 +
 +	if (!os_has_said_disk_full) {
 +		char *winmsg = NULL;
 +
 +		err = (ulint) saved_error;
 +
 +		ut_print_timestamp(stderr);
 +
 +		fprintf(stderr,
 +			" InnoDB: Error: Write to file %s failed"
 +			" at offset %llu.\n"
 +			"InnoDB: %lu bytes should have been written,"
 +			" only %lu were written.\n"
 +			"InnoDB: Operating system error number %lu.\n"
 +			"InnoDB: Check that your OS and file system"
 +			" support files of this size.\n"
 +			"InnoDB: Check also that the disk is not full"
 +			" or a disk quota exceeded.\n",
 +			name, offset,
 +			(ulong) n, (ulong) len, (ulong) err);
 +
 +		/* Ask Windows to prepare a standard message for a
 +		GetLastError() */
 +
 +		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
 +			FORMAT_MESSAGE_FROM_SYSTEM |
 +			FORMAT_MESSAGE_IGNORE_INSERTS,
 +			NULL, saved_error,
 +			MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
 +			(LPSTR)&winmsg, 0, NULL);
 +
 +		if (winmsg) {
 +			fprintf(stderr,
 +				"InnoDB: FormatMessage: Error number %lu means '%s'.\n",
 +				(ulong) saved_error, winmsg);
 +			LocalFree(winmsg);
 +		}
 +
 +		if (strerror((int) err) != NULL) {
 +			fprintf(stderr,
 +				"InnoDB: Error number %lu means '%s'.\n",
 +				(ulong) err, strerror((int) err));
 +		}
 +
 +		fprintf(stderr,
 +			"InnoDB: Some operating system error numbers"
 +			" are described at\n"
 +			"InnoDB: "
 +			REFMAN "operating-system-error-codes.html\n");
 +
 +		os_has_said_disk_full = TRUE;
 +	}
 +
 +	return(FALSE);
 +#else
 +	ssize_t	ret;
 +	WAIT_ALLOW_WRITES();
 +
 +	ret = os_file_pwrite(file, buf, n, offset);
 +
 +	if ((ulint) ret == n) {
 +
 +		return(TRUE);
 +	}
 +
 +	/* Save errno now: the printing and logging calls below are free to
 +	modify it, which would make the reported error number wrong. */
 +	const int	saved_errno = errno;
 +
 +	if (!os_has_said_disk_full) {
 +		ut_print_timestamp(stderr);
 +
 +		if(ret == -1) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"Failure of system call pwrite(). Operating"
 +				" system error number is %lu.",
 +				(ulint) saved_errno);
 +		} else {
 +			fprintf(stderr,
 +				" InnoDB: Error: Write to file %s failed"
 +				" at offset " UINT64PF ".\n"
 +				"InnoDB: %lu bytes should have been written,"
 +				" only %ld were written.\n"
 +				"InnoDB: Operating system error number %lu.\n"
 +				"InnoDB: Check that your OS and file system"
 +				" support files of this size.\n"
 +				"InnoDB: Check also that the disk is not full"
 +				" or a disk quota exceeded.\n",
 +				name, offset, n, (lint) ret,
 +				(ulint) saved_errno);
 +		}
 +
 +		if (strerror(saved_errno) != NULL) {
 +			fprintf(stderr,
 +				"InnoDB: Error number %d means '%s'.\n",
 +				saved_errno, strerror(saved_errno));
 +		}
 +
 +		fprintf(stderr,
 +			"InnoDB: Some operating system error numbers"
 +			" are described at\n"
 +			"InnoDB: "
 +			REFMAN "operating-system-error-codes.html\n");
 +
 +		os_has_said_disk_full = TRUE;
 +	}
 +
 +	return(FALSE);
 +#endif
 +}
 +
 +/*******************************************************************//**
 +Determines whether a path exists and, if it does, what kind of object it
 +names. A stat failure with ENOENT, ENOTDIR or ENAMETOOLONG is treated as
 +"does not exist" and is still a successful call; in that case *type is
 +left untouched. Any other stat failure is reported through
 +os_file_handle_error_no_exit() and makes the call fail.
 +@return	TRUE if call succeeded */
 +UNIV_INTERN
 +ibool
 +os_file_status(
 +/*===========*/
 +	const char*	path,	/*!< in: pathname of the file */
 +	ibool*		exists,	/*!< out: TRUE if file exists */
 +	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
 +{
 +#ifdef __WIN__
 +	struct _stat64	statinfo;
 +
 +	if (_stat64(path, &statinfo) != 0) {
 +
 +		if (errno == ENOENT || errno == ENOTDIR
 +		    || errno == ENAMETOOLONG) {
 +			/* file does not exist */
 +			*exists = FALSE;
 +			return(TRUE);
 +		}
 +
 +		/* file exists, but stat call failed */
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	if (_S_IFDIR & statinfo.st_mode) {
 +		*type = OS_FILE_TYPE_DIR;
 +	} else if (_S_IFREG & statinfo.st_mode) {
 +		*type = OS_FILE_TYPE_FILE;
 +	} else {
 +		*type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +
 +	*exists = TRUE;
 +
 +	return(TRUE);
 +#else
 +	struct stat	statinfo;
 +
 +	if (stat(path, &statinfo) != 0) {
 +
 +		if (errno == ENOENT || errno == ENOTDIR
 +		    || errno == ENAMETOOLONG) {
 +			/* file does not exist */
 +			*exists = FALSE;
 +			return(TRUE);
 +		}
 +
 +		/* file exists, but stat call failed */
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	if (S_ISDIR(statinfo.st_mode)) {
 +		*type = OS_FILE_TYPE_DIR;
 +	} else if (S_ISLNK(statinfo.st_mode)) {
 +		*type = OS_FILE_TYPE_LINK;
 +	} else if (S_ISREG(statinfo.st_mode)) {
 +		*type = OS_FILE_TYPE_FILE;
 +	} else {
 +		*type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +
 +	*exists = TRUE;
 +
 +	return(TRUE);
 +#endif
 +}
 +
 +/*******************************************************************//**
 +This function returns information about the specified file.
 +It fills in the type, the three timestamps and the size of stat_info from
 +the result of stat (or _stat64 on Windows). When check_rw_perm is set and
 +the path is classified as a file, it additionally tries to open the path
 +and records the outcome in stat_info->rw_perm; otherwise rw_perm is not
 +written by this function.
 +@return	DB_SUCCESS if all OK, DB_NOT_FOUND if stat failed with ENOENT or
 +ENOTDIR, DB_FAIL on any other stat failure */
 +UNIV_INTERN
 +dberr_t
 +os_file_get_status(
 +/*===============*/
 +	const char*	path,		/*!< in:	pathname of the file */
 +	os_file_stat_t* stat_info,	/*!< information of a file in a
 +					directory */
 +	bool		check_rw_perm)	/*!< in: for testing whether the
 +					file can be opened in RW mode */
 +{
 +	int		ret;
 +
 +#ifdef __WIN__
 +	struct _stat64	statinfo;
 +
 +	ret = _stat64(path, &statinfo);
 +
 +	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 +		/* file does not exist */
 +
 +		return(DB_NOT_FOUND);
 +
 +	} else if (ret) {
 +		/* file exists, but stat call failed */
 +
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(DB_FAIL);
 +
 +	} else if (_S_IFDIR & statinfo.st_mode) {
 +		stat_info->type = OS_FILE_TYPE_DIR;
 +	} else if (_S_IFREG & statinfo.st_mode) {
 +
 +		/* Read access is always requested; write access is added
 +		unless the server runs in read-only mode. */
 +		DWORD	access = GENERIC_READ;
 +
 +		if (!srv_read_only_mode) {
 +			access |= GENERIC_WRITE;
 +		}
 +
 +		stat_info->type = OS_FILE_TYPE_FILE;
 +
 +		/* Check if we can open it with the access mode computed
 +		above (read-write, or read-only in read-only mode). */
 +
 +		if (check_rw_perm) {
 +			HANDLE	fh;
 +
 +			fh = CreateFile(
 +				(LPCTSTR) path,		// File to open
 +				access,
 +				0,			// No sharing
 +				NULL,			// Default security
 +				OPEN_EXISTING,		// Existing file only
 +				FILE_ATTRIBUTE_NORMAL,	// Normal file
 +				NULL);			// No attr. template
 +
 +			if (fh == INVALID_HANDLE_VALUE) {
 +				stat_info->rw_perm = false;
 +			} else {
 +				stat_info->rw_perm = true;
 +				CloseHandle(fh);
 +			}
 +		}
 +	} else {
 +		stat_info->type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +#else
 +	struct stat	statinfo;
 +
 +	ret = stat(path, &statinfo);
 +
 +	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 +		/* file does not exist */
 +
 +		return(DB_NOT_FOUND);
 +
 +	} else if (ret) {
 +		/* file exists, but stat call failed */
 +
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(DB_FAIL);
 +
 +	}
 +
 +	switch (statinfo.st_mode & S_IFMT) {
 +	case S_IFDIR:
 +		stat_info->type = OS_FILE_TYPE_DIR;
 +		break;
 +	case S_IFLNK:
 +		stat_info->type = OS_FILE_TYPE_LINK;
 +		break;
 +	case S_IFBLK:
 +		/* Handle block device as regular file. */
 +	case S_IFCHR:
 +		/* Handle character device as regular file. */
 +	case S_IFREG:
 +		stat_info->type = OS_FILE_TYPE_FILE;
 +		break;
 +	default:
 +		stat_info->type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +
 +
 +	if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
 +
 +		int	fh;
 +		int	access;
 +
 +		/* Probe with read-write access unless the server runs in
 +		read-only mode, in which case read access is enough. */
 +		access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
 +
- 		fh = ::open(path, access, os_innodb_umask);
++		fh = ::open(path, access | O_CLOEXEC, os_innodb_umask);
 +
 +		if (fh == -1) {
 +			stat_info->rw_perm = false;
 +		} else {
 +			stat_info->rw_perm = true;
 +			/* The descriptor was only needed for the probe. */
 +			close(fh);
 +		}
 +	}
 +
 +#endif /* __WIN__ */
 +
 +	/* Fields common to both platforms are copied from the stat result. */
 +	stat_info->ctime = statinfo.st_ctime;
 +	stat_info->atime = statinfo.st_atime;
 +	stat_info->mtime = statinfo.st_mtime;
 +	stat_info->size  = statinfo.st_size;
 +
 +	return(DB_SUCCESS);
 +}
 +
 +/* Path name separator character: a backslash when __WIN__ is defined,
 +a forward slash otherwise. */
 +#ifdef __WIN__
 +#  define OS_FILE_PATH_SEPARATOR	'\\'
 +#else
 +#  define OS_FILE_PATH_SEPARATOR	'/'
 +#endif
 +
 +/****************************************************************//**
 +Builds a new path by keeping the directory part of old_path and replacing
 +its last component with "<basename>.ibd", where <basename> is the part of
 +tablename that follows its last '/' (or all of tablename if it contains
 +no '/'). Both input strings are null terminated.
 +
 +The result is allocated with mem_alloc(). It is the callers
 +responsibility to free the return value after it is no longer needed.
 +
 +@return	own: new full pathname */
 +UNIV_INTERN
 +char*
 +os_file_make_new_pathname(
 +/*======================*/
 +	const char*	old_path,	/*!< in: pathname */
 +	const char*	tablename)	/*!< in: contains new base name */
 +{
 +	/* The new base name is whatever follows the '/' that separates
 +	the database name from the table name. */
 +	const char*	name_sep = strrchr(tablename, '/');
 +	const char*	new_base = (name_sep != NULL)
 +		? name_sep + 1 : tablename;
 +
 +	/* The directory part of old_path ends at its last separator; the
 +	old basename.ibd after it is dropped. */
 +	const char*	dir_sep = strrchr(old_path, OS_FILE_PATH_SEPARATOR);
 +	const ulint	dir_len = (dir_sep != NULL)
 +		? static_cast<ulint>(dir_sep - old_path)
 +		: strlen(old_path);
 +
 +	/* sizeof "/.ibd" accounts for the separator, the extension and
 +	the terminating NUL. */
 +	const ulint	new_path_len = dir_len + strlen(new_base)
 +		+ sizeof "/.ibd";
 +	char*		new_path = static_cast<char*>(
 +		mem_alloc(new_path_len));
 +
 +	memcpy(new_path, old_path, dir_len);
 +
 +	ut_snprintf(new_path + dir_len,
 +		    new_path_len - dir_len,
 +		    "%c%s.ibd",
 +		    OS_FILE_PATH_SEPARATOR,
 +		    new_base);
 +
 +	return(new_path);
 +}
 +
 +/****************************************************************//**
 +Builds a remote path name from a DATA DIRECTORY path and a table name of
 +the form 'database/tablename'. Everything after the last separator of
 +data_dir_path is dropped, then a separator, the complete tablename, a
 +'.' and the extension are appended. The result is finally passed through
 +srv_normalize_path_for_win(). All input and output strings are
 +null-terminated.
 +
 +The result is allocated with mem_alloc(). It is the callers
 +responsibility to free the return value after it is no longer needed.
 +
 +@return	own: A full pathname; data_dir_path/databasename/tablename.ibd */
 +UNIV_INTERN
 +char*
 +os_file_make_remote_pathname(
 +/*=========================*/
 +	const char*	data_dir_path,	/*!< in: pathname */
 +	const char*	tablename,	/*!< in: tablename */
 +	const char*	extention)	/*!< in: file extention; ibd,cfg */
 +{
 +	ut_ad(extention && strlen(extention) == 3);
 +
 +	/* Keep only the part of data_dir_path that precedes its last
 +	separator; the trailing component is replaced below. */
 +	const char*	dir_sep = strrchr(data_dir_path,
 +					  OS_FILE_PATH_SEPARATOR);
 +	const ulint	dir_len = (dir_sep != NULL)
 +		? static_cast<ulint>(dir_sep - data_dir_path)
 +		: strlen(data_dir_path);
 +
 +	/* sizeof "/." accounts for the separator, the dot and the
 +	terminating NUL. */
 +	const ulint	path_len = dir_len + strlen(tablename)
 +		+ sizeof "/." + strlen(extention);
 +	char*		remote_path = static_cast<char*>(mem_alloc(path_len));
 +
 +	memcpy(remote_path, data_dir_path, dir_len);
 +
 +	ut_snprintf(remote_path + dir_len,
 +		    path_len - dir_len,
 +		    "%c%s.%s",
 +		    OS_FILE_PATH_SEPARATOR,
 +		    tablename,
 +		    extention);
 +
 +	srv_normalize_path_for_win(remote_path);
 +
 +	return(remote_path);
 +}
 +
 +/****************************************************************//**
 +Reduces, in place, a null-terminated full remote path name ending in
 +'databasename/tablename.ibd' to the path used for the DATA DIRECTORY
 +clause, i.e. the same path with that tail replaced by just 'tablename'.
 +
 +The result is never longer than the input, so no memory is allocated;
 +the caller owns the buffer.
 +
 +The function returns as soon as an expected '.' or separator is missing.
 +Note that terminators written before that point stay in the buffer. The
 +result is used to inform a SHOW CREATE TABLE command. */
 +UNIV_INTERN
 +void
 +os_file_make_data_dir_path(
 +/*========================*/
 +	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
 +{
 +	/* Cut the extension off at the last period. */
 +	char*	dot = strrchr(data_dir_path, '.');
 +
 +	if (dot == NULL) {
 +		return;
 +	}
 +
 +	*dot = '\0';
 +
 +	/* Cut at the last separator; the tablename follows it. */
 +	char*	table_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
 +
 +	if (table_sep == NULL) {
 +		return;
 +	}
 +
 +	*table_sep = '\0';
 +
 +	char*	tablename = table_sep + 1;
 +
 +	/* The databasename follows what is now the last separator. */
 +	char*	db_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
 +
 +	if (db_sep == NULL) {
 +		return;
 +	}
 +
 +	/* Overwrite the databasename with the tablename. The regions may
 +	overlap, hence the memmove. */
 +	char*		dest = db_sep + 1;
 +	const ulint	tablename_len = ut_strlen(tablename);
 +
 +	ut_memmove(dest, tablename, tablename_len);
 +
 +	dest[tablename_len] = '\0';
 +}
 +
 +/****************************************************************//**
 +Returns the directory component of a null-terminated pathname string:
 +the part of path that precedes its last OS_FILE_PATH_SEPARATOR.
 +
 +If path contains no separator, the string "." is returned. If the only
 +separator found is the first character of path, the string "/" is
 +returned.
 +
 +The return value is a copy allocated from heap. It is the caller
 +responsibility to free it after it is no longer needed.
 +
 +Examples:
 +
 +       path	      dirname
 +       "/usr/lib"     "/usr"
 +       "usr"	      "."
 +       "/"	      "/"
 +       "."	      "."
 +       ".."	      "."
 +
 +NOTE(review): trailing separators are not stripped here, so "/usr/"
 +yields "/usr" rather than the "/" that SUSv2 dirname would give.
 +
 +@return	own: directory component of the pathname */
 +UNIV_INTERN
 +char*
 +os_file_dirname(
 +/*============*/
 +	const char*	path)	/*!< in: pathname */
 +{
 +	const char*	sep = strrchr(path, OS_FILE_PATH_SEPARATOR);
 +
 +	if (sep == NULL) {
 +		/* No separator at all: the directory is the cwd. */
 +		return(mem_strdup("."));
 +	} else if (sep == path) {
 +		/* The only separator is the leading one. */
 +		return(mem_strdup("/"));
 +	} else {
 +		/* Copy everything up to, not including, the separator. */
 +		return(mem_strdupl(path, sep - path));
 +	}
 +}
 +
 +/****************************************************************//**
 +Creates all missing directories leading up to the last component of the
 +given path, recursing from the innermost missing directory outwards.
 +Refuses to do anything in read-only mode.
 +@return	TRUE if call succeeded FALSE otherwise */
 +UNIV_INTERN
 +ibool
 +os_file_create_subdirs_if_needed(
 +/*=============================*/
 +	const char*	path)	/*!< in: path name */
 +{
 +	if (srv_read_only_mode) {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"read only mode set. Can't create subdirectories '%s'",
 +			path);
 +
 +		return(FALSE);
 +	}
 +
 +	char*	subdir = os_file_dirname(path);
 +	ibool	success = TRUE;
 +
 +	/* A one-character result that is the separator or '.' means root
 +	or cwd: there is nothing to create. */
 +	const bool	is_root_or_cwd = strlen(subdir) == 1
 +		&& (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.');
 +
 +	if (!is_root_or_cwd) {
 +		os_file_type_t	type;
 +		ibool		subdir_exists;
 +
 +		success = os_file_status(subdir, &subdir_exists, &type);
 +
 +		if (success && !subdir_exists) {
 +			/* Make sure the parents exist before creating
 +			subdir itself. */
 +			if (os_file_create_subdirs_if_needed(subdir)) {
 +				success = os_file_create_directory(
 +					subdir, FALSE);
 +			} else {
 +				success = FALSE;
 +			}
 +		}
 +	}
 +
 +	mem_free(subdir);
 +
 +	return(success);
 +}
 +
 +#ifndef UNIV_HOTBACKUP
 +/****************************************************************//**
 +Looks up a slot of an aio array by position. The index is asserted to be
 +below array->n_slots.
 +@return	pointer to slot */
 +static
 +os_aio_slot_t*
 +os_aio_array_get_nth_slot(
 +/*======================*/
 +	os_aio_array_t*		array,	/*!< in: aio array */
 +	ulint			index)	/*!< in: index of the slot */
 +{
 +	ut_a(index < array->n_slots);
 +
 +	return(array->slots + index);
 +}
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/******************************************************************//**
 +Creates an io_context for native linux AIO by calling io_setup().
 +A failure with EAGAIN is retried up to OS_AIO_IO_SETUP_RETRY_ATTEMPTS
 +times, sleeping OS_AIO_IO_SETUP_RETRY_SLEEP between attempts. Any other
 +failure, or running out of retries, is reported on stderr.
 +@return	TRUE on success. */
 +static
 +ibool
 +os_aio_linux_create_io_ctx(
 +/*=======================*/
 +	ulint		max_events,	/*!< in: number of events. */
 +	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
 +{
 +	int	ret;
 +	ulint	n_attempts = 0;
 +
 +	for (;;) {
 +		memset(io_ctx, 0x0, sizeof(*io_ctx));
 +
 +		/* Initialize the io_ctx. Tell it how many pending
 +		IO requests this context will handle. */
 +
 +		ret = io_setup(max_events, io_ctx);
 +
 +		if (ret == 0) {
 +#if defined(UNIV_AIO_DEBUG)
 +			fprintf(stderr,
 +				"InnoDB: Linux native AIO:"
 +				" initialized io_ctx for segment\n");
 +#endif
 +			/* Success. Return now. */
 +			return(TRUE);
 +		}
 +
 +		/* Only EAGAIN is worth another attempt. */
 +		if (ret != -EAGAIN) {
 +			break;
 +		}
 +
 +		if (n_attempts == 0) {
 +			/* First time around. */
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				" InnoDB: Warning: io_setup() failed"
 +				" with EAGAIN. Will make %d attempts"
 +				" before giving up.\n",
 +				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
 +		}
 +
 +		if (n_attempts >= OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
 +			/* Have tried enough. Better call it a day. */
 +			break;
 +		}
 +
 +		++n_attempts;
 +		fprintf(stderr,
 +			"InnoDB: Warning: io_setup() attempt"
 +			" %lu failed.\n",
 +			n_attempts);
 +		os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
 +	}
 +
 +	/* Report why we are giving up. */
 +	ut_print_timestamp(stderr);
 +
 +	if (ret == -EAGAIN) {
 +		fprintf(stderr,
 +			" InnoDB: Error: io_setup() failed"
 +			" with EAGAIN after %d attempts.\n",
 +			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
 +	} else if (ret == -ENOSYS) {
 +		fprintf(stderr,
 +			" InnoDB: Error: Linux Native AIO interface"
 +			" is not supported on this platform. Please"
 +			" check your OS documentation and install"
 +			" appropriate binary of InnoDB.\n");
 +	} else {
 +		fprintf(stderr,
 +			" InnoDB: Error: Linux Native AIO setup"
 +			" returned following error[%d]\n", -ret);
 +	}
 +
 +	fprintf(stderr,
 +		"InnoDB: You can disable Linux Native AIO by"
 +		" setting innodb_use_native_aio = 0 in my.cnf\n");
 +	return(FALSE);
 +}
 +
 +/******************************************************************//**
 +Checks if the system supports native linux aio. On some kernel
 +versions where native aio is supported it won't work on tmpfs. In such
 +cases we can't use native aio as it is not possible to mix simulated
 +and native aio.
 +
 +The check submits one request on a throw-away io context: a one page
 +write to a temporary file, or, in read-only mode, a 512 byte read from
 +ib_logfile0 in the log group home directory. The io context, the file
 +descriptor and the buffer used for the probe are all released before
 +returning.
 +@return: TRUE if supported, FALSE otherwise. */
 +static
 +ibool
 +os_aio_native_aio_supported(void)
 +/*=============================*/
 +{
 +	int			fd;
 +	io_context_t		io_ctx;
 +	char			name[1000];
 +
 +	if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
 +		/* The platform does not support native aio. */
 +		return(FALSE);
 +	} else if (!srv_read_only_mode) {
 +		/* Now check if tmpdir supports native aio ops. */
 +		fd = innobase_mysql_tmpfile(NULL);
 +
 +		if (fd < 0) {
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Unable to create temp file to check "
 +				"native AIO support.");
 +
 +			/* Do not leak the probe context. */
 +			io_destroy(io_ctx);
 +
 +			return(FALSE);
 +		}
 +	} else {
 +
 +		srv_normalize_path_for_win(srv_log_group_home_dir);
 +
 +		ulint	dirnamelen = strlen(srv_log_group_home_dir);
 +		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
 +		memcpy(name, srv_log_group_home_dir, dirnamelen);
 +
 +		/* Add a path separator if needed. */
 +		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
 +			name[dirnamelen++] = SRV_PATH_SEPARATOR;
 +		}
 +
 +		strcpy(name + dirnamelen, "ib_logfile0");
 +
 +		fd = ::open(name, O_RDONLY | O_CLOEXEC);
 +
 +		if (fd == -1) {
 +
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Unable to open \"%s\" to check "
 +				"native AIO read support.", name);
 +
 +			/* Do not leak the probe context. */
 +			io_destroy(io_ctx);
 +
 +			return(FALSE);
 +		}
 +	}
 +
 +	struct io_event	io_event;
 +
 +	memset(&io_event, 0x0, sizeof(io_event));
 +
 +	/* Allocate two pages so that a page-aligned pointer of one full
 +	page can be carved out of the buffer. */
 +	byte*	buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
 +	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
 +
 +	struct iocb	iocb;
 +
 +	/* Suppress valgrind warning. */
 +	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
 +	memset(&iocb, 0x0, sizeof(iocb));
 +
 +	struct iocb*	p_iocb = &iocb;
 +
 +	if (!srv_read_only_mode) {
 +		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
 +	} else {
 +		ut_a(UNIV_PAGE_SIZE >= 512);
 +		io_prep_pread(p_iocb, fd, ptr, 512, 0);
 +	}
 +
 +	int	err = io_submit(io_ctx, 1, &p_iocb);
 +
 +	if (err >= 1) {
 +		/* Now collect the submitted IO request. */
 +		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
 +	}
 +
 +	/* The probe is over: release everything it used, including the
 +	io context obtained from io_setup(). */
 +	io_destroy(io_ctx);
 +	ut_free(buf);
 +	close(fd);
 +
 +	switch (err) {
 +	case 1:
 +		return(TRUE);
 +
 +	case -EINVAL:
 +	case -ENOSYS:
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Linux Native AIO not supported. You can either "
 +			"move %s to a file system that supports native "
 +			"AIO or you can set innodb_use_native_aio to "
 +			"FALSE to avoid this message.",
 +			srv_read_only_mode ? name : "tmpdir");
 +
 +		/* fall through. */
 +	default:
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Linux Native AIO check on %s returned error[%d]",
 +			srv_read_only_mode ? name : "tmpdir", -err);
 +	}
 +
 +	return(FALSE);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +/******************************************************************//**
 +Creates an aio wait array with n slots split into n_segments segments.
 +The whole slot array is zero-filled before the per-slot fields are set.
 +When Linux native AIO is in use, one io_context per segment and one
 +io_event per slot are set up as well; if creating an io_context fails,
 +native AIO is switched off (srv_use_native_aio = FALSE) and the array is
 +still returned.
 +We don't care about freeing memory here because we assume that a
 +failure will result in server refusing to start up.
 +@return	own: aio array, NULL on failure */
 +static
 +os_aio_array_t*
 +os_aio_array_create(
 +/*================*/
 +	ulint	n,		/*!< in: maximum number of pending aio
 +				operations allowed; n must be
 +				divisible by n_segments */
 +	ulint	n_segments)	/*!< in: number of segments in the aio array */
 +{
 +	os_aio_array_t*	array;
 +#ifdef WIN_ASYNC_IO
 +	OVERLAPPED*	over;
 +#elif defined(LINUX_NATIVE_AIO)
 +	struct io_event*	io_event = NULL;
 +#endif /* WIN_ASYNC_IO */
 +	ut_a(n > 0);
 +	ut_a(n_segments > 0);
 +
 +	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
 +	memset(array, 0x0, sizeof(*array));
 +
 +	array->mutex = os_mutex_create();
 +	array->not_full = os_event_create();
 +	array->is_empty = os_event_create();
 +
 +	os_event_set(array->is_empty);
 +
 +	array->n_slots = n;
 +	array->n_segments = n_segments;
 +
 +	array->slots = static_cast<os_aio_slot_t*>(
 +		ut_malloc(n * sizeof(*array->slots)));
 +
 +	/* Clear all n slots. The length must be the byte size of the
 +	array itself, not the size of the expression's type. */
 +	memset(array->slots, 0x0, n * sizeof(*array->slots));
 +#ifdef __WIN__
 +	array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
 +#endif /* __WIN__ */
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	array->aio_ctx = NULL;
 +	array->aio_events = NULL;
 +
 +	/* If we are not using native aio interface then skip this
 +	part of initialization. */
 +	if (!srv_use_native_aio) {
 +		goto skip_native_aio;
 +	}
 +
 +	/* Initialize the io_context array. One io_context
 +	per segment in the array. */
 +
 +	array->aio_ctx = static_cast<io_context**>(
 +		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
 +
 +	for (ulint i = 0; i < n_segments; ++i) {
 +		if (!os_aio_linux_create_io_ctx(n/n_segments,
 +						&array->aio_ctx[i])) {
 +			/* If something bad happened during aio setup
 +			we disable linux native aio.
 +			The disadvantage will be a small memory leak
 +			at shutdown but that's ok compared to a crash
 +			or a not working server.
 +			This frequently happens when running the test suite
 +			with many threads on a system with low fs.aio-max-nr!
 +			*/
 +
 +			fprintf(stderr,
 +				"  InnoDB: Warning: Linux Native AIO disabled "
 +				"because os_aio_linux_create_io_ctx() "
 +				"failed. To get rid of this warning you can "
 +				"try increasing system "
 +				"fs.aio-max-nr to 1048576 or larger or "
 +				"setting innodb_use_native_aio = 0 in my.cnf\n");
 +			srv_use_native_aio = FALSE;
 +			goto skip_native_aio;
 +		}
 +	}
 +
 +	/* Initialize the event array. One event per slot. */
 +	io_event = static_cast<struct io_event*>(
 +		ut_malloc(n * sizeof(*io_event)));
 +
 +	memset(io_event, 0x0, sizeof(*io_event) * n);
 +	array->aio_events = io_event;
 +
 +skip_native_aio:
 +#endif /* LINUX_NATIVE_AIO */
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i);
 +
 +		slot->pos = i;
 +		slot->reserved = FALSE;
 +#ifdef WIN_ASYNC_IO
 +		slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
 +
 +		over = &slot->control;
 +
 +		over->hEvent = slot->handle;
 +
 +		array->handles[i] = over->hEvent;
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +		memset(&slot->control, 0x0, sizeof(slot->control));
 +		slot->n_bytes = 0;
 +		slot->ret = 0;
 +#endif /* WIN_ASYNC_IO */
 +	}
 +
 +	return(array);
 +}
 +
 +/************************************************************************//**
 +Releases an aio wait array together with its mutex, events and slots,
 +and resets the caller's pointer to null. */
 +static
 +void
 +os_aio_array_free(
 +/*==============*/
 +	os_aio_array_t*& array)	/*!< in, own: array to free */
 +{
 +#ifdef WIN_ASYNC_IO
 +	/* Each slot owns one event handle. */
 +	for (ulint slot_no = 0; slot_no < array->n_slots; slot_no++) {
 +		CloseHandle(os_aio_array_get_nth_slot(array, slot_no)->handle);
 +	}
 +#endif /* WIN_ASYNC_IO */
 +
 +#ifdef __WIN__
 +	ut_free(array->handles);
 +#endif /* __WIN__ */
 +
 +	os_mutex_free(array->mutex);
 +	os_event_free(array->not_full);
 +	os_event_free(array->is_empty);
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	/* The native AIO arrays are only released while native AIO is
 +	still enabled. */
 +	if (srv_use_native_aio) {
 +		ut_free(array->aio_events);
 +		ut_free(array->aio_ctx);
 +	}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +	ut_free(array->slots);
 +	ut_free(array);
 +
 +	/* array is a reference: clear the caller's pointer. */
 +	array = NULL;
 +}
 +
 +/***********************************************************************
 +Initializes the asynchronous io system. Creates one array each for ibuf
 +and log i/o (unless in read-only mode). Also creates one array each for
 +read and write where each array is divided logically into n_read_segs
 +and n_write_segs respectively; the write array is likewise skipped in
 +read-only mode. The caller must create an i/o handler thread for each
 +segment in these arrays. This function also creates the sync array.
 +No i/o handler thread needs to be created for that.
 +@return TRUE on success, FALSE if creating any of the arrays failed */
 +UNIV_INTERN
 +ibool
 +os_aio_init(
 +/*========*/
 +	ulint	n_per_seg,	/*!< in: maximum number of pending aio
 +				operations allowed per segment */
 +	ulint	n_read_segs,	/*!< in: number of reader threads */
 +	ulint	n_write_segs,	/*!< in: number of writer threads */
 +	ulint	n_slots_sync)	/*!< in: number of slots in the sync aio
 +				array */
 +{
 +	os_io_init_simple();
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	/* Check if native aio is supported on this system and tmpfs */
 +	if (srv_use_native_aio && !os_aio_native_aio_supported()) {
 +
 +		ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
 +
 +		srv_use_native_aio = FALSE;
 +	}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +	srv_reset_io_thread_op_info();
 +
 +	os_aio_read_array = os_aio_array_create(
 +		n_read_segs * n_per_seg, n_read_segs);
 +
 +	if (os_aio_read_array == NULL) {
 +		return(FALSE);
 +	}
 +
 +	/* In read-only mode there are no ibuf and log segments, so the
 +	read segments start at index 0 instead of 2. */
 +	ulint	start = (srv_read_only_mode) ? 0 : 2;
 +	ulint	n_segs = n_read_segs + start;
 +
 +	/* 0 is the insert buffer (ibuf) segment and 1 is the log segment. */
 +	for (ulint i = start; i < n_segs; ++i) {
 +		ut_a(i < SRV_MAX_N_IO_THREADS);
 +		srv_io_thread_function[i] = "read thread";
 +	}
 +
 +	/* Running total of segments over all arrays created here. */
 +	ulint	n_segments = n_read_segs;
 +
 +	if (!srv_read_only_mode) {
 +
 +		os_aio_log_array = os_aio_array_create(n_per_seg, 1);
 +
 +		if (os_aio_log_array == NULL) {
 +			return(FALSE);
 +		}
 +
 +		++n_segments;
 +
 +		srv_io_thread_function[1] = "log thread";
 +
 +		os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
 +
 +		if (os_aio_ibuf_array == NULL) {
 +			return(FALSE);
 +		}
 +
 +		++n_segments;
 +
 +		srv_io_thread_function[0] = "insert buffer thread";
 +
 +		os_aio_write_array = os_aio_array_create(
 +			n_write_segs * n_per_seg, n_write_segs);
 +
 +		if (os_aio_write_array == NULL) {
 +			return(FALSE);
 +		}
 +
 +		n_segments += n_write_segs;
 +
 +		/* The write segments follow the read segments. */
 +		for (ulint i = start + n_read_segs; i < n_segments; ++i) {
 +			ut_a(i < SRV_MAX_N_IO_THREADS);
 +			srv_io_thread_function[i] = "write thread";
 +		}
 +
 +		ut_ad(n_segments >= 4);
 +	} else {
 +		ut_ad(n_segments > 0);
 +	}
 +
 +	/* The sync array is not counted in n_segments. */
 +	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
 +
 +	if (os_aio_sync_array == NULL) {
 +		return(FALSE);
 +	}
 +
 +	os_aio_n_segments = n_segments;
 +
 +	os_aio_validate();
 +
 +	os_last_printout = ut_time();
 +
 +	/* The per-segment wait events below are only created when native
 +	AIO is not in use. */
 +	if (srv_use_native_aio) {
 +		return(TRUE);
 +	}
 +
 +	os_aio_segment_wait_events = static_cast<os_event_t*>(
 +		ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
 +
 +	for (ulint i = 0; i < n_segments; ++i) {
 +		os_aio_segment_wait_events[i] = os_event_create();
 +	}
 +
 +	return(TRUE);
 +}
 +
 +/***********************************************************************
 +Frees the asynchronous io system: releases each aio array that exists,
 +frees the per-segment wait events when simulated aio is in use, frees
 +the wait event array itself and resets the segment count to 0.
 +Arrays whose pointer is NULL are skipped. The read array is guarded
 +like the others, because os_aio_array_free() dereferences its argument
 +and os_aio_init() can return FALSE before the read array exists. */
 +UNIV_INTERN
 +void
 +os_aio_free(void)
 +/*=============*/
 +{
 +	if (os_aio_ibuf_array != 0) {
 +		os_aio_array_free(os_aio_ibuf_array);
 +	}
 +
 +	if (os_aio_log_array != 0) {
 +		os_aio_array_free(os_aio_log_array);
 +	}
 +
 +	if (os_aio_write_array != 0) {
 +		os_aio_array_free(os_aio_write_array);
 +	}
 +
 +	if (os_aio_sync_array != 0) {
 +		os_aio_array_free(os_aio_sync_array);
 +	}
 +
 +	if (os_aio_read_array != 0) {
 +		os_aio_array_free(os_aio_read_array);
 +	}
 +
 +	/* The wait events exist only for simulated aio; os_aio_init()
 +	returns before creating them when native aio is used. */
 +	if (!srv_use_native_aio) {
 +		for (ulint i = 0; i < os_aio_n_segments; i++) {
 +			os_event_free(os_aio_segment_wait_events[i]);
 +		}
 +	}
 +
 +	ut_free(os_aio_segment_wait_events);
 +	os_aio_segment_wait_events = 0;
 +	os_aio_n_segments = 0;
 +}
 +
 +#ifdef WIN_ASYNC_IO
 +/************************************************************************//**
 +Signals the event handle of every slot in the given aio array. Used with
 +Windows async i/o at shutdown to wake up the threads waiting on those
 +handles. */
 +static
 +void
 +os_aio_array_wake_win_aio_at_shutdown(
 +/*==================================*/
 +	os_aio_array_t*	array)	/*!< in: aio array */
 +{
 +	for (ulint pos = 0; pos < array->n_slots; ++pos) {
 +		SetEvent(array->slots[pos].handle);
 +	}
 +}
 +#endif
 +
 +/************************************************************************//**
 +Wakes up all async i/o handler threads at shutdown so that they notice
 +the shutdown and exit by themselves. */
 +UNIV_INTERN
 +void
 +os_aio_wake_all_threads_at_shutdown(void)
 +/*=====================================*/
 +{
 +#ifdef WIN_ASYNC_IO
 +	/* Windows native aio: signal the slot events of every array that
 +	exists. The read array is signaled unconditionally. */
 +	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
 +	if (os_aio_write_array != NULL) {
 +		os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
 +	}
 +
 +	if (os_aio_ibuf_array != NULL) {
 +		os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
 +	}
 +
 +	if (os_aio_log_array != NULL) {
 +		os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
 +	}
 +#elif defined(LINUX_NATIVE_AIO)
 +	/* With the Linux native AIO interface the i/o helper threads wait
 +	in io_getevents() with a timeout and check the server status at
 +	each wake up, so nothing has to be done here to wake them. */
 +#endif /* WIN_ASYNC_IO */
 +
 +	if (!srv_use_native_aio) {
 +		/* Simulated aio: signal the wait event of every segment. */
 +		for (ulint seg = 0; seg < os_aio_n_segments; seg++) {
 +			os_event_set(os_aio_segment_wait_events[seg]);
 +		}
 +	}
 +}
 +
 +/************************************************************************//**
 +Waits until there are no pending writes in os_aio_write_array, by
 +blocking on the is_empty event of that array. There can be other,
 +synchronous, pending writes. Must not be called in read-only mode
 +(checked by a debug assertion). */
 +UNIV_INTERN
 +void
 +os_aio_wait_until_no_pending_writes(void)
 +/*=====================================*/
 +{
 +	ut_ad(!srv_read_only_mode);
 +	os_event_wait(os_aio_write_array->is_empty);
 +}
 +
 +/**********************************************************************//**
 +Maps a slot of one of the aio arrays to its global segment number. The
 +ibuf and log arrays each map to a fixed segment; a slot of the read or
 +write array maps to the first segment of that array plus the index of
 +the local segment the slot position falls into.
 +@return segment number (which is the number used by, for example,
 +i/o-handler threads) */
 +static
 +ulint
 +os_aio_get_segment_no_from_slot(
 +/*============================*/
 +	os_aio_array_t*	array,	/*!< in: aio wait array */
 +	os_aio_slot_t*	slot)	/*!< in: slot in this array */
 +{
 +	if (array == os_aio_ibuf_array) {
 +		ut_ad(!srv_read_only_mode);
 +
 +		return(IO_IBUF_SEGMENT);
 +	}
 +
 +	if (array == os_aio_log_array) {
 +		ut_ad(!srv_read_only_mode);
 +
 +		return(IO_LOG_SEGMENT);
 +	}
 +
 +	if (array == os_aio_read_array) {
 +		/* Read segments start at 0 in read-only mode, otherwise
 +		after the ibuf and log segments. */
 +		const ulint	slots_per_seg = os_aio_read_array->n_slots
 +			/ os_aio_read_array->n_segments;
 +		const ulint	first_read_seg = srv_read_only_mode ? 0 : 2;
 +
 +		return(first_read_seg + slot->pos / slots_per_seg);
 +	}
 +
 +	/* Only the write array is left; its segments follow the read
 +	segments. */
 +	ut_ad(!srv_read_only_mode);
 +	ut_a(array == os_aio_write_array);
 +
 +	const ulint	slots_per_seg = os_aio_write_array->n_slots
 +		/ os_aio_write_array->n_segments;
 +
 +	return(os_aio_read_array->n_segments + 2
 +	       + slot->pos / slots_per_seg);
 +}
 +
 +/**********************************************************************//**
 +Translates a global segment number into the aio array that owns the
 +segment and the segment index local to that array. In read-only mode
 +every segment belongs to the read array.
 +@return	local segment number within the aio array */
 +static
 +ulint
 +os_aio_get_array_and_local_segment(
 +/*===============================*/
 +	os_aio_array_t** array,		/*!< out: aio wait array */
 +	ulint		 global_segment)/*!< in: global segment number */
 +{
 +	ut_a(global_segment < os_aio_n_segments);
 +
 +	if (srv_read_only_mode) {
 +		*array = os_aio_read_array;
 +
 +		return(global_segment);
 +	}
 +
 +	if (global_segment == IO_IBUF_SEGMENT) {
 +		*array = os_aio_ibuf_array;
 +
 +		return(0);
 +	}
 +
 +	if (global_segment == IO_LOG_SEGMENT) {
 +		*array = os_aio_log_array;
 +
 +		return(0);
 +	}
 +
 +	/* The read segments come after the ibuf and log segments, and the
 +	write segments come after the read segments. */
 +	const ulint	first_write_seg = os_aio_read_array->n_segments + 2;
 +
 +	if (global_segment < first_write_seg) {
 +		*array = os_aio_read_array;
 +
 +		return(global_segment - 2);
 +	}
 +
 +	*array = os_aio_write_array;
 +
 +	return(global_segment - first_write_seg);
 +}
 +
 +/*******************************************************************//**
 +Requests for a slot in the aio array. If no slot is available, waits until
 +not_full-event becomes signaled. The search for a free slot starts in the
 +local segment derived from the file offset and wraps around the whole
 +array. The slot is filled in with the request parameters while the array
 +mutex is held; when native aio is in use the platform control block of
 +the slot is prepared as well.
 +@return	pointer to slot */
 +static
 +os_aio_slot_t*
 +os_aio_array_reserve_slot(
 +/*======================*/
 +	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
 +	os_aio_array_t*	array,	/*!< in: aio array */
 +	fil_node_t*	message1,/*!< in: message to be passed along with
 +				the aio operation */
 +	void*		message2,/*!< in: message to be passed along with
 +				the aio operation */
 +	pfs_os_file_t	file,	/*!< in: file handle */
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	void*		buf,	/*!< in: buffer where to read or from which
 +				to write */
 +	os_offset_t	offset,	/*!< in: file offset */
 +	ulint		len)	/*!< in: length of the block to read or write */
 +{
 +	os_aio_slot_t*	slot = NULL;
 +#ifdef WIN_ASYNC_IO
 +	OVERLAPPED*	control;
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +
 +	struct iocb*	iocb;
 +	off_t		aio_offset;
 +
 +#endif /* WIN_ASYNC_IO */
 +	ulint		i;
 +	ulint		counter;
 +	ulint		slots_per_seg;
 +	ulint		local_seg;
 +
 +#ifdef WIN_ASYNC_IO
 +	ut_a((len & 0xFFFFFFFFUL) == len);
 +#endif /* WIN_ASYNC_IO */
 +
 +	/* No need of a mutex. Only reading constant fields */
 +	slots_per_seg = array->n_slots / array->n_segments;
 +
 +	/* We attempt to keep adjacent blocks in the same local
 +	segment. This can help in merging IO requests when we are
 +	doing simulated AIO */
 +	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
 +		% array->n_segments;
 +
 +loop:
 +	os_mutex_enter(array->mutex);
 +
 +	if (array->n_reserved == array->n_slots) {
 +		os_mutex_exit(array->mutex);
 +
 +		if (!srv_use_native_aio) {
 +			/* If the handler threads are suspended, wake them
 +			so that we get more slots */
 +
 +			os_aio_simulated_wake_handler_threads();
 +		}
 +
 +		os_event_wait(array->not_full);
 +
 +		goto loop;
 +	}
 +
 +	/* We start our search for an available slot from our preferred
 +	local segment and do a full scan of the array. We are
 +	guaranteed to find a slot in full scan. */
 +	for (i = local_seg * slots_per_seg, counter = 0;
 +	     counter < array->n_slots;
 +	     i++, counter++) {
 +
 +		i %= array->n_slots;
 +
 +		slot = os_aio_array_get_nth_slot(array, i);
 +
 +		if (slot->reserved == FALSE) {
 +			goto found;
 +		}
 +	}
 +
 +	/* We MUST always be able to get hold of a free (unreserved) slot,
 +	because n_reserved < n_slots was checked above under the mutex. */
 +	ut_error;
 +
 +found:
 +	ut_a(slot->reserved == FALSE);
 +	array->n_reserved++;
 +
 +	if (array->n_reserved == 1) {
 +		os_event_reset(array->is_empty);
 +	}
 +
 +	if (array->n_reserved == array->n_slots) {
 +		os_event_reset(array->not_full);
 +	}
 +
 +	slot->reserved = TRUE;
 +	slot->reservation_time = ut_time();
 +	slot->message1 = message1;
 +	slot->message2 = message2;
 +	slot->file     = file;
 +	slot->name     = name;
 +	slot->len      = len;
 +	slot->type     = type;
 +	slot->buf      = static_cast<byte*>(buf);
 +	slot->offset   = offset;
 +	slot->io_already_done = FALSE;
 +
 +#ifdef WIN_ASYNC_IO
 +	control = &slot->control;
 +	control->Offset = (DWORD) offset & 0xFFFFFFFF;
 +	control->OffsetHigh = (DWORD) (offset >> 32);
 +	ResetEvent(slot->handle);
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +
 +	/* If we are not using native AIO skip this part. */
 +	if (!srv_use_native_aio) {
 +		goto skip_native_aio;
 +	}
 +
 +	/* Check if we are dealing with 64 bit arch.
 +	If not then make sure that offset fits in 32 bits. */
 +	aio_offset = (off_t) offset;
 +
 +	ut_a(sizeof(aio_offset) >= sizeof(offset)
 +	     || ((os_offset_t) aio_offset) == offset);
 +
 +	iocb = &slot->control;
 +
 +	if (type == OS_FILE_READ) {
 +		io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
 +	} else {
 +		ut_a(type == OS_FILE_WRITE);
 +		io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
 +	}
 +
 +	iocb->data = (void*) slot;
 +	slot->n_bytes = 0;
 +	slot->ret = 0;
 +
 +skip_native_aio:
 +#endif /* LINUX_NATIVE_AIO */
 +	os_mutex_exit(array->mutex);
 +
 +	return(slot);
 +}
 +
 +/*******************************************************************//**
 +Frees a slot in the aio array. Under the array mutex, marks the slot as
 +not reserved, decrements the reserved count, signals the not_full event
 +when the array stops being full and the is_empty event when no slot is
 +reserved any more, and resets the platform specific state of the slot. */
 +static
 +void
 +os_aio_array_free_slot(
 +/*===================*/
 +	os_aio_array_t*	array,	/*!< in: aio array */
 +	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
 +{
 +	os_mutex_enter(array->mutex);
 +
 +	ut_ad(slot->reserved);
 +
 +	slot->reserved = FALSE;
 +
 +	array->n_reserved--;
 +
 +	if (array->n_reserved == array->n_slots - 1) {
 +		os_event_set(array->not_full);
 +	}
 +
 +	if (array->n_reserved == 0) {
 +		os_event_set(array->is_empty);
 +	}
 +
 +#ifdef WIN_ASYNC_IO
 +
 +	ResetEvent(slot->handle);
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +
 +	if (srv_use_native_aio) {
 +		memset(&slot->control, 0x0, sizeof(slot->control));
 +		slot->n_bytes = 0;
 +		slot->ret = 0;
 +		/*fprintf(stderr, "Freed up Linux native slot.\n");*/
 +	} else {
 +		/* These fields should not be used if we are not
 +		using native AIO. */
 +		ut_ad(slot->n_bytes == 0);
 +		ut_ad(slot->ret == 0);
 +	}
 +
 +#endif
 +	os_mutex_exit(array->mutex);
 +}
 +
 +/**********************************************************************//**
 +Signals the wait event of a simulated aio i/o-handler thread, but only if
 +at least one slot in its segment is currently reserved. */
 +static
 +void
 +os_aio_simulated_wake_handler_thread(
 +/*=================================*/
 +	ulint	global_segment)	/*!< in: the number of the segment in the aio
 +				arrays */
 +{
 +	ut_ad(!srv_use_native_aio);
 +
 +	os_aio_array_t*	array;
 +	const ulint	local_seg = os_aio_get_array_and_local_segment(
 +		&array, global_segment);
 +
 +	const ulint	seg_size = array->n_slots / array->n_segments;
 +	const ulint	first_slot = local_seg * seg_size;
 +	ibool		has_request = FALSE;
 +
 +	/* Scan the seg_size slots that belong to this segment while
 +	holding the array mutex. */
 +
 +	os_mutex_enter(array->mutex);
 +
 +	for (ulint i = 0; i < seg_size; ++i) {
 +		const os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, first_slot + i);
 +
 +		if (slot->reserved) {
 +			has_request = TRUE;
 +			break;
 +		}
 +	}
 +
 +	os_mutex_exit(array->mutex);
 +
 +	/* The event is set only after the mutex has been released. */
 +	if (has_request) {
 +		os_event_set(os_aio_segment_wait_events[global_segment]);
 +	}
 +}
 +
 +/**********************************************************************//**
 +Clears the sleep recommendation for read threads and wakes up every
 +simulated aio i/o-handler thread that has something to do. Does nothing
 +when native aio is in use. */
 +UNIV_INTERN
 +void
 +os_aio_simulated_wake_handler_threads(void)
 +/*=======================================*/
 +{
 +	if (!srv_use_native_aio) {
 +
 +		os_aio_recommend_sleep_for_read_threads	= FALSE;
 +
 +		for (ulint seg = 0; seg < os_aio_n_segments; seg++) {
 +			os_aio_simulated_wake_handler_thread(seg);
 +		}
 +	}
 +}
 +
 +#ifdef _WIN32
 +/**********************************************************************//**
 +Call this before posting a batch of reads when an i/o-handler thread
 +should handle them all at once later. It sets the sleep recommendation
 +for read threads and resets the wait event of every read segment. The
 +caller must call os_aio_simulated_wake_handler_threads afterwards,
 +otherwise the threads are left sleeping! */
 +UNIV_INTERN
 +void
 +os_aio_simulated_put_read_threads_to_sleep()
 +{
 +	/* Putting background IO threads to sleep is meant only for
 +	Windows with simulated AIO. Windows XP seems to schedule
 +	background threads too eagerly to allow for coalescing during
 +	readahead requests. */
 +
 +	if (srv_use_native_aio) {
 +		/* Simulated aio is not in use: nothing to do */
 +
 +		return;
 +	}
 +
 +	os_aio_recommend_sleep_for_read_threads	= TRUE;
 +
 +	for (ulint seg = 0; seg < os_aio_n_segments; seg++) {
 +		os_aio_array_t*	owner;
 +
 +		os_aio_get_array_and_local_segment(&owner, seg);
 +
 +		if (owner != os_aio_read_array) {
 +			continue;
 +		}
 +
 +		os_event_reset(os_aio_segment_wait_events[seg]);
 +	}
 +}
 +#endif /* _WIN32 */
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/*******************************************************************//**
 +Submits the prepared iocb of a reserved slot to the kernel with
 +io_submit(), using the io context of the segment the slot belongs to.
 +On failure errno is set from the negated io_submit() return value.
 +@return	TRUE on success. */
 +static
 +ibool
 +os_aio_linux_dispatch(
 +/*==================*/
 +	os_aio_array_t*	array,	/*!< in: io request array. */
 +	os_aio_slot_t*	slot)	/*!< in: an already reserved slot. */
 +{
 +	ut_ad(slot != NULL);
 +	ut_ad(array);
 +
 +	ut_a(slot->reserved);
 +
 +	/* The iocb lives inside the slot itself; there is one io_context
 +	per segment, selected from the slot position. */
 +
 +	struct iocb*	iocb = &slot->control;
 +	const ulint	io_ctx_index
 +		= (slot->pos * array->n_segments) / array->n_slots;
 +
 +	const int	ret = io_submit(
 +		array->aio_ctx[io_ctx_index], 1, &iocb);
 +
 +#if defined(UNIV_AIO_DEBUG)
 +	fprintf(stderr,
 +		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
 +		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
 +		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
 +#endif
 +
 +	/* io_submit returns the number of successfully queued requests
 +	or -errno; exactly one request was submitted. */
 +	if (UNIV_UNLIKELY(ret != 1)) {
 +		errno = -ret;
 +
 +		return(FALSE);
 +	}
 +
 +	return(TRUE);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_aio(), not directly this function!
 +Requests an asynchronous i/o operation. For mode OS_AIO_SYNC (unless
 +Windows native aio is in use) the read or write is performed
 +synchronously in the calling thread. Otherwise a slot is reserved in the
 +aio array selected by mode and type, and the request is either submitted
 +through the native aio interface or left for a simulated aio handler
 +thread, which is woken up unless OS_AIO_SIMULATED_WAKE_LATER was given.
 +If submitting a native request fails, the slot is freed and the request
 +is retried when os_file_handle_error() says so.
 +@return	TRUE if request was queued successfully, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_aio_func(
 +/*========*/
 +	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
 +	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
 +				to OS_AIO_SIMULATED_WAKE_LATER: the
 +				last flag advises this function not to wake
 +				i/o-handler threads, but the caller will
 +				do the waking explicitly later, in this
 +				way the caller can post several requests in
 +				a batch; NOTE that the batch must not be
 +				so big that it exhausts the slots in aio
 +				arrays! NOTE that a simulated batch
 +				may introduce hidden chances of deadlocks,
 +				because i/os are not actually handled until
 +				all have been posted: use with great
 +				caution! */
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	pfs_os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read or from which
 +				to write */
 +	os_offset_t	offset,	/*!< in: file offset where to read or write */
 +	ulint		n,	/*!< in: number of bytes to read or write */
 +	fil_node_t*	message1,/*!< in: message for the aio handler
 +				(can be used to identify a completed
 +				aio operation); ignored if mode is
 +				OS_AIO_SYNC */
 +	void*		message2)/*!< in: message for the aio handler
 +				(can be used to identify a completed
 +				aio operation); ignored if mode is
 +				OS_AIO_SYNC */
 +{
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +#ifdef WIN_ASYNC_IO
 +	ibool		retval;
 +	BOOL		ret		= TRUE;
 +	DWORD		len		= (DWORD) n;
 +	struct fil_node_t* dummy_mess1;
 +	void*		dummy_mess2;
 +	ulint		dummy_type;
 +#endif /* WIN_ASYNC_IO */
 +	ulint		wake_later;
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
 +	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
 +	ut_ad(os_aio_validate_skip());
 +#ifdef WIN_ASYNC_IO
 +	ut_ad((n & 0xFFFFFFFFUL) == n);
 +#endif
 +
 +	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
 +	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
 +
 +	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +			mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
 +
 +	if (mode == OS_AIO_SYNC
 +#ifdef WIN_ASYNC_IO
 +	    && !srv_use_native_aio
 +#endif /* WIN_ASYNC_IO */
 +	    ) {
 +		ibool ret;
 +
 +		/* This is actually an ordinary synchronous read or write:
 +		no need to use an i/o-handler thread. NOTE that if we use
 +		Windows async i/o, Windows does not allow us to use
 +		ordinary synchronous os_file_read etc. on the same file,
 +		therefore we have built a special mechanism for synchronous
 +		wait in the Windows case.
 +		Also note that the Performance Schema instrumentation has
 +		been performed by current os_aio_func()'s wrapper function
 +		pfs_os_aio_func(). So we would no longer need to call
 +		Performance Schema instrumented os_file_read() and
 +		os_file_write(). Instead, we should use os_file_read_func()
 +		and os_file_write_func() */
 +
 +		if (type == OS_FILE_READ) {
 +			ret = os_file_read_func(file.m_file, buf, offset, n);
 +		} else {
 +
 +			ut_ad(!srv_read_only_mode);
 +			ut_a(type == OS_FILE_WRITE);
 +
 +			ret = os_file_write_func(name, file.m_file, buf, offset, n);
 +
 +			DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +				os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
 +
 +			if (!ret) {
 +				os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE);
 +			}
 +		}
 +
 +		return ret;
 +	}
 +
 +try_again:
 +	switch (mode) {
 +	case OS_AIO_NORMAL:
 +		if (type == OS_FILE_READ) {
 +			array = os_aio_read_array;
 +		} else {
 +			ut_ad(!srv_read_only_mode);
 +			array = os_aio_write_array;
 +		}
 +		break;
 +	case OS_AIO_IBUF:
 +		ut_ad(type == OS_FILE_READ);
 +		/* Reduce probability of deadlock bugs in connection with ibuf:
 +		do not let the ibuf i/o handler sleep */
 +
 +		wake_later = FALSE;
 +
 +		if (srv_read_only_mode) {
 +			array = os_aio_read_array;
 +		} else {
 +			array = os_aio_ibuf_array;
 +		}
 +		break;
 +	case OS_AIO_LOG:
 +		if (srv_read_only_mode) {
 +			array = os_aio_read_array;
 +		} else {
 +			array = os_aio_log_array;
 +		}
 +		break;
 +	case OS_AIO_SYNC:
 +		array = os_aio_sync_array;
 +#if defined(LINUX_NATIVE_AIO)
 +		/* In Linux native AIO we don't use sync IO array. */
 +		ut_a(!srv_use_native_aio);
 +#endif /* LINUX_NATIVE_AIO */
 +		break;
 +	default:
 +		ut_error;
 +		array = NULL; /* Eliminate compiler warning */
 +	}
 +
 +	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
 +					 name, buf, offset, n);
 +	if (type == OS_FILE_READ) {
 +		if (srv_use_native_aio) {
 +			os_n_file_reads++;
 +			os_bytes_read_since_printout += n;
 +#ifdef WIN_ASYNC_IO
 +			ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
 +				       &(slot->control));
 +#elif defined(LINUX_NATIVE_AIO)
 +			if (!os_aio_linux_dispatch(array, slot)) {
 +				goto err_exit;
 +			}
 +#endif /* WIN_ASYNC_IO */
 +		} else {
 +			if (!wake_later) {
 +				os_aio_simulated_wake_handler_thread(
 +					os_aio_get_segment_no_from_slot(
 +						array, slot));
 +			}
 +		}
 +	} else if (type == OS_FILE_WRITE) {
 +		ut_ad(!srv_read_only_mode);
 +		if (srv_use_native_aio) {
 +			os_n_file_writes++;
 +#ifdef WIN_ASYNC_IO
 +			ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
 +					&(slot->control));
 +#elif defined(LINUX_NATIVE_AIO)
 +			if (!os_aio_linux_dispatch(array, slot)) {
 +				goto err_exit;
 +			}
 +#endif /* WIN_ASYNC_IO */
 +		} else {
 +			if (!wake_later) {
 +				os_aio_simulated_wake_handler_thread(
 +					os_aio_get_segment_no_from_slot(
 +						array, slot));
 +			}
 +		}
 +	} else {
 +		ut_error;
 +	}
 +
 +#ifdef WIN_ASYNC_IO
 +	if (srv_use_native_aio) {
 +		if ((ret && len == n)
 +		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
 +			/* aio was queued successfully! */
 +
 +			if (mode == OS_AIO_SYNC) {
 +				/* We want a synchronous i/o operation on a
 +				file where we also use async i/o: in Windows
 +				we must use the same wait mechanism as for
 +				async i/o */
 +
 +				retval = os_aio_windows_handle(
 +					ULINT_UNDEFINED, slot->pos,
 +					&dummy_mess1, &dummy_mess2,
 +					&dummy_type);
 +
 +				return(retval);
 +			}
 +
 +			return(TRUE);
 +		}
 +
 +		goto err_exit;
 +	}
 +#endif /* WIN_ASYNC_IO */
 +	/* aio was queued successfully! */
 +	return(TRUE);
 +
 +#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
 +err_exit:
 +#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
 +	os_aio_array_free_slot(array, slot);
 +
 +	if (os_file_handle_error(
 +		name,type == OS_FILE_READ ? "aio read" : "aio write")) {
 +
 +		goto try_again;
 +	}
 +
 +	return(FALSE);
 +}
 +
 +#ifdef WIN_ASYNC_IO
 +/**********************************************************************//**
 +This function is only used in Windows asynchronous i/o.
 +Waits for an aio operation to complete. This function is used to wait for
 +the completed requests. The aio array of pending requests is divided
 +into segments. The thread specifies which segment or slot it wants to wait
 +for. If the operation failed and os_file_handle_error() asks for a retry,
 +the read or write is reissued and waited for in this function.
 +NOTE: this function will also take care of freeing the aio slot,
 +therefore no other thread is allowed to do the freeing!
 +@return	TRUE if the aio operation succeeded; also TRUE, with both
 +messages set to NULL, when the threads are being shut down and no slot
 +of the array is reserved */
 +UNIV_INTERN
 +ibool
 +os_aio_windows_handle(
 +/*==================*/
 +	ulint	segment,	/*!< in: the number of the segment in the aio
 +				arrays to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 the log i/o thread,
 +				then follow the non-ibuf read threads, and as
 +				the last are the non-ibuf write threads; if
 +				this is ULINT_UNDEFINED, then it means that
 +				sync aio is used, and this parameter is
 +				ignored */
 +	ulint	pos,		/*!< this parameter is used only in sync aio:
 +				wait for the aio slot at this position */
 +	fil_node_t**message1,	/*!< out: the messages passed with the aio
 +				request; note that also in the case where
 +				the aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation, for example */
 +	void**	message2,	/*!< out: second message passed with the
 +				aio request, see message1 */
 +	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
 +{
 +	ulint		orig_seg	= segment;
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +	ulint		n;
 +	ulint		i;
 +	ibool		ret_val;
 +	BOOL		ret;
 +	DWORD		len;
 +	BOOL		retry		= FALSE;
 +
 +	if (segment == ULINT_UNDEFINED) {
 +		segment = 0;
 +		array = os_aio_sync_array;
 +	} else {
 +		segment = os_aio_get_array_and_local_segment(&array, segment);
 +	}
 +
 +	/* NOTE! We only access constant fields in os_aio_array. Therefore
 +	we do not have to acquire the protecting mutex yet */
 +
 +	ut_ad(os_aio_validate_skip());
 +	ut_ad(segment < array->n_segments);
 +
 +	n = array->n_slots / array->n_segments;
 +
 +	if (array == os_aio_sync_array) {
 +
 +		WaitForSingleObject(
 +			os_aio_array_get_nth_slot(array, pos)->handle,
 +			INFINITE);
 +
 +		i = pos;
 +
 +	} else {
 +		if (orig_seg != ULINT_UNDEFINED) {
 +			srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
 +		}
 +
 +		i = WaitForMultipleObjects(
 +			(DWORD) n, array->handles + segment * n,
 +			FALSE, INFINITE);
 +	}
 +
 +	os_mutex_enter(array->mutex);
 +
 +	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
 +	    && array->n_reserved == 0) {
 +		*message1 = NULL;
 +		*message2 = NULL;
 +		os_mutex_exit(array->mutex);
 +		return(TRUE);
 +	}
 +
 +	/* A successful wait on n handles yields an index in the range
 +	WAIT_OBJECT_0 .. WAIT_OBJECT_0 + n - 1, and a sync aio position
 +	is below n as well. The upper bound is therefore exclusive:
 +	index n would address a slot outside this segment. */
 +	ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n);
 +
 +	slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +	ut_a(slot->reserved);
 +
 +	if (orig_seg != ULINT_UNDEFINED) {
 +		srv_set_io_thread_op_info(
 +			orig_seg, "get windows aio return value");
 +	}
 +	ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE);
 +
 +	*message1 = slot->message1;
 +	*message2 = slot->message2;
 +
 +	*type = slot->type;
 +
 +	if (ret && len == slot->len) {
 +
 +		ret_val = TRUE;
 +	} else if (os_file_handle_error(slot->name, "Windows aio")) {
 +
 +		retry = TRUE;
 +	} else {
 +
 +		ret_val = FALSE;
 +	}
 +
 +	os_mutex_exit(array->mutex);
 +
 +	if (retry) {
 +		/* retry failed read/write operation synchronously.
 +		No need to hold array->mutex. */
 +
 +#ifdef UNIV_PFS_IO
 +		/* This read/write does not go through os_file_read
 +		and os_file_write APIs, need to register with
 +		performance schema explicitly here. */
 +		struct PSI_file_locker* locker = NULL;
 +		PSI_file_locker_state	state;
 +		register_pfs_file_io_begin(&state, locker, slot->file, slot->len,
 +					   (slot->type == OS_FILE_WRITE)
 +						? PSI_FILE_WRITE
 +						: PSI_FILE_READ,
 +					    __FILE__, __LINE__);
 +#endif
 +
 +		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
 +
 +		switch (slot->type) {
 +		case OS_FILE_WRITE:
 +			ret = WriteFile(slot->file.m_file, slot->buf,
 +					(DWORD) slot->len, &len,
 +					&(slot->control));
 +			break;
 +		case OS_FILE_READ:
 +			ret = ReadFile(slot->file.m_file, slot->buf,
 +				       (DWORD) slot->len, &len,
 +				       &(slot->control));
 +			break;
 +		default:
 +			ut_error;
 +		}
 +
 +#ifdef UNIV_PFS_IO
 +		register_pfs_file_io_end(locker, len);
 +#endif
 +
 +		if (!ret && GetLastError() == ERROR_IO_PENDING) {
 +			/* aio was queued successfully!
 +			We want a synchronous i/o operation on a
 +			file where we also use async i/o: in Windows
 +			we must use the same wait mechanism as for
 +			async i/o */
 +			ret = GetOverlappedResult(slot->file.m_file,
 +						  &(slot->control),
 +						  &len, TRUE);
 +		}
 +
 +		ret_val = ret && len == slot->len;
 +	}
 +
 +	os_aio_array_free_slot(array, slot);
 +
 +	return(ret_val);
 +}
 +#endif
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/******************************************************************//**
 +This function is only used in Linux native asynchronous i/o. This is
 +called from within the io-thread. If there are no completed IO requests
 +in the slot array, the thread calls this function to collect more
 +requests from the kernel.
 +The io-thread waits on io_getevents(), which is a blocking call, with
 +a timeout value. Unless the system is very heavily loaded, keeping the
 +io-thread very busy, the io-thread will spend most of its time waiting
 +in this function.
 +For every collected event the result is stored in the slot and the slot
 +is marked as done; the function then returns. If nothing was collected,
 +the server shutdown state is checked, which is why a timed wait is used
 +in io_getevents(): at shutdown the function returns, otherwise it
 +retries on a timeout, -EAGAIN or -EINTR and asserts on any other
 +error. */
 +static
 +void
 +os_aio_linux_collect(
 +/*=================*/
 +	os_aio_array_t* array,		/*!< in/out: slot array. */
 +	ulint		segment,	/*!< in: local segment no. */
 +	ulint		seg_size)	/*!< in: segment size. */
 +{
 +	int			i;
 +	int			ret;
 +	ulint			start_pos;
 +	ulint			end_pos;
 +	struct timespec		timeout;
 +	struct io_event*	events;
 +	struct io_context*	io_ctx;
 +
 +	/* sanity checks. */
 +	ut_ad(array != NULL);
 +	ut_ad(seg_size > 0);
 +	ut_ad(segment < array->n_segments);
 +
 +	/* Which part of event array we are going to work on. */
 +	events = &array->aio_events[segment * seg_size];
 +
 +	/* Which io_context we are going to use. */
 +	io_ctx = array->aio_ctx[segment];
 +
 +	/* Starting point of the segment we will be working on. */
 +	start_pos = segment * seg_size;
 +
 +	/* End point. */
 +	end_pos = start_pos + seg_size;
 +
 +retry:
 +
 +	/* Initialize the events. The timeout value is arbitrary.
 +	We probably need to experiment with it a little. */
 +	memset(events, 0, sizeof(*events) * seg_size);
 +	timeout.tv_sec = 0;
 +	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
 +
 +	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
 +
 +	if (ret > 0) {
 +		for (i = 0; i < ret; i++) {
 +			os_aio_slot_t*	slot;
 +			struct iocb*	control;
 +
 +			control = (struct iocb*) events[i].obj;
 +			ut_a(control != NULL);
 +
 +			slot = (os_aio_slot_t*) control->data;
 +
 +			/* Some sanity checks. */
 +			ut_a(slot != NULL);
 +			ut_a(slot->reserved);
 +
 +#if defined(UNIV_AIO_DEBUG)
 +			fprintf(stderr,
 +				"io_getevents[%c]: slot[%p] ctx[%p]"
 +				" seg[%lu]\n",
 +				(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
 +				slot, io_ctx, segment);
 +#endif
 +
 +			/* We are not scribbling previous segment. */
 +			ut_a(slot->pos >= start_pos);
 +
 +			/* We have not overstepped to next segment. */
 +			ut_a(slot->pos < end_pos);
 +
 +			/* Mark this request as completed. The error handling
 +			will be done in the calling function. */
 +			os_mutex_enter(array->mutex);
 +			slot->n_bytes = events[i].res;
 +			slot->ret = events[i].res2;
 +			slot->io_already_done = TRUE;
 +			os_mutex_exit(array->mutex);
 +		}
 +		return;
 +	}
 +
 +	if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
 +		return;
 +	}
 +
 +	/* This error handling is for any error in collecting the
 +	IO requests. The errors, if any, for any particular IO
 +	request are simply passed on to the calling routine. */
 +
 +	switch (ret) {
 +	case -EAGAIN:
 +		/* Not enough resources! Try again. */
 +	case -EINTR:
 +		/* Interrupted! I have tested the behaviour in case of an
 +		interrupt. If we have some completed IOs available then
 +		the return code will be the number of IOs. We get EINTR only
 +		if there are no completed IOs and we have been interrupted. */
 +	case 0:
 +		/* No completed request within the timeout! Go back and
 +		check again. */
 +		goto retry;
 +	}
 +
 +	/* All other errors should cause a trap for now. */
 +	ut_print_timestamp(stderr);
 +	fprintf(stderr,
 +		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
 +		ret);
 +	ut_error;
 +}
 +
 +/**********************************************************************//**
 +This function is only used in Linux native asynchronous i/o.
 +Waits for an aio operation to complete. This function is used to wait for
 +the completed requests. The aio array of pending requests is divided
 +into segments. The thread specifies which segment or slot it wants to wait
 +for. NOTE: this function will also take care of freeing the aio slot,
 +therefore no other thread is allowed to do the freeing!
 +During shutdown (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS), when the
 +segment has no reserved slot left, the function returns TRUE with *message1
 +and *message2 set to NULL; *type is not written in that case.
 +@return	TRUE if the IO was successful */
 +UNIV_INTERN
 +ibool
 +os_aio_linux_handle(
 +/*================*/
 +	ulint	global_seg,	/*!< in: segment number in the aio array
 +				to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 is log i/o thread,
 +				then follow the non-ibuf read threads,
 +				and the last are the non-ibuf write
 +				threads. */
 +	fil_node_t**message1,	/*!< out: the messages passed with the */
 +	void**	message2,	/*!< aio request; note that in case the
 +				aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation. */
 +	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
 +{
 +	ulint		segment;
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +	ulint		n;
 +	ulint		i;
 +	ibool		ret = FALSE;
 +
 +	/* Should never be doing Sync IO here. */
 +	ut_a(global_seg != ULINT_UNDEFINED);
 +
 +	/* Find the array and the local segment. */
 +	segment = os_aio_get_array_and_local_segment(&array, global_seg);
 +	/* n is the number of slots that belong to one segment. */
 +	n = array->n_slots / array->n_segments;
 +
 +	/* Loop until we have found a completed request. */
 +	for (;;) {
 +		ibool	any_reserved = FALSE;
 +		os_mutex_enter(array->mutex);
 +		/* Scan only the slots of our own segment:
 +		[segment * n, segment * n + n). */
 +		for (i = 0; i < n; ++i) {
 +			slot = os_aio_array_get_nth_slot(
 +				array, i + segment * n);
 +			if (!slot->reserved) {
 +				continue;
 +			} else if (slot->io_already_done) {
 +				/* Something for us to work on.
 +				NOTE: we jump out with array->mutex still
 +				held; it is released after the slot has been
 +				read, below the found: label. */
 +				goto found;
 +			} else {
 +				any_reserved = TRUE;
 +			}
 +		}
 +
 +		os_mutex_exit(array->mutex);
 +
 +		/* There is no completed request.
 +		If there is no pending request at all,
 +		and the system is being shut down, exit. */
 +		if (UNIV_UNLIKELY
 +		    (!any_reserved
 +		     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
 +			*message1 = NULL;
 +			*message2 = NULL;
 +			return(TRUE);
 +		}
 +
 +		/* Wait for some request. Note that we return
 +		from wait iff we have found a request. */
 +
 +		srv_set_io_thread_op_info(global_seg,
 +			"waiting for completed aio requests");
 +		os_aio_linux_collect(array, segment, n);
 +	}
 +
 +found:
 +	/* Note that it may be that there are more than one completed
 +	IO requests. We process them one at a time. We may have a case
 +	here to improve the performance slightly by dealing with all
 +	requests in one sweep. */
 +	srv_set_io_thread_op_info(global_seg,
 +				"processing completed aio requests");
 +
 +	/* Ensure that we are scribbling only our segment. */
 +	ut_a(i < n);
 +
 +	ut_ad(slot != NULL);
 +	ut_ad(slot->reserved);
 +	ut_ad(slot->io_already_done);
 +
 +	/* Hand the caller's context back, regardless of the IO outcome. */
 +	*message1 = slot->message1;
 +	*message2 = slot->message2;
 +
 +	*type = slot->type;
 +
 +	/* Success requires both a zero return code and a transfer of
 +	exactly the requested length. */
 +	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
 +
 +		ret = TRUE;
 +	} else {
 +		/* NOTE(review): slot->ret presumably holds a negated errno
 +		value, hence the sign flip before reporting the error. */
 +		errno = -slot->ret;
 +
 +		/* os_file_handle_error does tell us if we should retry
 +		this IO. As it stands now, we don't do this retry when
 +		reaping requests from a different context than
 +		the dispatcher. This non-retry logic is the same for
 +		windows and linux native AIO.
 +		We should probably look into this to transparently
 +		re-submit the IO. */
 +		os_file_handle_error(slot->name, "Linux aio");
 +
 +		ret = FALSE;
 +	}
 +
 +	/* Release the mutex taken in the scan loop before freeing the slot. */
 +	os_mutex_exit(array->mutex);
 +
 +	os_aio_array_free_slot(array, slot);
 +
 +	return(ret);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +/**********************************************************************//**
 +Does simulated aio. This function should be called by an i/o-handler
 +thread.
 +Each call returns the messages of exactly one slot and frees that slot.
 +If a slot of this segment is already marked done it is returned at once;
 +otherwise one request is chosen (oldest if at least 2 seconds old, else
 +lowest offset), adjacent requests on the same file and of the same type
 +are merged with it, and the i/o is performed synchronously.
 +During shutdown (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS), when the
 +segment has no reserved slot, TRUE is returned with *message1 and
 +*message2 set to NULL; *type is not written in that case.
 +@return	TRUE if the aio operation succeeded */
 +UNIV_INTERN
 +ibool
 +os_aio_simulated_handle(
 +/*====================*/
 +	ulint	global_segment,	/*!< in: the number of the segment in the aio
 +				arrays to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 the log i/o thread,
 +				then follow the non-ibuf read threads, and as
 +				the last are the non-ibuf write threads */
 +	fil_node_t**message1,	/*!< out: the messages passed with the aio
 +				request; note that also in the case where
 +				the aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation, for example */
 +	void**	message2,
 +	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
 +{
 +	os_aio_array_t*	array;
 +	ulint		segment;
 +	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
 +	ulint		n_consecutive;
 +	ulint		total_len;
 +	ulint		offs;
 +	os_offset_t	lowest_offset;
 +	ulint		biggest_age;
 +	ulint		age;
 +	byte*		combined_buf;
 +	byte*		combined_buf2;
 +	ibool		ret;
 +	ibool		any_reserved;
 +	ulint		n;
 +	os_aio_slot_t*	aio_slot;
 +
 +	/* Fix compiler warning */
 +	*consecutive_ios = NULL;
 +
 +	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 +
 +restart:
 +	/* NOTE! We only access constant fields in os_aio_array. Therefore
 +	we do not have to acquire the protecting mutex yet */
 +
 +	srv_set_io_thread_op_info(global_segment,
 +				  "looking for i/o requests (a)");
 +	ut_ad(os_aio_validate_skip());
 +	ut_ad(segment < array->n_segments);
 +
 +	/* n is the number of slots that belong to one segment. */
 +	n = array->n_slots / array->n_segments;
 +
 +	/* Look through n slots after the segment * n'th slot */
 +
 +	if (array == os_aio_read_array
 +	    && os_aio_recommend_sleep_for_read_threads) {
 +
 +		/* Give other threads chance to add several i/os to the array
 +		at once. */
 +
 +		/* The array mutex is not held at this point, so we jump
 +		past the os_mutex_exit() at wait_for_io. */
 +		goto recommended_sleep;
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment,
 +				  "looking for i/o requests (b)");
 +
 +	/* Check if there is a slot for which the i/o has already been
 +	done */
 +	any_reserved = FALSE;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +		if (!slot->reserved) {
 +			continue;
 +		} else if (slot->io_already_done) {
 +
 +			if (os_aio_print_debug) {
 +				fprintf(stderr,
 +					"InnoDB: i/o for slot %lu"
 +					" already done, returning\n",
 +					(ulong) i);
 +			}
 +
 +			/* NOTE(review): a slot completed by an earlier
 +			merged i/o is always reported as TRUE here; the
 +			result of that i/o was returned only together with
 +			the first slot of the batch. */
 +			aio_slot = slot;
 +			ret = TRUE;
 +			goto slot_io_done;
 +		} else {
 +			any_reserved = TRUE;
 +		}
 +	}
 +
 +	/* There is no completed request.
 +	If there is no pending request at all,
 +	and the system is being shut down, exit. */
 +	if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
 +		os_mutex_exit(array->mutex);
 +		*message1 = NULL;
 +		*message2 = NULL;
 +		return(TRUE);
 +	}
 +
 +	n_consecutive = 0;
 +
 +	/* If there are at least 2 seconds old requests, then pick the oldest
 +	one to prevent starvation. If several requests have the same age,
 +	then pick the one at the lowest offset. */
 +
 +	biggest_age = 0;
 +	lowest_offset = IB_UINT64_MAX;
 +
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +		if (slot->reserved) {
 +
 +			/* Age of the request in whole seconds. */
 +			age = (ulint) difftime(
 +				ut_time(), slot->reservation_time);
 +
 +			if ((age >= 2 && age > biggest_age)
 +			    || (age >= 2 && age == biggest_age
 +				&& slot->offset < lowest_offset)) {
 +
 +				/* Found an i/o request */
 +				consecutive_ios[0] = slot;
 +
 +				n_consecutive = 1;
 +
 +				biggest_age = age;
 +				lowest_offset = slot->offset;
 +			}
 +		}
 +	}
 +
 +	if (n_consecutive == 0) {
 +		/* There were no old requests. Look for an i/o request at the
 +		lowest offset in the array (we ignore the high 32 bits of the
 +		offset in these heuristics) */
 +
 +		lowest_offset = IB_UINT64_MAX;
 +
 +		for (ulint i = 0; i < n; i++) {
 +			os_aio_slot_t*	slot;
 +
 +			slot = os_aio_array_get_nth_slot(
 +				array, i + segment * n);
 +
 +			if (slot->reserved && slot->offset < lowest_offset) {
 +
 +				/* Found an i/o request */
 +				consecutive_ios[0] = slot;
 +
 +				n_consecutive = 1;
 +
 +				lowest_offset = slot->offset;
 +			}
 +		}
 +	}
 +
 +	if (n_consecutive == 0) {
 +
 +		/* No i/o requested at the moment */
 +
 +		goto wait_for_io;
 +	}
 +
 +	/* if n_consecutive != 0, then we have assigned
 +	something valid to consecutive_ios[0] */
 +	ut_ad(n_consecutive != 0);
 +	ut_ad(consecutive_ios[0] != NULL);
 +
 +	aio_slot = consecutive_ios[0];
 +
 +	/* Check if there are several consecutive blocks to read or write */
 +
 +	/* aio_slot is the current tail of the chain. Each time a slot that
 +	starts exactly where the tail ends (same type, same file) is found,
 +	it becomes the new tail and the scan restarts from the first slot,
 +	until no successor exists or OS_AIO_MERGE_N_CONSECUTIVE is reached. */
 +consecutive_loop:
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +		if (slot->reserved
 +		    && slot != aio_slot
 +		    && slot->offset == aio_slot->offset + aio_slot->len
 +		    && slot->type == aio_slot->type
 +		    && slot->file.m_file == aio_slot->file.m_file) {
 +
 +			/* Found a consecutive i/o request */
 +
 +			consecutive_ios[n_consecutive] = slot;
 +			n_consecutive++;
 +
 +			aio_slot = slot;
 +
 +			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
 +
 +				goto consecutive_loop;
 +			} else {
 +				break;
 +			}
 +		}
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
 +
 +	/* We have now collected n_consecutive i/o requests in the array;
 +	allocate a single buffer which can hold all data, and perform the
 +	i/o */
 +
 +	total_len = 0;
 +	/* From here on aio_slot is the FIRST slot of the chain again: its
 +	file, name, offset and type describe the merged request. */
 +	aio_slot = consecutive_ios[0];
 +
 +	for (ulint i = 0; i < n_consecutive; i++) {
 +		total_len += consecutive_ios[i]->len;
 +	}
 +
 +	if (n_consecutive == 1) {
 +		/* We can use the buffer of the i/o request */
 +		combined_buf = aio_slot->buf;
 +		combined_buf2 = NULL;
 +	} else {
 +		/* combined_buf2 is the raw allocation (freed later);
 +		combined_buf is the pointer actually used for the i/o.
 +		NOTE(review): the extra UNIV_PAGE_SIZE bytes presumably
 +		leave room for ut_align() to round the start up to a page
 +		boundary. */
 +		combined_buf2 = static_cast<byte*>(
 +			ut_malloc(total_len + UNIV_PAGE_SIZE));
 +
 +		ut_a(combined_buf2);
 +
 +		combined_buf = static_cast<byte*>(
 +			ut_align(combined_buf2, UNIV_PAGE_SIZE));
 +	}
 +
 +	/* We release the array mutex for the time of the i/o: NOTE that
 +	this assumes that there is just one i/o-handler thread serving
 +	a single segment of slots! */
 +
 +	os_mutex_exit(array->mutex);
 +
 +	if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
 +		/* Copy the buffers to the combined buffer */
 +		offs = 0;
 +
 +		for (ulint i = 0; i < n_consecutive; i++) {
 +
 +			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
 +				  consecutive_ios[i]->len);
 +
 +			offs += consecutive_ios[i]->len;
 +		}
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "doing file i/o");
 +
 +	/* Do the i/o with ordinary, synchronous i/o functions: */
 +	if (aio_slot->type == OS_FILE_WRITE) {
 +		ut_ad(!srv_read_only_mode);
 +		ret = os_file_write(
 +			aio_slot->name, aio_slot->file, combined_buf,
 +			aio_slot->offset, total_len);
 +
 +		/* Debug-only fault injection: pretend the write failed
 +		with errno 28. */
 +		DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +			os_has_said_disk_full = FALSE;
 +			ret = 0;
 +			errno = 28;);
 +
 +		if (!ret) {
 +			os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE);
 +		}
 +
 +	} else {
 +		ret = os_file_read(
 +			aio_slot->file, combined_buf,
 +			aio_slot->offset, total_len);
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "file i/o done");
 +
 +	if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
 +		/* Copy the combined buffer to individual buffers */
 +		offs = 0;
 +
 +		for (ulint i = 0; i < n_consecutive; i++) {
 +
 +			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
 +				  consecutive_ios[i]->len);
 +			offs += consecutive_ios[i]->len;
 +		}
 +	}
 +
 +	/* Only set when a temporary merge buffer was allocated above. */
 +	if (combined_buf2) {
 +		ut_free(combined_buf2);
 +	}
 +
 +	os_mutex_enter(array->mutex);
 +
 +	/* Mark the i/os done in slots */
 +
 +	for (ulint i = 0; i < n_consecutive; i++) {
 +		consecutive_ios[i]->io_already_done = TRUE;
 +	}
 +
 +	/* We return the messages for the first slot now, and if there were
 +	several slots, the messages will be returned with subsequent calls
 +	of this function */
 +
 +slot_io_done:
 +
 +	/* Reached with array->mutex held, either from the "already done"
 +	scan or after performing the i/o above. */
 +	ut_a(aio_slot->reserved);
 +
 +	*message1 = aio_slot->message1;
 +	*message2 = aio_slot->message2;
 +
 +	*type = aio_slot->type;
 +
 +	os_mutex_exit(array->mutex);
 +
 +	os_aio_array_free_slot(array, aio_slot);
 +
 +	return(ret);
 +
 +wait_for_io:
 +	srv_set_io_thread_op_info(global_segment, "resetting wait event");
 +
 +	/* We wait here until there again can be i/os in the segment
 +	of this thread */
 +
 +	/* The event is reset while array->mutex is still held, and the
 +	mutex is released only afterwards. */
 +	os_event_reset(os_aio_segment_wait_events[global_segment]);
 +
 +	os_mutex_exit(array->mutex);
 +
 +recommended_sleep:
 +	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
 +
 +	os_event_wait(os_aio_segment_wait_events[global_segment]);
 +
 +	goto restart;
 +}
 +
 +/**********************************************************************//**
 +Checks one aio array for internal consistency while holding its mutex:
 +the array must have at least one slot and one segment, every reserved
 +slot must have a nonzero length, and the number of reserved slots must
 +match the array's n_reserved counter. Any violation trips ut_a().
 +@return	true if ok */
 +static
 +bool
 +os_aio_array_validate(
 +/*==================*/
 +	os_aio_array_t*	array)	/*!< in: aio wait array */
 +{
 +	ulint	reserved_count = 0;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	ut_a(array->n_slots > 0);
 +	ut_a(array->n_segments > 0);
 +
 +	for (ulint slot_no = 0; slot_no < array->n_slots; ++slot_no) {
 +		os_aio_slot_t*	cur
 +			= os_aio_array_get_nth_slot(array, slot_no);
 +
 +		if (!cur->reserved) {
 +			/* Free slots carry no invariants to check. */
 +			continue;
 +		}
 +
 +		++reserved_count;
 +		ut_a(cur->len > 0);
 +	}
 +
 +	/* The cached counter must agree with what we just counted. */
 +	ut_a(array->n_reserved == reserved_count);
 +
 +	os_mutex_exit(array->mutex);
 +
 +	return(true);
 +}
 +
 +/**********************************************************************//**
 +Validates the consistency of the aio system. The read array is always
 +checked; the write, ibuf, log and sync arrays are checked only when they
 +exist.
 +@return	TRUE if ok */
 +UNIV_INTERN
 +ibool
 +os_aio_validate(void)
 +/*=================*/
 +{
 +	/* Arrays that may be absent, in the order they are validated. The
 +	globals are referenced by address so that each one is read only
 +	when its turn comes. */
 +	os_aio_array_t** const	optional_arrays[] = {
 +		&os_aio_write_array,
 +		&os_aio_ibuf_array,
 +		&os_aio_log_array,
 +		&os_aio_sync_array
 +	};
 +	const ulint		n_optional
 +		= sizeof(optional_arrays) / sizeof(optional_arrays[0]);
 +
 +	os_aio_array_validate(os_aio_read_array);
 +
 +	for (ulint k = 0; k < n_optional; ++k) {
 +		if (*optional_arrays[k] != 0) {
 +			os_aio_array_validate(*optional_arrays[k]);
 +		}
 +	}
 +
 +	return(TRUE);
 +}
 +
 +/**********************************************************************//**
 +Prints pending IO requests per segment of an aio array, formatted as
 +" [a, b, ...] ". Nothing is printed for an array with a single segment.
 +We probably don't need per segment statistics but they can help us
 +during development phase to see if the IO requests are being
 +distributed as expected. */
 +static
 +void
 +os_aio_print_segment_info(
 +/*======================*/
 +	FILE*		file,	/*!< in: file where to print */
 +	ulint*		n_seg,	/*!< in: pending IO array; must hold at
 +				least array->n_segments elements */
 +	os_aio_array_t*	array)	/*!< in: array to process */
 +{
 +	ulint	i;
 +
 +	ut_ad(array);
 +	ut_ad(n_seg);
 +	ut_ad(array->n_segments > 0);
 +
 +	if (array->n_segments == 1) {
 +		return;
 +	}
 +
 +	fprintf(file, " [");
 +	for (i = 0; i < array->n_segments; i++) {
 +		if (i != 0) {
 +			fprintf(file, ", ");
 +		}
 +
 +		/* n_seg[i] is a ulint; use the ulint format macro instead
 +		of "%lu" so that the conversion matches the argument type
 +		on platforms where ulint is not unsigned long. */
 +		fprintf(file, ULINTPF, n_seg[i]);
 +	}
 +	fprintf(file, "] ");
 +}
 +
 +/**********************************************************************//**
 +Prints the number of reserved slots of an aio array, followed by the
 +per-segment breakdown produced by os_aio_print_segment_info(). The array
 +mutex is held for the whole scan and printout. */
 +UNIV_INTERN
 +void
 +os_aio_print_array(
 +/*==============*/
 +	FILE*		file,	/*!< in: file where to print */
 +	os_aio_array_t*	array)	/*!< in: aio array to print */
 +{
 +	ulint	total_reserved = 0;
 +	/* Reserved-slot count per segment, all starting at zero. */
 +	ulint	per_segment[SRV_MAX_N_IO_THREADS] = {0};
 +
 +	os_mutex_enter(array->mutex);
 +
 +	ut_a(array->n_slots > 0);
 +	ut_a(array->n_segments > 0);
 +
 +	for (ulint slot_no = 0; slot_no < array->n_slots; ++slot_no) {
 +		os_aio_slot_t*	cur
 +			= os_aio_array_get_nth_slot(array, slot_no);
 +
 +		if (!cur->reserved) {
 +			continue;
 +		}
 +
 +		/* Segment that owns this slot index. */
 +		const ulint	owner
 +			= (slot_no * array->n_segments) / array->n_slots;
 +
 +		++total_reserved;
 +		++per_segment[owner];
 +
 +		ut_a(cur->len > 0);
 +	}
 +
 +	/* The cached counter must agree with what we just counted. */
 +	ut_a(array->n_reserved == total_reserved);
 +
 +	fprintf(file, " %lu", (ulong) total_reserved);
 +
 +	os_aio_print_segment_info(file, per_segment, array);
 +
 +	os_mutex_exit(array->mutex);
 +}
 +
 +/**********************************************************************//**
 +Prints info of the aio arrays: the state of every i/o thread, the pending
 +request counts of each existing aio array, pending flush and cumulative
 +i/o counters, and per-second rates since the previous printout. The
 +"old" counters and the last-printout time are refreshed at the end. */
 +UNIV_INTERN
 +void
 +os_aio_print(
 +/*=========*/
 +	FILE*	file)	/*!< in: file where to print */
 +{
 +	/* Arrays that may be absent, each with the label that precedes its
 +	figures. The globals are referenced by address so that each one is
 +	read only when its turn comes. */
 +	const struct {
 +		const char*		label;
 +		os_aio_array_t**	array;
 +	} optional_arrays[] = {
 +		{", aio writes:", &os_aio_write_array},
 +		{",\n ibuf aio reads:", &os_aio_ibuf_array},
 +		{", log i/o's:", &os_aio_log_array},
 +		{", sync i/o's:", &os_aio_sync_array}
 +	};
 +	const ulint	n_optional
 +		= sizeof(optional_arrays) / sizeof(optional_arrays[0]);
 +
 +	/* One line per i/o handler thread. */
 +	for (ulint thread_no = 0;
 +	     thread_no < srv_n_file_io_threads;
 +	     ++thread_no) {
 +
 +		fprintf(file, "I/O thread %lu state: %s (%s)",
 +			(ulong) thread_no,
 +			srv_io_thread_op_info[thread_no],
 +			srv_io_thread_function[thread_no]);
 +
 +#ifndef _WIN32
 +		if (!srv_use_native_aio
 +		    && os_aio_segment_wait_events[thread_no]->is_set) {
 +			fputs(" ev set", file);
 +		}
 +#endif /* _WIN32 */
 +
 +		fputs("\n", file);
 +	}
 +
 +	/* Pending requests per array; the read array always exists. */
 +	fputs("Pending normal aio reads:", file);
 +	os_aio_print_array(file, os_aio_read_array);
 +
 +	for (ulint k = 0; k < n_optional; ++k) {
 +		if (*optional_arrays[k].array != 0) {
 +			fputs(optional_arrays[k].label, file);
 +			os_aio_print_array(file, *optional_arrays[k].array);
 +		}
 +	}
 +
 +	putc('\n', file);
 +
 +	const time_t	current_time = ut_time();
 +	/* The 0.001 term keeps the divisor nonzero when no time has
 +	passed since the last printout. */
 +	const double	time_elapsed
 +		= 0.001 + difftime(current_time, os_last_printout);
 +
 +	fprintf(file,
 +		"Pending flushes (fsync) log: " ULINTPF
 +		"; buffer pool: " ULINTPF "\n"
 +		ULINTPF " OS file reads, "
 +		ULINTPF " OS file writes, "
 +		ULINTPF " OS fsyncs\n",
 +		fil_n_pending_log_flushes,
 +		fil_n_pending_tablespace_flushes,
 +		os_n_file_reads,
 +		os_n_file_writes,
 +		os_n_fsyncs);
 +
 +	const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
 +	const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
 +
 +	/* This line is omitted entirely when nothing is pending. */
 +	if (n_reads != 0 || n_writes != 0) {
 +		fprintf(file,
 +			ULINTPF " pending reads, " ULINTPF " pending writes\n",
 +			n_reads, n_writes);
 +	}
 +
 +	/* Average read size over the interval; zero if nothing was read. */
 +	const double	avg_bytes_read
 +		= (os_n_file_reads == os_n_file_reads_old)
 +		? 0.0
 +		: (double) os_bytes_read_since_printout
 +		/ (os_n_file_reads - os_n_file_reads_old);
 +
 +	fprintf(file,
 +		"%.2f reads/s, %lu avg bytes/read,"
 +		" %.2f writes/s, %.2f fsyncs/s\n",
 +		(os_n_file_reads - os_n_file_reads_old)
 +		/ time_elapsed,
 +		(ulong) avg_bytes_read,
 +		(os_n_file_writes - os_n_file_writes_old)
 +		/ time_elapsed,
 +		(os_n_fsyncs - os_n_fsyncs_old)
 +		/ time_elapsed);
 +
 +	/* Start a new measurement interval. */
 +	os_n_file_reads_old = os_n_file_reads;
 +	os_n_file_writes_old = os_n_file_writes;
 +	os_n_fsyncs_old = os_n_fsyncs;
 +	os_bytes_read_since_printout = 0;
 +
 +	os_last_printout = current_time;
 +}
 +
 +/**********************************************************************//**
 +Starts a new measurement interval for the per-second averages: the
 +current cumulative read, write and fsync counters become the baseline,
 +the byte counter of the interval is zeroed, and the printout timestamp
 +is set to the current time. */
 +UNIV_INTERN
 +void
 +os_aio_refresh_stats(void)
 +/*======================*/
 +{
 +	/* Snapshot the cumulative counters as the new baseline. */
 +	os_n_fsyncs_old = os_n_fsyncs;
 +	os_n_file_writes_old = os_n_file_writes;
 +	os_n_file_reads_old = os_n_file_reads;
 +
 +	/* Nothing has been read yet in the new interval. */
 +	os_bytes_read_since_printout = 0;
 +
 +	os_last_printout = time(NULL);
 +}
 +
 +#ifdef UNIV_DEBUG
 +/**********************************************************************//**
 +Checks that all slots in the system have been freed, that is, there are
 +no pending io operations. The reserved-slot counters of the read array
 +and of every other aio array that exists (write, ibuf, log, sync) are
 +summed, each one read under its own array mutex.
 +@return	TRUE if all free */
 +UNIV_INTERN
 +ibool
 +os_aio_all_slots_free(void)
 +/*=======================*/
 +{
 +	/* Arrays that may be absent; os_aio_validate() treats the same
 +	four as optional. The previous code asserted that the write, ibuf
 +	and log arrays were NULL and then dereferenced them, so it could
 +	only fail the assertion or crash. */
 +	os_aio_array_t** const	optional_arrays[] = {
 +		&os_aio_write_array,
 +		&os_aio_ibuf_array,
 +		&os_aio_log_array,
 +		&os_aio_sync_array
 +	};
 +	const ulint	n_optional
 +		= sizeof(optional_arrays) / sizeof(optional_arrays[0]);
 +	os_aio_array_t*	array;
 +	ulint		n_res	= 0;
 +
 +	/* The read array is mandatory, as in the original code. */
 +	array = os_aio_read_array;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	n_res += array->n_reserved;
 +
 +	os_mutex_exit(array->mutex);
 +
 +	for (ulint k = 0; k < n_optional; ++k) {
 +		array = *optional_arrays[k];
 +
 +		if (array == 0) {
 +			/* An array that was never created cannot have
 +			pending requests. */
 +			continue;
 +		}
 +
 +		os_mutex_enter(array->mutex);
 +
 +		n_res += array->n_reserved;
 +
 +		os_mutex_exit(array->mutex);
 +	}
 +
 +	if (n_res == 0) {
 +
 +		return(TRUE);
 +	}
 +
 +	return(FALSE);
 +}
 +#endif /* UNIV_DEBUG */
 +
 +#endif /* !UNIV_HOTBACKUP */
diff --cc storage/xtradb/handler/ha_innodb.cc
index d3e3109951b,65c5ce69713..440e14e1989
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@@ -1,10 -1,8 +1,10 @@@
  /*****************************************************************************
  
 -Copyright (c) 2000, 2015, Oracle and/or its affiliates. All Rights Reserved.
 +Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
  Copyright (c) 2008, 2009 Google Inc.
  Copyright (c) 2009, Percona Inc.
 +Copyright (c) 2012, Facebook Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2018, MariaDB Corporation.
  
  Portions of this file contain modifications contributed and copyrighted by
  Google, Inc. Those modifications are gratefully acknowledged and are described
@@@ -1131,3238 -972,216 +1131,3242 @@@ static SHOW_VAR innodb_status_variables
    {NullS, NullS, SHOW_LONG}
  };
  
 -/* General functions */
 -
 -/******************************************************************//**
 -Returns true if the thread is the replication thread on the slave
 -server. Used in srv_conc_enter_innodb() to determine if the thread
 -should be allowed to enter InnoDB - the replication thread is treated
 -differently than other threads. Also used in
 -srv_conc_force_exit_innodb().
 -@return	true if thd is the replication thread */
 -extern "C" UNIV_INTERN
 -ibool
 -thd_is_replication_slave_thread(
 -/*============================*/
 -	const void*	thd)	/*!< in: thread handle (THD*) */
 -{
 -	return((ibool) thd_slave_thread((THD*) thd));
 -}
 +/************************************************************************//**
 +Handling the shared INNOBASE_SHARE structure that is needed to provide table
 +locking. Register the table name if it doesn't exist in the hash table. */
 +static
 +INNOBASE_SHARE*
 +get_share(
 +/*======*/
 +	const char*	table_name);	/*!< in: table to lookup */
  
 -/******************************************************************//**
 -Save some CPU by testing the value of srv_thread_concurrency in inline
 -functions. */
 -static inline
 +/************************************************************************//**
 +Free the shared object that was registered with get_share(). */
 +static
  void
 -innodb_srv_conc_enter_innodb(
 -/*=========================*/
 -	trx_t*	trx)	/*!< in: transaction handle */
 -{
 -	if (UNIV_LIKELY(!srv_thread_concurrency)) {
 +free_share(
 +/*=======*/
 +	INNOBASE_SHARE*	share);		/*!< in/own: share to free */
  
 -		return;
 -	}
 +/*****************************************************************//**
 +Frees a possible InnoDB trx object associated with the current THD.
 +@return	0 or error number */
 +static
 +int
 +innobase_close_connection(
 +/*======================*/
 +	handlerton*	hton,		/*!< in/out: Innodb handlerton */
 +	THD*		thd);		/*!< in: MySQL thread handle for
 +					which to close the connection */
  
 -	srv_conc_enter_innodb(trx);
 -}
 +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all);
 +static void innobase_checkpoint_request(handlerton *hton, void *cookie);
  
 -/******************************************************************//**
 -Save some CPU by testing the value of srv_thread_concurrency in inline
 -functions. */
 -static inline
 +/*****************************************************************//**
 +Cancel any pending lock request associated with the current THD. */
 +static
  void
 -innodb_srv_conc_exit_innodb(
 -/*========================*/
 -	trx_t*	trx)	/*!< in: transaction handle */
 -{
 -	if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) {
 -
 -		return;
 -	}
 +innobase_kill_connection(
 +/*======================*/
 +        handlerton*	hton,	/*!< in:  innobase handlerton */
 +	THD*	thd,	/*!< in: handle to the MySQL thread being killed */
 +        thd_kill_levels);
  
 -	srv_conc_exit_innodb(trx);
 -}
 +/*****************************************************************//**
 +Commits a transaction in an InnoDB database or marks an SQL statement
 +ended.
 +@return	0 */
 +static
 +int
 +innobase_commit(
 +/*============*/
 +	handlerton*	hton,		/*!< in/out: Innodb handlerton */
 +	THD*		thd,		/*!< in: MySQL thread handle of the
 +					user for whom the transaction should
 +					be committed */
 +	bool		commit_trx);	/*!< in: true - commit transaction
 +					false - the current SQL statement
 +					ended */
  
 -/******************************************************************//**
 -Force a thread to leave InnoDB even if it has spare tickets. */
 -static inline
 -void
 -innodb_srv_conc_force_exit_innodb(
 -/*==============================*/
 -	trx_t*	trx)	/*!< in: transaction handle */
 -{
 -#ifdef UNIV_SYNC_DEBUG
 -	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
 -#endif /* UNIV_SYNC_DEBUG */
 +/*****************************************************************//**
 +Rolls back a transaction to a savepoint.
 +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 +given name */
 +static
 +int
 +innobase_rollback(
 +/*==============*/
 +	handlerton*	hton,		/*!< in/out: Innodb handlerton */
 +	THD*		thd,		/*!< in: handle to the MySQL thread
 +					of the user whose transaction should
 +					be rolled back */
 +	bool		rollback_trx);	/*!< in: TRUE - rollback entire
 +					transaction FALSE - rollback the current
 +					statement only */
  
 -	if (trx->declared_to_be_inside_innodb) {
 +/*****************************************************************//**
 +Rolls back a transaction to a savepoint.
 +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 +given name */
 +static
 +int
 +innobase_rollback_to_savepoint(
 +/*===========================*/
 +	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
 +	THD*		thd,		/*!< in: handle to the MySQL thread of
 +					the user whose XA transaction should
 +					be rolled back to savepoint */
 +	void*		savepoint);	/*!< in: savepoint data */
  
 -		srv_conc_force_exit_innodb(trx);
 -	}
 -}
 +/*****************************************************************//**
 +Check whether innodb state allows to safely release MDL locks after
 +rollback to savepoint.
 +@return true if it is safe, false if its not safe. */
 +static
 +bool
 +innobase_rollback_to_savepoint_can_release_mdl(
 +/*===========================================*/
 +	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
 +	THD*		thd);		/*!< in: handle to the MySQL thread of
 +					the user whose XA transaction should
 +					be rolled back to savepoint */
  
 -/******************************************************************//**
 -Returns true if the transaction this thread is processing has edited
 -non-transactional tables. Used by the deadlock detector when deciding
 -which transaction to rollback in case of a deadlock - we try to avoid
 -rolling back transactions that have edited non-transactional tables.
 -@return	true if non-transactional tables have been edited */
 -extern "C" UNIV_INTERN
 -ibool
 -thd_has_edited_nontrans_tables(
 -/*===========================*/
 -	void*	thd)	/*!< in: thread handle (THD*) */
 -{
 -	return((ibool) thd_non_transactional_update((THD*) thd));
 -}
 +/*****************************************************************//**
 +Sets a transaction savepoint.
 +@return	always 0, that is, always succeeds */
 +static
 +int
 +innobase_savepoint(
 +/*===============*/
 +	handlerton*	hton,		/*!< in/out: InnoDB handlerton */
 +	THD*		thd,		/*!< in: handle to the MySQL thread of
 +					the user's XA transaction for which
 +					we need to take a savepoint */
 +	void*		savepoint);	/*!< in: savepoint data */
  
 -/******************************************************************//**
 -Returns true if the thread is executing a SELECT statement.
 -@return	true if thd is executing SELECT */
 -extern "C" UNIV_INTERN
 -ibool
 -thd_is_select(
 -/*==========*/
 -	const void*	thd)	/*!< in: thread handle (THD*) */
 -{
 -	return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT);
 -}
 +/*****************************************************************//**
 +Release transaction savepoint name.
 +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 +given name */
 +static
 +int
 +innobase_release_savepoint(
 +/*=======================*/
 +	handlerton*	hton,		/*!< in/out: handlerton for Innodb */
 +	THD*		thd,		/*!< in: handle to the MySQL thread
 +					of the user whose transaction's
 +					savepoint should be released */
 +	void*		savepoint);	/*!< in: savepoint data */
  
 -/******************************************************************//**
 -Returns true if the thread supports XA,
 -global value of innodb_supports_xa if thd is NULL.
 -@return	true if thd has XA support */
 -extern "C" UNIV_INTERN
 -ibool
 -thd_supports_xa(
 -/*============*/
 -	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
 -			the global innodb_supports_xa */
 -{
 -	return(THDVAR((THD*) thd, support_xa));
 -}
 +/************************************************************************//**
 +Function for constructing an InnoDB table handler instance. */
 +static
 +handler*
 +innobase_create_handler(
 +/*====================*/
 +	handlerton*	hton,		/*!< in/out: handlerton for Innodb */
 +	TABLE_SHARE*	table,
 +	MEM_ROOT*	mem_root);
  
 -/******************************************************************//**
 -Check the status of fake changes mode (innodb_fake_changes)
 -@return	true	if fake change mode is enabled. */
 -extern "C" UNIV_INTERN
 -ibool
 -thd_fake_changes(
 -/*=============*/
 -	void*	thd)	/*!< in: thread handle, or NULL to query
 -			the global innodb_supports_xa */
 -{
 -	return(THDVAR((THD*) thd, fake_changes));
 -}
 +/** @brief Initialize the default value of innodb_commit_concurrency.
  
 -/******************************************************************//**
 -Returns the lock wait timeout for the current connection.
 -@return	the lock wait timeout, in seconds */
 -extern "C" UNIV_INTERN
 -ulong
 -thd_lock_wait_timeout(
 -/*==================*/
 -	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
 -			the global innodb_lock_wait_timeout */
 -{
 -	/* According to <mysql/plugin.h>, passing thd == NULL
 -	returns the global value of the session variable. */
 -	return(THDVAR((THD*) thd, lock_wait_timeout));
 -}
 +Once InnoDB is running, the innodb_commit_concurrency must not change
 +from zero to nonzero. (Bug #42101)
  
 -/******************************************************************//**
 -Set the time waited for the lock for the current query. */
 -extern "C" UNIV_INTERN
 -void
 -thd_set_lock_wait_time(
 +The initial default value is 0, and without this extra initialization,
 +SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
 +to 0, even if it was initially set to nonzero at the command line
 +or configuration file. */
 +static
 +void
 +innobase_commit_concurrency_init_default();
 +/*=======================================*/
 +
 +/** @brief Initialize the default and max value of innodb_undo_logs.
 +
 +Once InnoDB is running, the default value and the max value of
 +innodb_undo_logs must be equal to the available undo logs,
 +given by srv_available_undo_logs. */
 +static
 +void
 +innobase_undo_logs_init_default_max();
 +/*==================================*/
 +
 +/************************************************************//**
 +Validate the file format name and return its corresponding id.
 +@return	valid file format id */
 +static
 +uint
 +innobase_file_format_name_lookup(
 +/*=============================*/
 +	const char*	format_name);	/*!< in: pointer to file format
 +					name */
 +/************************************************************//**
 +Validate the file format check config parameters, as a side effect it
 +sets the srv_max_file_format_at_startup variable.
 +@return	the format_id if valid config value, otherwise, return -1 */
 +static
 +int
 +innobase_file_format_validate_and_set(
 +/*==================================*/
 +	const char*	format_max);	/*!< in: parameter value */
 +
 +/*******************************************************************//**
 +This function is used to prepare an X/Open XA distributed transaction.
 +@return	0 or error number */
 +static
 +int
 +innobase_xa_prepare(
 +/*================*/
 +	handlerton*	hton,		/*!< in: InnoDB handlerton */
 +	THD*		thd,		/*!< in: handle to the MySQL thread of
 +					the user whose XA transaction should
 +					be prepared */
 +	bool		all);		/*!< in: true - prepare transaction
 +					false - the current SQL statement
 +					ended */
 +/*******************************************************************//**
 +This function is used to recover X/Open XA distributed transactions.
 +@return	number of prepared transactions stored in xid_list */
 +static
 +int
 +innobase_xa_recover(
 +/*================*/
 +	handlerton*	hton,		/*!< in: InnoDB handlerton */
 +	XID*		xid_list,	/*!< in/out: prepared transactions */
 +	uint		len);		/*!< in: number of slots in xid_list */
 +/*******************************************************************//**
 +This function is used to commit one X/Open XA distributed transaction
 +which is in the prepared state
 +@return	0 or error number */
 +static
 +int
 +innobase_commit_by_xid(
  /*===================*/
 -	void*	thd,	/*!< in: thread handle (THD*) */
 -	ulint	value)	/*!< in: time waited for the lock */
 +	handlerton*	hton,		/*!< in: InnoDB handlerton */
 +	XID*		xid);		/*!< in: X/Open XA transaction
 +					identification */
 +/*******************************************************************//**
 +This function is used to rollback one X/Open XA distributed transaction
 +which is in the prepared state
 +@return	0 or error number */
 +static
 +int
 +innobase_rollback_by_xid(
 +/*=====================*/
 +	handlerton*	hton,		/*!< in: InnoDB handlerton */
 +	XID*		xid);		/*!< in: X/Open XA transaction
 +					identification */
 +/*******************************************************************//**
 +Create a consistent view for a cursor based on current transaction
 +which is created if the corresponding MySQL thread still lacks one.
 +This consistent view is then used inside of MySQL when accessing records
 +using a cursor.
 +@return	pointer to cursor view or NULL */
 +static
 +void*
 +innobase_create_cursor_view(
 +/*========================*/
 +	handlerton*	hton,		/*!< in: innobase hton */
 +	THD*		thd);		/*!< in: user thread handle */
 +/*******************************************************************//**
 +Set the given consistent cursor view to a transaction which is created
 +if the corresponding MySQL thread still lacks one. If the given
 +consistent cursor view is NULL global read view of a transaction is
 +restored to a transaction read view. */
 +static
 +void
 +innobase_set_cursor_view(
 +/*=====================*/
 +	handlerton*	hton,		/*!< in: handlerton of Innodb */
 +	THD*		thd,		/*!< in: user thread handle */
 +	void*		curview);	/*!< in: Consistent cursor view to
 +					be set */
 +/*******************************************************************//**
 +Close the given consistent cursor view of a transaction and restore
 +global read view to a transaction read view. Transaction is created if the
 +corresponding MySQL thread still lacks one. */
 +static
 +void
 +innobase_close_cursor_view(
 +/*=======================*/
 +	handlerton*	hton,		/*!< in: handlerton of Innodb */
 +	THD*		thd,		/*!< in: user thread handle */
 +	void*		curview);	/*!< in: Consistent read view to be
 +					closed */
 +/*****************************************************************//**
 +Removes all tables in the named database inside InnoDB. */
 +static
 +void
 +innobase_drop_database(
 +/*===================*/
 +	handlerton*	hton,		/*!< in: handlerton of Innodb */
 +	char*		path);		/*!< in: database path; inside InnoDB
 +					the name of the last directory in
 +					the path is used as the database name:
 +					for example, in 'mysql/data/test' the
 +					database name is 'test' */
 +/** Shut down the InnoDB storage engine.
 +@return	0 */
 +static
 +int
 +innobase_end(handlerton*, ha_panic_function);
 +
 +#if NOT_USED
 +/*****************************************************************//**
 +Stores the current binlog coordinates in the trx system header. */
 +static
 +int
 +innobase_store_binlog_info(
 +/*=======================*/
 +	handlerton*	hton,	/*!< in: InnoDB handlerton */
 +	THD*		thd);	/*!< in: MySQL thread handle */
 +#endif
 +
 +/*****************************************************************//**
 +Creates an InnoDB transaction struct for the thd if it does not yet have one.
 +Starts a new InnoDB transaction if a transaction is not yet started. And
 +assigns a new snapshot for a consistent read if the transaction does not yet
 +have one.
 +@return	0 */
 +static
 +int
 +innobase_start_trx_and_assign_read_view(
 +/*====================================*/
 +	handlerton*	hton,		/* in: Innodb handlerton */
 +	THD*		thd);		/* in: MySQL thread handle of the
 +					user for whom the transaction should
 +					be committed */
 +#ifdef NOT_USED
 +/*****************************************************************//**
 +Creates an InnoDB transaction struct for the thd if it does not yet have one.
 +Starts a new InnoDB transaction if a transaction is not yet started. And
 +clones snapshot for a consistent read from another session, if it has one.
 +@return	0 */
 +static
 +int
 +innobase_start_trx_and_clone_read_view(
 +/*====================================*/
 +	handlerton*	hton,		/* in: Innodb handlerton */
 +	THD*		thd,		/* in: MySQL thread handle of the
 +					user for whom the transaction should
 +					be committed */
 +	THD*		from_thd);	/* in: MySQL thread handle of the
 +					user session from which the consistent
 +					read should be cloned */
 +#endif
 +/****************************************************************//**
 +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
 +the logs, and the name of this function should be innobase_checkpoint.
 +@return	TRUE if error */
 +static
 +bool
 +innobase_flush_logs(
 +/*================*/
 +	handlerton*	hton);		/*!< in: InnoDB handlerton */
 +
 +/************************************************************************//**
 +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
 +InnoDB Monitor to the client.
 +@return 0 on success */
 +static
 +int
 +innodb_show_status(
 +/*===============*/
 +	handlerton*	hton,		/*!< in: the innodb handlerton */
 +	THD*		thd,		/*!< in: the MySQL query thread of
 +					the caller */
 +	stat_print_fn*	stat_print);
 +/************************************************************************//**
 +Return 0 on success and non-zero on failure. Note: the bool return type
 +seems to be abused here, should be an int. */
 +static
 +bool
 +innobase_show_status(
 +/*=================*/
 +	handlerton*		hton,	/*!< in: the innodb handlerton */
 +	THD*			thd,	/*!< in: the MySQL query thread of
 +					the caller */
 +	stat_print_fn*		stat_print,
 +	enum ha_stat_type	stat_type);
 +
 +/*****************************************************************//**
 +Commits a transaction in an InnoDB database. */
 +static
 +void
 +innobase_commit_low(
 +/*================*/
 +	trx_t*	trx);	/*!< in: transaction handle */
 +
 +/****************************************************************//**
 +Parse and enable InnoDB monitor counters during server startup.
 +User can enable monitor counters/groups by specifying
 +"loose-innodb_monitor_enable = monitor_name1;monitor_name2..."
 +in server configuration file or at the command line. */
 +static
 +void
 +innodb_enable_monitor_at_startup(
 +/*=============================*/
 +	char*	str);	/*!< in: monitor counter enable list */
 +
 +/*********************************************************************
 +Normalizes a table name string. A normalized name consists of the
 +database name catenated to '/' and table name. An example:
 +test/mytable. On Windows normalization puts both the database name and the
 +table name always to lower case if "set_lower_case" is set to TRUE. */
 +void
 +normalize_table_name_low(
 +/*=====================*/
 +	char*           norm_name,      /* out: normalized name as a
 +					null-terminated string */
 +	const char*     name,           /* in: table name string */
 +	ibool           set_lower_case); /* in: TRUE if we want to set
 +					 name to lower case */
 +
 +#ifdef NOT_USED
 +/*************************************************************//**
 +Removes old archived transaction log files. Either all files modified
 +before a timestamp are purged, or this and earlier files than the one
 +named by to_filename; a positive before_date takes precedence.
 +@return	true on error */
 +static bool innobase_purge_archive_logs(
 +	handlerton *hton,		/*!< in: InnoDB handlerton */
 +	time_t before_date,		/*!< in: all files modified
 +					before timestamp should be removed */
 +	const char* to_filename)	/*!< in: this and earlier files
 +					should be removed */
 +{
 +	ulint err= DB_ERROR;
 +	if (before_date > 0) {
 +		err= purge_archived_logs(before_date, 0);
 +	} else if (to_filename
 +		   && is_prefix(to_filename, IB_ARCHIVED_LOGS_PREFIX)) {
 +		/* The LSN is the decimal suffix that follows the prefix.
 +		Parse it as unsigned: strtoull() reports overflow as
 +		ULLONG_MAX, which the range check below rejects, whereas
 +		strtoll() would saturate at LLONG_MAX and slip through. */
 +		unsigned long long log_file_lsn = strtoull(
 +			to_filename + IB_ARCHIVED_LOGS_PREFIX_LEN, NULL, 10);
 +		if (log_file_lsn > 0 && log_file_lsn < ULLONG_MAX) {
 +			err= purge_archived_logs(0, log_file_lsn);
 +		}
 +	}
 +	return (err != DB_SUCCESS);
 +}
 +#endif
 +
 +
 +/*************************************************************//**
 +Validate a new value for innodb_commit_concurrency. A change is accepted
 +only when it keeps the setting on the same side of zero: zero must stay
 +zero and nonzero must stay nonzero.
 +@return	0 for valid innodb_commit_concurrency */
 +static
 +int
 +innobase_commit_concurrency_validate(
 +/*=================================*/
 +	THD*				thd,	/*!< in: thread handle */
 +	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
 +						variable */
 +	void*				save,	/*!< out: immediate result
 +						for update function */
 +	struct st_mysql_value*		value)	/*!< in: incoming string */
 +{
 +	long long	requested;
 +
 +	DBUG_ENTER("innobase_commit_concurrency_validate");
 +
 +	if (value->val_int(value, &requested)) {
 +		/* A NULL value cannot be accepted. */
 +		DBUG_RETURN(1);
 +	}
 +
 +	const ulong	new_value = static_cast<ulong>(requested);
 +
 +	/* The candidate is stored for the update function even when
 +	the check below ends up rejecting it. */
 +	*reinterpret_cast<ulong*>(save) = new_value;
 +
 +	/* Reject the update when it would flip the setting between
 +	zero and nonzero. */
 +	const bool	was_zero = (innobase_commit_concurrency == 0);
 +	const bool	is_zero = (new_value == 0);
 +
 +	DBUG_RETURN(was_zero != is_zero);
 +}
 +
 +/*******************************************************************//**
 +Function for constructing an InnoDB table handler instance.
 +@return	new ha_innobase object, created with placement new on mem_root */
 +static
 +handler*
 +innobase_create_handler(
 +/*====================*/
 +	handlerton*	hton,	/*!< in: InnoDB handlerton */
 +	TABLE_SHARE*	table,	/*!< in: table share handed to the handler */
 +	MEM_ROOT*	mem_root)	/*!< in: argument of the placement new */
 +{
 +	handler*	new_handler = new (mem_root) ha_innobase(hton, table);
 +
 +	return(new_handler);
 +}
 +
 +/* General functions */
 +
 +/*************************************************************//**
 +Check that a page_size is correct for InnoDB, i.e. that it is a power
 +of 2 whose exponent lies between UNIV_PAGE_SIZE_SHIFT_MIN and
 +UNIV_PAGE_SIZE_SHIFT_MAX inclusive. Nothing is stored; the matching
 +exponent is only returned.
 +@return	an associated page_size_shift if valid, 0 if invalid. */
 +inline
 +int
 +innodb_page_size_validate(
 +/*======================*/
 +	ulong	page_size)		/*!< in: Page Size to evaluate */
 +{
 +	ulong		shift = UNIV_PAGE_SIZE_SHIFT_MIN;
 +
 +	DBUG_ENTER("innodb_page_size_validate");
 +
 +	while (shift <= UNIV_PAGE_SIZE_SHIFT_MAX) {
 +		if ((ulong) (1 << shift) == page_size) {
 +			DBUG_RETURN(shift);
 +		}
 +
 +		shift++;
 +	}
 +
 +	DBUG_RETURN(0);
 +}
 +
 +/******************************************************************//**
 +Tells whether the thread is the replication thread on the slave
 +server. srv_conc_enter_innodb() uses this to decide if the thread
 +may enter InnoDB, because the replication thread is treated
 +differently than other threads. srv_conc_force_exit_innodb() uses it
 +as well.
 +@return	true if thd is the replication thread */
 +UNIV_INTERN
 +ibool
 +thd_is_replication_slave_thread(
 +/*============================*/
 +	THD*	thd)	/*!< in: thread handle */
 +{
 +	const ibool	is_slave = (ibool) thd_slave_thread(thd);
 +
 +	return(is_slave);
 +}
 +
 +/******************************************************************//**
 +Looks up the durability property requested by the thread. The result
 +is used when a prepare or commit record is written to the log buffer.
 +@return the durability property. */
 +UNIV_INTERN
 +enum durability_properties
 +thd_requested_durability(
 +/*=====================*/
 +	const THD* thd)	/*!< in: thread handle */
 +{
 +	enum durability_properties	requested
 +		= thd_get_durability_property(thd);
 +
 +	return(requested);
 +}
 +
 +/******************************************************************//**
 +Tells whether the transaction should be flagged as read-only.
 +A NULL thd is never read-only.
 +@return	true if the thd is marked as read-only */
 +UNIV_INTERN
 +ibool
 +thd_trx_is_read_only(
 +/*=================*/
 +	THD*	thd)	/*!< in: thread handle */
 +{
 +	if (thd == 0) {
 +		return(0);
 +	}
 +
 +	return(thd_tx_is_read_only(thd) != 0);
 +}
 +
 +/******************************************************************//**
 +Check if the transaction is an auto-commit transaction. A true result
 +requires a non-NULL thd that has neither OPTION_NOT_AUTOCOMMIT nor
 +OPTION_BEGIN set and that is executing a SELECT, so it also implies
 +a read-only transaction.
 +@return	true if the transaction is an auto commit read-only transaction. */
 +UNIV_INTERN
 +ibool
 +thd_trx_is_auto_commit(
 +/*===================*/
 +	THD*	thd)	/*!< in: thread handle, can be NULL */
 +{
 +	if (thd == NULL) {
 +		return(0);
 +	}
 +
 +	if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 +		return(0);
 +	}
 +
 +	return(thd_is_select(thd) != 0);
 +}
 +
 +/******************************************************************//**
 +Enter InnoDB subject to the srv_thread_concurrency limit. The limit is
 +tested here, in an inline function, to save CPU when it is zero.
 +With WITH_WSREP, a thread for which both wsrep_on() and
 +wsrep_thd_is_BF() hold returns immediately. */
 +static inline
 +void
 +innobase_srv_conc_enter_innodb(
 +/*===========================*/
 +	trx_t*	trx)	/*!< in: transaction handle */
 +{
 +#ifdef WITH_WSREP
 +	if (wsrep_on(trx->mysql_thd)
 +	    && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
 +		return;
 +	}
 +#endif /* WITH_WSREP */
 +	if (!srv_thread_concurrency) {
 +		return;
 +	}
 +
 +	if (trx->n_tickets_to_enter_innodb > 0) {
 +		/* Spend one of the remaining 'free tickets' of the trx
 +		instead of queueing. */
 +		--trx->n_tickets_to_enter_innodb;
 +		return;
 +	}
 +
 +	if (trx->mysql_thd != NULL
 +	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
 +		/* The replication thread waits with UT_WAIT_FOR rather
 +		than calling srv_conc_enter_innodb(). */
 +		UT_WAIT_FOR(
 +			srv_conc_get_active_threads()
 +			< srv_thread_concurrency,
 +			srv_replication_delay * 1000);
 +		return;
 +	}
 +
 +	srv_conc_enter_innodb(trx);
 +}
 +
 +/******************************************************************//**
 +Note that the thread wants to leave InnoDB, but only when it has no
 +spare tickets left. With WITH_WSREP, a thread for which both
 +wsrep_on() and wsrep_thd_is_BF() hold returns immediately. */
 +static inline
 +void
 +innobase_srv_conc_exit_innodb(
 +/*==========================*/
 +	trx_t*	trx)	/*!< in: transaction handle */
 +{
 +#ifdef UNIV_SYNC_DEBUG
 +	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
 +#endif /* UNIV_SYNC_DEBUG */
 +#ifdef WITH_WSREP
 +	if (wsrep_on(trx->mysql_thd)
 +	    && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
 +		return;
 +	}
 +#endif /* WITH_WSREP */
 +
 +	/* Checking here avoids an unnecessary function call. */
 +	if (!trx->declared_to_be_inside_innodb
 +	    || trx->n_tickets_to_enter_innodb != 0) {
 +
 +		return;
 +	}
 +
 +	srv_conc_force_exit_innodb(trx);
 +}
 +
 +/******************************************************************//**
 +Make a thread leave InnoDB regardless of any spare tickets it holds.
 +Nothing is done unless the trx is declared to be inside InnoDB. */
 +static inline
 +void
 +innobase_srv_conc_force_exit_innodb(
 +/*================================*/
 +	trx_t*	trx)	/*!< in: transaction handle */
 +{
 +#ifdef UNIV_SYNC_DEBUG
 +	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
 +#endif /* UNIV_SYNC_DEBUG */
 +
 +	/* Checking here avoids an unnecessary function call. */
 +	if (!trx->declared_to_be_inside_innodb) {
 +		return;
 +	}
 +
 +	srv_conc_force_exit_innodb(trx);
 +}
 +
 +/******************************************************************//**
 +Gives access to the NUL terminated server host name, glob_hostname.
 +@return	pointer to glob_hostname. */
 +UNIV_INTERN
 +const char*
 +server_get_hostname()
 +/*=================*/
 +{
 +	const char*	hostname = glob_hostname;
 +
 +	return(hostname);
 +}
 +
 +/******************************************************************//**
 +Tells whether the transaction this thread is processing has edited
 +non-transactional tables. The deadlock detector uses this when it
 +picks the transaction to roll back: transactions that have edited
 +non-transactional tables are avoided where possible.
 +@return	true if non-transactional tables have been edited */
 +UNIV_INTERN
 +ibool
 +thd_has_edited_nontrans_tables(
 +/*===========================*/
 +	THD*	thd)	/*!< in: thread handle */
 +{
 +	const ibool	edited = (ibool) thd_non_transactional_update(thd);
 +
 +	return(edited);
 +}
 +
 +/******************************************************************//**
 +Tells whether the SQL command of the thread is SQLCOM_SELECT.
 +@return	true if thd is executing SELECT */
 +UNIV_INTERN
 +ibool
 +thd_is_select(
 +/*==========*/
 +	const THD*	thd)	/*!< in: thread handle */
 +{
 +	const bool	is_select = (thd_sql_command(thd) == SQLCOM_SELECT);
 +
 +	return(is_select);
 +}
 +
 +/******************************************************************//**
 +Reads the support_xa session variable of the thread; with a NULL thd
 +the global value of innodb_supports_xa is read instead.
 +@return	true if thd has XA support */
 +UNIV_INTERN
 +ibool
 +thd_supports_xa(
 +/*============*/
 +	THD*	thd)	/*!< in: thread handle, or NULL to query
 +			the global innodb_supports_xa */
 +{
 +	const ibool	supports_xa = THDVAR(thd, support_xa);
 +
 +	return(supports_xa);
 +}
 +
 +/** Get the value of innodb_tmpdir. An empty setting is reported as
 +NULL so that callers need to test for one case only.
 +@param[in]	thd	thread handle, or NULL to query
 +			the global innodb_tmpdir.
 +@retval NULL if innodb_tmpdir="" */
 +UNIV_INTERN
 +const char*
 +thd_innodb_tmpdir(
 +	THD*	thd)
 +{
 +#ifdef UNIV_SYNC_DEBUG
 +	ut_ad(!sync_thread_levels_nonempty_trx(false));
 +#endif /* UNIV_SYNC_DEBUG */
 +
 +	const char*	dir = THDVAR(thd, tmpdir);
 +
 +	if (dir == NULL || *dir == '\0') {
 +		return(NULL);
 +	}
 +
 +	return(dir);
 +}
 +/******************************************************************//**
 +Check the status of fake changes mode (innodb_fake_changes) by reading
 +the fake_changes session variable.
 +@return	true	if fake change mode is enabled. */
 +UNIV_INTERN
 +ibool
 +thd_fake_changes(
 +/*=============*/
 +	THD*	thd)	/*!< in: thread handle, or NULL to query
 +			the global innodb_fake_changes */
 +{
 +	const ibool	fake_changes = THDVAR(thd, fake_changes);
 +
 +	return(fake_changes);
 +}
 +
 +/******************************************************************//**
 +Reads the lock wait timeout that applies to the current connection.
 +@return	the lock wait timeout, in seconds */
 +UNIV_INTERN
 +ulong
 +thd_lock_wait_timeout(
 +/*==================*/
 +	THD*	thd)	/*!< in: thread handle, or NULL to query
 +			the global innodb_lock_wait_timeout */
 +{
 +	/* <mysql/plugin.h> states that a NULL thd yields the global
 +	value of the session variable. */
 +	const ulong	timeout = THDVAR(thd, lock_wait_timeout);
 +
 +	return(timeout);
 +}
 +
 +/******************************************************************//**
 +Record the time waited for the lock for the current query. A NULL
 +thd is silently ignored. */
 +UNIV_INTERN
 +void
 +thd_set_lock_wait_time(
 +/*===================*/
 +	THD*	thd,	/*!< in/out: thread handle */
 +	ulint	value)	/*!< in: time waited for the lock */
 +{
 +	if (!thd) {
 +		return;
 +	}
 +
 +	thd_storage_lock_wait(thd, value);
 +}
 +
 +/******************************************************************//**
 +Reads the flush_log_at_trx_commit session variable of a thread.
 +@return	value of the flush_log_at_trx_commit variable */
 +UNIV_INTERN
 +ulong
 +thd_flush_log_at_trx_commit(
 +/*================================*/
 +	void*	thd)	/*!< in: thread handle, passed as void* and
 +			cast to THD* */
 +{
 +	THD*	mysql_thd = (THD*) thd;
 +
 +	return(THDVAR(mysql_thd, flush_log_at_trx_commit));
 +}
 +
 +/********************************************************************//**
 +Obtain the InnoDB transaction of a MySQL thread. The result refers to
 +the slot returned by thd_ha_data() for innodb_hton_ptr, so assigning
 +through it changes the pointer kept for the thread.
 +@return	reference to transaction pointer */
 +MY_ATTRIBUTE((warn_unused_result, nonnull))
 +static inline
 +trx_t*&
 +thd_to_trx(
 +/*=======*/
 +	THD*	thd)	/*!< in: MySQL thread */
 +{
 +	trx_t**	slot = (trx_t**) thd_ha_data(thd, innodb_hton_ptr);
 +
 +	return(*slot);
 +}
 +
 +#ifdef WITH_WSREP
 +/** Get the id of the InnoDB transaction of a MySQL thread.
 +NOTE(review): the transaction pointer is dereferenced without a NULL
 +check, so the thread presumably must already have a transaction.
 +@return	id field of the thread's trx_t */
 +ulonglong
 +thd_to_trx_id(
 +/*=======*/
 +	THD*	thd)	/*!< in: MySQL thread */
 +{
 +	const trx_t*	trx = thd_to_trx(thd);
 +
 +	return(trx->id);
 +}
 +#endif /* WITH_WSREP */
 +
 +/** Check whether the InnoDB transaction of a thread has fake changes
 +enabled. A thread without a transaction counts as not enabled.
 +@param[in]	thd	MySQL thread
 +@return	nonzero if the thread has a trx whose fake_changes is set */
 +my_bool
 +ha_innobase::is_fake_change_enabled(THD* thd)
 +{
 +	trx_t*	trx	= thd_to_trx(thd);
 +
 +	if (trx == NULL) {
 +		return(0);
 +	}
 +
 +	return(UNIV_UNLIKELY(trx->fake_changes) != 0);
 +}
 +
 +/********************************************************************//**
 +In XtraDB it is impossible for a transaction to own a search latch outside of
 +InnoDB code, so there is nothing to release on demand.  We keep this function to
 +simplify maintenance. The body only does work when UNIV_DEBUG is defined.
 +@return 0 */
 +static
 +int
 +innobase_release_temporary_latches(
 +/*===============================*/
 +	handlerton*	hton MY_ATTRIBUTE((unused)),	/*!< in: handlerton */
 +	THD*		thd MY_ATTRIBUTE((unused)))	/*!< in: MySQL thread */
 +{
 +#ifdef UNIV_DEBUG
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
 +
 +	if (innodb_inited && thd != NULL) {
 +
 +		trx_t*	trx = thd_to_trx(thd);
 +
 +		if (trx != NULL) {
 +#ifdef UNIV_SYNC_DEBUG
 +			ut_ad(!btr_search_own_any());
 +#endif
 +			trx_search_latch_release_if_reserved(trx);
 +		}
 +	}
 +#endif
 +
 +	return(0);
 +}
 +
 +/********************************************************************//**
 +Bumps innobase_active_counter and, on every INNOBASE_WAKE_INTERVALth
 +call, invokes srv_active_wake_master_thread. Intended for cases where
 +a single database operation may introduce a small need for server
 +utility activity, like checkpointing. */
 +static inline
 +void
 +innobase_active_small(void)
 +/*=======================*/
 +{
 +	innobase_active_counter++;
 +
 +	if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) != 0) {
 +		return;
 +	}
 +
 +	srv_active_wake_master_thread();
 +}
 +
 +/********************************************************************//**
 +Converts an InnoDB error code to a MySQL error code and also tells to MySQL
 +about a possible transaction rollback inside InnoDB caused by a lock wait
 +timeout or a deadlock.
 +
 +Side effects, all visible in the switch below:
 +- DB_DEADLOCK and DB_LOCK_TABLE_FULL mark the whole transaction for
 +  rollback when thd is not NULL; DB_LOCK_WAIT_TIMEOUT passes
 +  row_rollback_on_timeout as the rollback scope instead.
 +- DB_TEMP_FILE_WRITE_FAILURE, DB_TOO_BIG_RECORD, DB_TOO_BIG_FOR_REDO and
 +  DB_TOO_BIG_INDEX_COL report an error through my_error() or
 +  my_printf_error() before returning.
 +- DB_FOREIGN_EXCEED_MAX_CASCADE pushes a warning and then falls through
 +  to the unspecified-error return. NOTE(review): this case only asserts
 +  thd with ut_ad() and passes it on without a NULL check, unlike the
 +  other cases that use thd.
 +
 +DB_ERROR and any code not listed return -1.
 +@return	MySQL error code */
 +static
 +int
 +convert_error_code_to_mysql(
 +/*========================*/
 +	dberr_t	error,	/*!< in: InnoDB error code */
 +	ulint	flags,  /*!< in: InnoDB table flags, or 0 */
 +	THD*	thd)	/*!< in: user thread handle or NULL */
 +{
 +	switch (error) {
 +	case DB_SUCCESS:
 +		return(0);
 +
 +	case DB_INTERRUPTED:
 +                return(HA_ERR_ABORTED_BY_USER);
 +
 +	case DB_FOREIGN_EXCEED_MAX_CASCADE:
 +		ut_ad(thd);
 +		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
 +				    HA_ERR_ROW_IS_REFERENCED,
 +				    "InnoDB: Cannot delete/update "
 +				    "rows with cascading foreign key "
 +				    "constraints that exceed max "
 +				    "depth of %d. Please "
 +				    "drop extra constraints and try "
 +				    "again", DICT_FK_MAX_RECURSIVE_LOAD);
 +
 +		/* fall through */
 +
 +	case DB_ERROR:
 +	default:
 +		return(-1); /* unspecified error */
 +
 +	case DB_DUPLICATE_KEY:
 +		/* Be cautious with returning this error, since
 +		mysql could re-enter the storage layer to get
 +		duplicated key info, the operation requires a
 +		valid table handle and/or transaction information,
 +		which might not always be available in the error
 +		handling stage. */
 +		return(HA_ERR_FOUND_DUPP_KEY);
 +
 +	case DB_READ_ONLY:
 +		return(HA_ERR_TABLE_READONLY);
 +
 +	case DB_FOREIGN_DUPLICATE_KEY:
 +		return(HA_ERR_FOREIGN_DUPLICATE_KEY);
 +
 +	case DB_MISSING_HISTORY:
 +		return(HA_ERR_TABLE_DEF_CHANGED);
 +
 +	case DB_RECORD_NOT_FOUND:
 +		return(HA_ERR_NO_ACTIVE_RECORD);
 +
 +        case DB_SEARCH_ABORTED_BY_USER:
 +                return(HA_ERR_ABORTED_BY_USER);
 +
 +	case DB_DEADLOCK:
 +		/* Since we rolled back the whole transaction, we must
 +		tell it also to MySQL so that MySQL knows to empty the
 +		cached binlog for this transaction */
 +
 +		if (thd) {
 +			thd_mark_transaction_to_rollback(thd, TRUE);
 +		}
 +
 +		return(HA_ERR_LOCK_DEADLOCK);
 +
 +	case DB_LOCK_WAIT_TIMEOUT:
 +		/* Starting from 5.0.13, we let MySQL just roll back the
 +		latest SQL statement in a lock wait timeout. Previously, we
 +		rolled back the whole transaction. */
 +
 +		if (thd) {
 +			thd_mark_transaction_to_rollback(
 +				thd, (bool) row_rollback_on_timeout);
 +		}
 +
 +		return(HA_ERR_LOCK_WAIT_TIMEOUT);
 +
 +	case DB_NO_REFERENCED_ROW:
 +		return(HA_ERR_NO_REFERENCED_ROW);
 +
 +	case DB_ROW_IS_REFERENCED:
 +		return(HA_ERR_ROW_IS_REFERENCED);
 +
 +	case DB_CANNOT_ADD_CONSTRAINT:
 +	case DB_CHILD_NO_INDEX:
 +	case DB_PARENT_NO_INDEX:
 +		return(HA_ERR_CANNOT_ADD_FOREIGN);
 +
 +	case DB_CANNOT_DROP_CONSTRAINT:
 +
 +		return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
 +						misleading, a new MySQL error
 +						code should be introduced */
 +
 +	case DB_CORRUPTION:
 +		return(HA_ERR_CRASHED);
 +
 +	case DB_OUT_OF_FILE_SPACE:
 +		return(HA_ERR_RECORD_FILE_FULL);
 +
 +	case DB_TEMP_FILE_WRITE_FAILURE:
 +		my_error(ER_GET_ERRMSG, MYF(0),
 +                         DB_TEMP_FILE_WRITE_FAILURE,
 +                         ut_strerr(DB_TEMP_FILE_WRITE_FAILURE),
 +                         "InnoDB");
 +		return(HA_ERR_INTERNAL_ERROR);
 +
 +	case DB_TABLE_IN_FK_CHECK:
 +		return(HA_ERR_TABLE_IN_FK_CHECK);
 +
 +	case DB_TABLE_IS_BEING_USED:
 +		return(HA_ERR_WRONG_COMMAND);
 +
 +	case DB_TABLESPACE_DELETED:
 +	case DB_TABLE_NOT_FOUND:
 +		return(HA_ERR_NO_SUCH_TABLE);
 +
 +	case DB_TABLESPACE_NOT_FOUND:
 +		return(HA_ERR_NO_SUCH_TABLE);
 +
 +	case DB_TOO_BIG_RECORD: {
 +		/* If prefix is true then a 768-byte prefix is stored
 +		locally for BLOB fields. Refer to dict_table_get_format() */
 +		bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A);
 +		my_printf_error(ER_TOO_BIG_ROWSIZE,
 +			"Row size too large (> %lu). Changing some columns "
 +			"to TEXT or BLOB %smay help. In current row "
 +			"format, BLOB prefix of %d bytes is stored inline.",
 +			MYF(0),
 +			page_get_free_space_of_empty(flags &
 +				DICT_TF_COMPACT) / 2,
 +			prefix ? "or using ROW_FORMAT=DYNAMIC "
 +			"or ROW_FORMAT=COMPRESSED ": "",
 +			prefix ? DICT_MAX_FIXED_COL_LEN : 0);
 +		return(HA_ERR_TO_BIG_ROW);
 +	}
 +
 +
 +	case DB_TOO_BIG_FOR_REDO:
 +		my_printf_error(ER_TOO_BIG_ROWSIZE, "%s" , MYF(0),
 +				"The size of BLOB/TEXT data inserted"
 +				" in one transaction is greater than"
 +				" 10% of redo log size. Increase the"
 +				" redo log size using innodb_log_file_size.");
 +		return(HA_ERR_TO_BIG_ROW);
 +
 +	case DB_TOO_BIG_INDEX_COL:
 +		my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
 +			 DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
 +		return(HA_ERR_INDEX_COL_TOO_LONG);
 +
 +	case DB_NO_SAVEPOINT:
 +		return(HA_ERR_NO_SAVEPOINT);
 +
 +	case DB_LOCK_TABLE_FULL:
 +		/* Since we rolled back the whole transaction, we must
 +		tell it also to MySQL so that MySQL knows to empty the
 +		cached binlog for this transaction */
 +
 +		if (thd) {
 +			thd_mark_transaction_to_rollback(thd, TRUE);
 +		}
 +
 +		return(HA_ERR_LOCK_TABLE_FULL);
 +
 +	case DB_FTS_INVALID_DOCID:
 +		return(HA_FTS_INVALID_DOCID);
 +	case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
 +		return(HA_ERR_OUT_OF_MEM);
 +	case DB_TOO_MANY_CONCURRENT_TRXS:
 +		return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
 +	case DB_UNSUPPORTED:
 +		return(HA_ERR_UNSUPPORTED);
 +	case DB_INDEX_CORRUPT:
 +		return(HA_ERR_INDEX_CORRUPT);
 +	case DB_UNDO_RECORD_TOO_BIG:
 +		return(HA_ERR_UNDO_REC_TOO_BIG);
 +	case DB_OUT_OF_MEMORY:
 +		return(HA_ERR_OUT_OF_MEM);
 +	case DB_TABLESPACE_EXISTS:
 +		return(HA_ERR_TABLESPACE_EXISTS);
 +	case DB_IDENTIFIER_TOO_LONG:
 +		return(HA_ERR_INTERNAL_ERROR);
 +	case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
 +		return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE);
 +	}
 +}
 +
 +/*************************************************************//**
 +Writes a description of a THD object (== user session thread) to the
 +given file, followed by a newline. The text is obtained from
 +thd_get_error_context_description() using a 1024-byte local buffer. */
 +UNIV_INTERN
 +void
 +innobase_mysql_print_thd(
 +/*=====================*/
 +	FILE*	f,		/*!< in: output stream */
 +	THD*	thd,		/*!< in: MySQL THD object */
 +	uint	max_query_len)	/*!< in: max query length to print, or 0 to
 +				use the default max length */
 +{
 +	char		buffer[1024];
 +	const char*	description;
 +
 +	description = thd_get_error_context_description(
 +		thd, buffer, sizeof buffer, max_query_len);
 +
 +	fputs(description, f);
 +	putc('\n', f);
 +}
 +
 +/******************************************************************//**
 +Look up the error message format string of a MySQL error code.
 +@return the format string or 0 if not found. */
 +UNIV_INTERN
 +const char*
 +innobase_get_err_msg(
 +/*=================*/
 +	int	error_code)	/*!< in: MySQL error code */
 +{
 +	const char*	format = my_get_err_msg(error_code);
 +
 +	return(format);
 +}
 +
 +/******************************************************************//**
 +Get the variable length bounds of the given character set. When the
 +charset-collation code has no entry in all_charsets, both bounds are
 +set to 0; outside of DROP TABLE this is only tolerated for code 0. */
 +UNIV_INTERN
 +void
 +innobase_get_cset_width(
 +/*====================*/
 +	ulint	cset,		/*!< in: MySQL charset-collation code */
 +	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
 +	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
 +{
 +	ut_ad(cset <= MAX_CHAR_COLL_NUM);
 +	ut_ad(mbminlen);
 +	ut_ad(mbmaxlen);
 +
 +	CHARSET_INFO*	cs = all_charsets[cset];
 +
 +	if (cs) {
 +		*mbminlen = cs->mbminlen;
 +		*mbmaxlen = cs->mbmaxlen;
 +		ut_ad(*mbminlen < DATA_MBMAX);
 +		ut_ad(*mbmaxlen < DATA_MBMAX);
 +		return;
 +	}
 +
 +	THD*		thd = current_thd;
 +	const bool	dropping_table
 +		= thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE;
 +
 +	if (!dropping_table) {
 +
 +		ut_a(cset == 0);
 +
 +	} else if ((global_system_variables.log_warnings)
 +		   && (cset != 0)) {
 +
 +		/* Fix bug#46256: a table may be dropped even if its
 +		collation is not found, but a warning is issued. */
 +		sql_print_warning(
 +			"Unknown collation #%lu.", cset);
 +	}
 +
 +	*mbminlen = *mbmaxlen = 0;
 +}
 +
 +/******************************************************************//**
 +Converts an identifier to a table name, i.e. from the given character
 +set to my_charset_filename. Conversion errors are counted by
 +strconvert() but not reported to the caller. */
 +UNIV_INTERN
 +void
 +innobase_convert_from_table_id(
 +/*===========================*/
 +	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 +	char*			to,	/*!< out: converted identifier */
 +	const char*		from,	/*!< in: identifier to convert */
 +	ulint			len)	/*!< in: length of 'to', in bytes */
 +{
 +	uint		errors;
 +	const size_t	from_len = strlen(from);
 +
 +	strconvert(cs, from, from_len,
 +		   &my_charset_filename, to, (uint) len, &errors);
 +}
 +
 +/**********************************************************************
 +Check if the length of the identifier exceeds the maximum allowed,
 +which is NAME_CHAR_LEN characters in system_charset_info. An identifier
 +that is not well formed in that character set is rejected as well.
 +On rejection ER_TOO_LONG_IDENT is reported through my_error().
 +return true when length of identifier is too long. */
 +UNIV_INTERN
 +my_bool
 +innobase_check_identifier_length(
 +/*=============================*/
 +	const char*	id)	/* in: FK identifier to check excluding the
 +				database portion. */
 +{
 +	int		well_formed_error = 0;
 +	CHARSET_INFO	*cs = system_charset_info;
 +	const size_t	id_len = strlen(id);
 +	DBUG_ENTER("innobase_check_identifier_length");
 +
 +	/* well_formed_len() is asked for at most NAME_CHAR_LEN characters
 +	and answers with a length in bytes. */
 +	size_t len = cs->cset->well_formed_len(
 +		cs, id, id + id_len,
 +		NAME_CHAR_LEN, &well_formed_error);
 +
 +	/* The identifier is too long exactly when the accepted prefix is
 +	shorter than the whole string. Comparing the byte count with the
 +	character limit NAME_CHAR_LEN would reject a legal ASCII name of
 +	maximum length and would let over-long multi-byte names pass. */
 +	if (well_formed_error || len != id_len) {
 +		my_error(ER_TOO_LONG_IDENT, MYF(0), id);
 +		DBUG_RETURN(true);
 +	}
 +	DBUG_RETURN(false);
 +}
 +
 +/******************************************************************//**
 +Converts an identifier from the given character set to
 +system_charset_info (UTF-8). Conversion errors are counted by
 +strconvert() but not reported to the caller. */
 +UNIV_INTERN
 +void
 +innobase_convert_from_id(
 +/*=====================*/
 +	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
 +	char*			to,	/*!< out: converted identifier */
 +	const char*		from,	/*!< in: identifier to convert */
 +	ulint			len)	/*!< in: length of 'to', in bytes */
 +{
 +	uint		errors;
 +	const size_t	from_len = strlen(from);
 +
 +	strconvert(cs, from, from_len,
 +		   system_charset_info, to, (uint) len, &errors);
 +}
 +
 +/******************************************************************//**
 +Compares NUL-terminated UTF-8 strings case insensitively. A NULL
 +pointer compares equal to another NULL and less than any string.
 +@return	0 if a=b, <0 if a<b, >0 if a>b */
 +UNIV_INTERN
 +int
 +innobase_strcasecmp(
 +/*================*/
 +	const char*	a,	/*!< in: first string to compare */
 +	const char*	b)	/*!< in: second string to compare */
 +{
 +	if (a == NULL) {
 +		return(b == NULL ? 0 : -1);
 +	}
 +
 +	if (b == NULL) {
 +		return(1);
 +	}
 +
 +	return(my_strcasecmp(system_charset_info, a, b));
 +}
 +
 +/******************************************************************//**
 +Matches a NUL-terminated UTF-8 string against a second string that
 +contains wildcards, ignoring case.
 +@return 0 if a match is found, 1 if not */
 +UNIV_INTERN
 +int
 +innobase_wildcasecmp(
 +/*=================*/
 +	const char*	a,	/*!< in: string to compare */
 +	const char*	b)	/*!< in: wildcard string to compare */
 +{
 +	const int	result = wild_case_compare(system_charset_info, a, b);
 +
 +	return(result);
 +}
 +
 +/******************************************************************//**
 +Strip dir name from a full path name and return only the file name.
 +The literal string "null" stands in when base_name() yields NULL.
 +@return file name or "null" if no file name */
 +UNIV_INTERN
 +const char*
 +innobase_basename(
 +/*==============*/
 +	const char*	path_name)	/*!< in: full path name */
 +{
 +	const char*	name = base_name(path_name);
 +
 +	if (!name) {
 +		return("null");
 +	}
 +
 +	return(name);
 +}
 +
 +/******************************************************************//**
 +Makes all characters in a NUL-terminated UTF-8 string lower case.
 +The string is passed to my_casedn_str() together with
 +system_charset_info. */
 +UNIV_INTERN
 +void
 +innobase_casedn_str(
 +/*================*/
 +	char*	a)	/*!< in/out: string to put in lower case */
 +{
 +	my_casedn_str(system_charset_info, a);
 +}
 +
 +/**********************************************************************//**
 +Determines the connection character set by asking thd_charset() for the
 +given thread handle.
 +@return	connection character set */
 +UNIV_INTERN
 +struct charset_info_st*
 +innobase_get_charset(
 +/*=================*/
 +	THD*	mysql_thd)	/*!< in: MySQL thread handle */
 +{
 +	return(thd_charset(mysql_thd));
 +}
 +
 +/**********************************************************************//**
 +Determines the current SQL statement.
 +The out parameter is always written: it receives the statement length
 +when a statement is available and 0 when NULL is returned, so callers
 +never read an uninitialized length.
 +@return	SQL statement string, or NULL if thd_query_string() returned
 +NULL */
 +UNIV_INTERN
 +const char*
 +innobase_get_stmt(
 +/*==============*/
 +	THD*	thd,		/*!< in: MySQL thread handle */
 +	size_t*	length)		/*!< out: length of the SQL statement,
 +				0 if there is no statement */
 +{
 +	const LEX_STRING*	stmt = thd_query_string(thd);
 +
 +	if (stmt == NULL) {
 +		/* Previously *length was left untouched on this path. */
 +		*length = 0;
 +		return(NULL);
 +	}
 +
 +	*length = stmt->length;
 +	return(stmt->str);
 +}
 +
 +/**********************************************************************//**
 +Get the current setting of the table_def_size global parameter. We do
 +a dirty read because for one there is no synchronization object and
 +secondly there is little harm in doing so even if we get a torn read.
 +The value is read from the variable tdc_size.
 +@return	value of table_def_size */
 +UNIV_INTERN
 +ulint
 +innobase_get_table_cache_size(void)
 +/*===============================*/
 +{
 +	return(tdc_size);
 +}
 +
 +/**********************************************************************//**
 +Get the current setting of the lower_case_table_names global parameter from
 +mysqld.cc. We do a dirty read because for one there is no synchronization
 +object and secondly there is little harm in doing so even if we get a torn
 +read. The value is converted to ulint by the return statement.
 +@return	value of lower_case_table_names */
 +UNIV_INTERN
 +ulint
 +innobase_get_lower_case_table_names(void)
 +/*=====================================*/
 +{
 +	return(lower_case_table_names);
 +}
 +
 +/** Create a temporary file in the location specified by the parameter
 +path. If the path is null, then it will be created in tmpdir.
 +The descriptor obtained from mysql_tmpfile()/mysql_tmpfile_path() is
 +duplicated, the original is closed with my_close(), and the duplicate
 +is returned to the caller.
 +@param[in]	path	location for creating temporary file
 +@return	temporary file descriptor, or < 0 on error */
 +UNIV_INTERN
 +int
 +innobase_mysql_tmpfile(
 +	const char*	path)
 +{
 +#ifdef WITH_INNODB_DISALLOW_WRITES
 +	/* Wait on srv_allow_writes_event before creating the file. */
 +	os_event_wait(srv_allow_writes_event);
 +#endif /* WITH_INNODB_DISALLOW_WRITES */
 +	int	fd2 = -1;
 +	File	fd;
 +
 +	/* Debug-only failure injection: pretend the creation failed. */
 +	DBUG_EXECUTE_IF(
 +		"innobase_tmpfile_creation_failure",
 +		return(-1);
 +	);
 +
 +	if (path == NULL) {
 +		fd = mysql_tmpfile("ib");
 +	} else {
 +		fd = mysql_tmpfile_path(path, "ib");
 +	}
 +
 +	if (fd >= 0) {
 +		/* Copy the file descriptor, so that the additional resources
 +		allocated by create_temp_file() can be freed by invoking
 +		my_close().
 +
 +		Because the file descriptor returned by this function
 +		will be passed to fdopen(), it will be closed by invoking
 +		fclose(), which in turn will invoke close() instead of
 +		my_close(). */
 +
 +#ifdef _WIN32
 +		/* Note that on Windows, the integer returned by mysql_tmpfile
 +		has no relation to C runtime file descriptor. Here, we need
 +		to call my_get_osfhandle to get the HANDLE and then convert it
 +		to C runtime filedescriptor. */
 +		{
 +			HANDLE hFile = my_get_osfhandle(fd);
 +			HANDLE hDup;
 +			BOOL bOK = DuplicateHandle(
 +					GetCurrentProcess(),
 +					hFile, GetCurrentProcess(),
 +					&hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
 +			if (bOK) {
 +				/* NOTE(review): if _open_osfhandle() fails,
 +				hDup is not closed on this path - confirm
 +				whether that leaks the duplicated handle. */
 +				fd2 = _open_osfhandle((intptr_t) hDup, 0);
 +			} else {
 +				my_osmaperr(GetLastError());
 +				fd2 = -1;
 +			}
 +		}
++#else
++#ifdef F_DUPFD_CLOEXEC
 +		/* Prefer the duplicate that carries the close-on-exec flag
 +		when the platform defines F_DUPFD_CLOEXEC. */
++		fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0);
 +#else
 +		fd2 = dup(fd);
++#endif
 +#endif
 +		if (fd2 < 0) {
 +			/* NOTE(review): this prints fd2 (negative here), not
 +			errno, and errno is only captured afterwards - confirm
 +			DBUG_PRINT cannot modify errno. */
 +			DBUG_PRINT("error",("Got error %d on dup",fd2));
 +			my_errno=errno;
 +			my_error(EE_OUT_OF_FILERESOURCES,
 +				 MYF(ME_BELL+ME_WAITTANG),
 +				 "ib*", my_errno);
 +		}
 +		/* The original descriptor is closed whether or not the
 +		duplication succeeded. */
 +		my_close(fd, MYF(MY_WME));
 +	}
 +	return(fd2);
 +}
 +
 +/*********************************************************************//**
 +Wrapper around MySQL's copy_and_convert function. The pointer arguments
 +are cast to char pointers and both lengths are cast to uint32 before
 +the call.
 +@return	number of bytes copied to 'to' */
 +UNIV_INTERN
 +ulint
 +innobase_convert_string(
 +/*====================*/
 +	void*		to,		/*!< out: converted string */
 +	ulint		to_length,	/*!< in: number of bytes reserved
 +					for the converted string */
 +	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
 +	const void*	from,		/*!< in: string to convert */
 +	ulint		from_length,	/*!< in: number of bytes to convert */
 +	CHARSET_INFO*	from_cs,	/*!< in: character set to convert
 +					from */
 +	uint*		errors)		/*!< out: number of errors encountered
 +					during the conversion */
 +{
 +	return(copy_and_convert(
 +			(char*) to, (uint32) to_length, to_cs,
 +			(const char*) from, (uint32) from_length, from_cs,
 +			errors));
 +}
 +
 +/*******************************************************************//**
 +Formats the raw data in "data" (in InnoDB on-disk format) that is of
 +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
 +the result to "buf". The result is converted to "system_charset_info".
 +Not more than "buf_size" bytes are written to "buf".
 +The result is always NUL-terminated (provided buf_size > 0) and the
 +number of bytes that were written to "buf" is returned (including the
 +terminating NUL).
 +The conversion goes through a fixed 8192-byte stack buffer before the
 +result is formatted into "buf" by ut_str_sql_format().
 +@return	number of bytes that were written */
 +UNIV_INTERN
 +ulint
 +innobase_raw_format(
 +/*================*/
 +	const char*	data,		/*!< in: raw data */
 +	ulint		data_len,	/*!< in: raw data length
 +					in bytes */
 +	ulint		charset_coll,	/*!< in: charset collation */
 +	char*		buf,		/*!< out: output buffer */
 +	ulint		buf_size)	/*!< in: output buffer size
 +					in bytes */
 +{
 +	/* XXX we use a hard limit instead of allocating
 +	buf_size bytes from the heap */
 +	CHARSET_INFO*	data_cs;
 +	char		buf_tmp[8192];
 +	ulint		buf_tmp_used;
 +	uint		num_errors;
 +
 +	/* NOTE(review): charset_coll is used as an index without a range
 +	or NULL check - presumably callers pass a valid collation id;
 +	confirm. */
 +	data_cs = all_charsets[charset_coll];
 +
 +	/* num_errors is filled in by the conversion but not examined. */
 +	buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
 +					       system_charset_info,
 +					       data, data_len, data_cs,
 +					       &num_errors);
 +
 +	return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
 +}
 +
 +/*********************************************************************//**
 +Compute the next autoinc value.
 +
 +For MySQL replication the autoincrement values can be partitioned among
 +the nodes. The offset is the start or origin of the autoincrement value
 +for a particular node. For n nodes the increment will be n and the offset
 +will be in the interval [1, n]. The formula tries to allocate the next
 +value for a particular node.
 +
 +Note: This function is also called with increment set to the number of
 +values we want to reserve for multi-value inserts e.g.,
 +
 +	INSERT INTO T VALUES(), (), ();
 +
 +innobase_next_autoinc() will be called with increment set to 3 where
 +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
 +the multi-value INSERT above.
 +
 +The caller-supplied max_value is only asserted to be non-zero; it is then
 +replaced by the maximum ulonglong value, which is also the value returned
 +whenever the computation would overflow.
 +@return	the next value */
 +UNIV_INTERN
 +ulonglong
 +innobase_next_autoinc(
 +/*==================*/
 +	ulonglong	current,	/*!< in: Current value */
 +	ulonglong	need,		/*!< in: count of values needed */
 +	ulonglong	step,		/*!< in: AUTOINC increment step */
 +	ulonglong	offset,		/*!< in: AUTOINC offset */
 +	ulonglong	max_value)	/*!< in: max value for type */
 +{
 +	ulonglong	next_value;
 +	/* Size of the range to reserve: 'need' values, 'step' apart. */
 +	ulonglong	block = need * step;
 +
 +	/* Should never be 0. */
 +	ut_a(need > 0);
 +	ut_a(block > 0);
 +	ut_a(max_value > 0);
 +
 +        /*
 +          Allow auto_increment to go over max_value up to max ulonglong.
 +          This allows us to detect that all values are exhausted.
 +          If we don't do this, we will return max_value several times
 +          and get duplicate key errors instead of auto increment value
 +          out of range.
 +        */
 +        max_value= (~(ulonglong) 0);
 +
 +	/* According to MySQL documentation, if the offset is greater than
 +	the step then the offset is ignored.
 +	NOTE(review): the comparison below is against block (need * step),
 +	not step - confirm this is intended. */
 +	if (offset > block) {
 +		offset = 0;
 +	}
 +
 +	/* Check for overflow. Current can be > max_value if the value is
 +	in reality a negative value.The visual studio compilers converts
 +	large double values automatically into unsigned long long datatype
 +	maximum value */
 +
 +	if (block >= max_value
 +	    || offset > max_value
 +	    || current >= max_value
 +	    || max_value - offset <= offset) {
 +
 +		next_value = max_value;
 +	} else {
 +		ut_a(max_value > current);
 +
 +		/* Room left between current and the maximum. */
 +		ulonglong	free = max_value - current;
 +
 +		if (free < offset || free - offset <= block) {
 +			next_value = max_value;
 +		} else {
 +			/* 0 marks "not decided yet"; computed below. */
 +			next_value = 0;
 +		}
 +	}
 +
 +	if (next_value == 0) {
 +		ulonglong	next;
 +
 +		if (current >= offset) {
 +			/* Number of whole steps from offset to current. */
 +			next = (current - offset) / step;
 +		} else {
 +			/* current lies below offset: start from zero steps
 +			and reserve one step less. */
 +			next = 0;
 +			block -= step;
 +		}
 +
 +		ut_a(max_value > next);
 +		next_value = next * step;
 +		/* Check for multiplication overflow. */
 +		ut_a(next_value >= next);
 +		ut_a(max_value > next_value);
 +
 +		/* Check for overflow */
 +		if (max_value - next_value >= block) {
 +
 +			next_value += block;
 +
 +			if (max_value - next_value >= offset) {
 +				next_value += offset;
 +			} else {
 +				next_value = max_value;
 +			}
 +		} else {
 +			next_value = max_value;
 +		}
 +	}
 +
 +	ut_a(next_value != 0);
 +	ut_a(next_value <= max_value);
 +
 +	return(next_value);
 +}
 +
 +/*********************************************************************//**
 +Refreshes the fields of an InnoDB transaction object that are derived
 +from the session state of the owning thread: foreign key checking,
 +unique secondary index checking, fake_changes (only while the
 +transaction has not started) and slow-log statistics collection. */
 +static
 +void
 +innobase_trx_init(
 +/*==============*/
 +	THD*	thd,	/*!< in: user thread handle */
 +	trx_t*	trx)	/*!< in/out: InnoDB transaction handle */
 +{
 +	DBUG_ENTER("innobase_trx_init");
 +	DBUG_ASSERT(thd == trx->mysql_thd);
 +
 +	/* Each check stays enabled unless the matching session option
 +	bit is set. */
 +	trx->check_foreigns
 +		= !thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS);
 +	trx->check_unique_secondary
 +		= !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
 +
 +	/* The fake_changes state is cached when a transaction starts and
 +	kept for its whole lifetime. Some APIs use the transaction object
 +	merely as a data carrier without starting it, so the cached value
 +	is refreshed here for as long as the transaction is not started. */
 +	if (trx->state == TRX_STATE_NOT_STARTED) {
 +		trx->fake_changes = thd_fake_changes(thd);
 +	}
 +
 +#ifdef EXTENDED_SLOWLOG
 +	trx->take_stats = (thd_log_slow_verbosity(thd)
 +			   & (1ULL << SLOG_V_INNODB))
 +		? TRUE
 +		: FALSE;
 +#else
 +	trx->take_stats = FALSE;
 +#endif
 +
 +	DBUG_VOID_RETURN;
 +}
 +
 +/*********************************************************************//**
 +Allocates an InnoDB transaction for a MySQL handler object for DML.
 +The new transaction is bound to the given thread handle and its
 +session-derived fields are set by innobase_trx_init().
 +@return	InnoDB transaction handle */
 +UNIV_INTERN
 +trx_t*
 +innobase_trx_allocate(
 +/*==================*/
 +	THD*	thd)	/*!< in: user thread handle */
 +{
 +	trx_t*	trx;
 +
 +	DBUG_ENTER("innobase_trx_allocate");
 +	DBUG_ASSERT(thd != NULL);
 +	DBUG_ASSERT(EQ_CURRENT_THD(thd));
 +
 +	trx = trx_allocate_for_mysql();
 +
 +	/* Must be set before innobase_trx_init(), which asserts that
 +	thd == trx->mysql_thd. */
 +	trx->mysql_thd = thd;
 +
 +	innobase_trx_init(thd, trx);
 +
 +	DBUG_RETURN(trx);
 +}
 +
 +/*********************************************************************//**
 +Gets the InnoDB transaction handle for a MySQL handler object, creates
 +an InnoDB transaction struct if the corresponding MySQL thread struct still
 +lacks one. In either case innobase_trx_init() is called on the
 +transaction before it is returned.
 +@return	InnoDB transaction handle */
 +static inline
 +trx_t*
 +check_trx_exists(
 +/*=============*/
 +	THD*	thd)	/*!< in: user thread handle */
 +{
 +	/* Reference to the pointer obtained from thd_to_trx(), so the
 +	assignment below writes through to that pointer. */
 +	trx_t*&	trx = thd_to_trx(thd);
 +
 +	if (trx == NULL) {
 +		trx = innobase_trx_allocate(thd);
 +		thd_set_ha_data(thd, innodb_hton_ptr, trx);
 +	} else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
 +		/* The magic number does not match: report the corruption
 +		and abort. */
 +		mem_analyze_corruption(trx);
 +		ut_error;
 +	}
 +
 +	/* For a freshly allocated transaction this repeats the call
 +	already made inside innobase_trx_allocate(). */
 +	innobase_trx_init(thd, trx);
 +
 +	return(trx);
 +}
 +
 +/*************************************************************************
 +Gets the InnoDB transaction attached to the current thread handle.
 +No transaction is allocated here.
 +@return	transaction pointer stored for current_thd (may be NULL), or
 +NULL if there is no current thread handle */
 +trx_t*
 +innobase_get_trx()
 +{
 +	THD*	thd = current_thd;
 +
 +	return(likely(thd != 0) ? thd_to_trx(thd) : NULL);
 +}
 +
 +/*************************************************************************
 +Reports the slow log setting.
 +@return	the value of thd_opt_slow_log() cast to ibool when built with
 +EXTENDED_SLOWLOG, otherwise always FALSE */
 +ibool
 +innobase_get_slow_log()
 +{
 +#ifdef EXTENDED_SLOWLOG
 +	return((ibool) thd_opt_slow_log());
 +#else
 +	return(FALSE);
 +#endif
 +}
 +
 +/*********************************************************************//**
 +Check whether a transaction has been registered with the MySQL 2PC
 +coordinator, i.e. whether its is_registered flag is 1.
 +@return true if transaction is registered with MySQL 2PC coordinator */
 +static inline
 +bool
 +trx_is_registered_for_2pc(
 +/*=========================*/
 +	const trx_t*	trx)	/* in: transaction */
 +{
 +	return(trx->is_registered == 1);
 +}
 +
 +/*********************************************************************//**
 +Note that innobase_commit_ordered() was run. The transaction must
 +already be registered for 2PC; this is asserted. */
 +static inline
 +void
 +trx_set_active_commit_ordered(
 +/*==============================*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	ut_a(trx_is_registered_for_2pc(trx));
 +	trx->active_commit_ordered = 1;
 +}
 +
 +/*********************************************************************//**
 +Note that a transaction has been registered with MySQL 2PC coordinator.
 +Sets the is_registered flag; debug builds also assert that
 +active_commit_ordered is still 0. */
 +static inline
 +void
 +trx_register_for_2pc(
 +/*==================*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	trx->is_registered = 1;
 +	ut_ad(trx->active_commit_ordered == 0);
 +}
 +
 +/*********************************************************************//**
 +Note that a transaction has been deregistered. Clears both the
 +is_registered and the active_commit_ordered flags. */
 +static inline
 +void
 +trx_deregister_from_2pc(
 +/*====================*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	trx->is_registered = 0;
 +	trx->active_commit_ordered = 0;
 +}
 +
 +/*********************************************************************//**
 +Check whether a transaction has active_commit_ordered set
 +@return true if the active_commit_ordered flag is 1 */
 +static inline
 +bool
 +trx_is_active_commit_ordered(
 +/*=========================*/
 +	const trx_t*	trx)	/* in: transaction */
 +{
 +	return(trx->active_commit_ordered == 1);
 +}
 +
 +/*********************************************************************//**
 +Check if transaction is started.
 +@return true if the transaction state is anything other than
 +TRX_STATE_NOT_STARTED */
 +static
 +bool
 +trx_is_started(
 +/*===========*/
 +	trx_t*	trx)	/* in: transaction */
 +{
 +	return(trx->state != TRX_STATE_NOT_STARTED);
 +}
 +
 +/****************************************************************//**
 +Update log_checksum_algorithm_ptr with a pointer to the function corresponding
 +to a given checksum algorithm. The strict and non-strict variant of each
 +algorithm select the same function; any other value fails an assertion. */
 +static
 +void
 +innodb_log_checksum_func_update(
 +/*============================*/
 +	ulint	algorithm)	/*!< in: algorithm */
 +{
 +	switch (algorithm) {
 +	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
 +	case SRV_CHECKSUM_ALGORITHM_INNODB:
 +		log_checksum_algorithm_ptr=log_block_calc_checksum_innodb;
 +		break;
 +	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
 +	case SRV_CHECKSUM_ALGORITHM_CRC32:
 +		log_checksum_algorithm_ptr=log_block_calc_checksum_crc32;
 +		break;
 +	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
 +	case SRV_CHECKSUM_ALGORITHM_NONE:
 +		log_checksum_algorithm_ptr=log_block_calc_checksum_none;
 +		break;
 +	default:
 +		/* Unknown algorithm value. */
 +		ut_a(0);
 +	}
 +}
 +
 +/****************************************************************//**
 +On update hook for the innodb_log_checksum_algorithm variable.
 +Switches the log checksum function and stores the new algorithm in
 +srv_log_checksum_algorithm, both while holding the log_sys mutex. */
 +static
 +void
 +innodb_log_checksum_algorithm_update(
 +/*=================================*/
 +	THD*				thd,	/*!< in: thread handle */
 +	struct st_mysql_sys_var*	var,	/*!< in: pointer to
 +						system variable */
 +	void*				var_ptr,/*!< out: where the
 +						formal string goes */
 +	const void*			save)	/*!< in: immediate result
 +						from check function */
 +{
 +	/* The check function left the chosen value in 'save' as a ulong. */
 +	const ulong			requested
 +		= *static_cast<const ulong*>(save);
 +	const srv_checksum_algorithm_t	algorithm
 +		= (srv_checksum_algorithm_t) requested;
 +
 +	/* Make sure we are the only log user */
 +	mutex_enter(&log_sys->mutex);
 +	innodb_log_checksum_func_update(algorithm);
 +	srv_log_checksum_algorithm = algorithm;
 +	mutex_exit(&log_sys->mutex);
 +}
 +
 +/*********************************************************************//**
 +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
 +The flags live in the .frm file and in the MySQL table object, but InnoDB
 +uses them often enough to keep its own copies: the persistent statistics
 +on/off setting, the automatic recalculation setting and the number of
 +sample pages. */
 +UNIV_INTERN
 +void
 +innobase_copy_frm_flags_from_create_info(
 +/*=====================================*/
 +	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
 +	const HA_CREATE_INFO*	create_info)	/*!< in: create info */
 +{
 +	/* Start from the values used for temporary tables, which do not
 +	use persistent stats. */
 +	ibool	ps_on = FALSE;
 +	ibool	ps_off = TRUE;
 +
 +	if (!dict_table_is_temporary(innodb_table)) {
 +		ps_on = create_info->table_options
 +			& HA_OPTION_STATS_PERSISTENT;
 +		ps_off = create_info->table_options
 +			& HA_OPTION_NO_STATS_PERSISTENT;
 +	}
 +
 +	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
 +
 +	const bool	recalc_on
 +		= create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON;
 +	const bool	recalc_off
 +		= create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF;
 +
 +	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
 +
 +	innodb_table->stats_sample_pages = create_info->stats_sample_pages;
 +}
 +
 +/*********************************************************************//**
 +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
 +The flags live in the .frm file and in the MySQL table object, but InnoDB
 +uses them often enough to keep its own copies: the persistent statistics
 +on/off setting, the automatic recalculation setting and the number of
 +sample pages. */
 +UNIV_INTERN
 +void
 +innobase_copy_frm_flags_from_table_share(
 +/*=====================================*/
 +	dict_table_t*		innodb_table,	/*!< in/out: InnoDB table */
 +	const TABLE_SHARE*	table_share)	/*!< in: table share */
 +{
 +	/* Start from the values used for temporary tables, which do not
 +	use persistent stats. */
 +	ibool	ps_on = FALSE;
 +	ibool	ps_off = TRUE;
 +
 +	if (!dict_table_is_temporary(innodb_table)) {
 +		ps_on = table_share->db_create_options
 +			& HA_OPTION_STATS_PERSISTENT;
 +		ps_off = table_share->db_create_options
 +			& HA_OPTION_NO_STATS_PERSISTENT;
 +	}
 +
 +	dict_stats_set_persistent(innodb_table, ps_on, ps_off);
 +
 +	const bool	recalc_on
 +		= table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON;
 +	const bool	recalc_off
 +		= table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF;
 +
 +	dict_stats_auto_recalc_set(innodb_table, recalc_on, recalc_off);
 +
 +	innodb_table->stats_sample_pages = table_share->stats_sample_pages;
 +}
 +
 +/*********************************************************************//**
 +Construct ha_innobase handler. Passes the handlerton and table share to
 +the base handler class, sets the table flags bitmask listed below and
 +zero-initializes start_of_scan and num_write_row. */
 +UNIV_INTERN
 +ha_innobase::ha_innobase(
 +/*=====================*/
 +	handlerton*	hton,
 +	TABLE_SHARE*	table_arg)
 +	:handler(hton, table_arg),
 +	int_table_flags(HA_REC_NOT_IN_SEQ |
 +		  HA_NULL_IN_KEY | HA_CAN_VIRTUAL_COLUMNS |
 +		  HA_CAN_INDEX_BLOBS |
 +		  HA_CAN_SQL_HANDLER |
 +		  HA_PRIMARY_KEY_REQUIRED_FOR_POSITION |
 +		  HA_PRIMARY_KEY_IN_READ_INDEX |
 +		  HA_BINLOG_ROW_CAPABLE |
 +		  HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
 +		  HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT |
 +		  HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
 +	start_of_scan(0),
 +	num_write_row(0)
 +{}
 +
 +/*********************************************************************//**
 +Destruct ha_innobase handler. The destructor body is empty. */
 +UNIV_INTERN
 +ha_innobase::~ha_innobase()
 +/*======================*/
 +{
 +}
 +
 +/*********************************************************************//**
 +Updates the user_thd field in a handle and also allocates a new InnoDB
 +transaction handle if needed, and updates the transaction fields in the
 +prebuilt struct. */
 +UNIV_INTERN inline
 +void
 +ha_innobase::update_thd(
 +/*====================*/
 +	THD*	thd)	/*!< in: thd to use the handle */
 +{
 +	trx_t*		trx;
 +
 +	DBUG_ENTER("ha_innobase::update_thd");
 +	DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
 +		   user_thd, thd));
 +
 +	/* The table should have been opened in ha_innobase::open(). */
 +	DBUG_ASSERT(prebuilt->table->n_ref_count > 0);
 +
 +	/* Fetch the transaction of thd, allocating one if it has none. */
 +	trx = check_trx_exists(thd);
 +
 +	/* Re-point the prebuilt struct only when the transaction differs. */
 +	if (prebuilt->trx != trx) {
 +
 +		row_update_prebuilt_trx(prebuilt, trx);
 +	}
 +
 +	user_thd = thd;
 +	DBUG_VOID_RETURN;
 +}
 +
 +/*********************************************************************//**
 +Convenience overload: obtains the thread handle from ha_thd() and
 +forwards it to update_thd(THD*), which updates the user_thd field,
 +allocates a new InnoDB transaction handle if needed, and updates the
 +transaction fields in the prebuilt struct. */
 +UNIV_INTERN
 +void
 +ha_innobase::update_thd()
 +/*=====================*/
 +{
 +	THD*	thd = ha_thd();
 +
 +	/* Debug builds check that the handler's thd is the current one. */
 +	ut_ad(EQ_CURRENT_THD(thd));
 +	update_thd(thd);
 +}
 +
 +/*********************************************************************//**
 +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that
 +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback
 +for the transaction. This MUST be called for every transaction for which
 +the user may call commit or rollback. Calling this several times to register
 +the same transaction is allowed, too. This function also registers the
 +current SQL statement. */
 +static inline
 +void
 +innobase_register_trx(
 +/*==================*/
 +	handlerton*	hton,	/* in: Innobase handlerton */
 +	THD*		thd,	/* in: MySQL thd (connection) object */
 +	trx_t*		trx)	/* in: transaction to register */
 +{
 +	/* First registration, made on every call (second argument
 +	FALSE). */
 +	trans_register_ha(thd, FALSE, hton);
 +
 +	/* Second registration (second argument TRUE): only once per
 +	transaction, and only when the session is not in autocommit mode
 +	or is inside BEGIN. */
 +	if (!trx_is_registered_for_2pc(trx)
 +	    && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
 +
 +		trans_register_ha(thd, TRUE, hton);
 +	}
 +
 +	trx_register_for_2pc(trx);
 +}
 +
 +/*	BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
 +	------------------------------------------------------------
 +
 +1) The use of the query cache for TBL is disabled when there is an
 +uncommitted change to TBL.
 +
 +2) When a change to TBL commits, InnoDB stores the current value of
 +its global trx id counter, let us denote it by INV_TRX_ID, to the table object
 +in the InnoDB data dictionary, and only allows such transactions whose
 +id <= INV_TRX_ID to use the query cache.
 +
 +3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
 +modification because of an ON DELETE CASCADE, we invalidate the MySQL query
 +cache of TBL immediately.
 +
 +How this is implemented inside InnoDB:
 +
 +1) Since every modification always sets an IX type table lock on the InnoDB
 +table, it is easy to check if there can be uncommitted modifications for a
 +table: just check if there are locks in the lock list of the table.
 +
 +2) When a transaction inside InnoDB commits, it reads the global trx id
 +counter and stores the value INV_TRX_ID to the tables on which it had a lock.
 +
 +3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
 +InnoDB calls an invalidate method for the MySQL query cache for that table.
 +
 +How this is implemented inside sql_cache.cc:
 +
 +1) The query cache for an InnoDB table TBL is invalidated immediately at an
 +INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
 +invalidation to the transaction commit.
 +
 +2) To store or retrieve a value from the query cache of an InnoDB table TBL,
 +any query must first ask InnoDB's permission. We must pass the thd as a
 +parameter because InnoDB will look at the trx id, if any, associated with
 +that thd. Also the full_name which is used as key to search for the table
 +object. The full_name is a string containing the normalized path to the
 +table in the canonical format.
 +
 +3) Use of the query cache for InnoDB tables is now allowed also when
 +AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
 +put restrictions on the use of the query cache.
 +*/
 +
 +/******************************************************************//**
 +Called by the MySQL query cache to ask InnoDB whether the cache may
 +currently be used for an InnoDB table. The SQL query must be a
 +non-locking SELECT.
 +
 +The query cache may operate on a query only if this function returns
 +TRUE for every table in the query.
 +
 +If thd is not in the autocommit state, this function also starts a new
 +transaction for thd if there is no active trx yet, and assigns a consistent
 +read view to it if there is no read view yet.
 +
 +Why a deadlock of threads is not possible: the query cache calls this function
 +at the start of a SELECT processing. Then the calling thread cannot be
 +holding any InnoDB semaphores. The calling thread is holding the
 +query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
 +Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
 +the InnoDB trx_sys->mutex.
 +@return TRUE if permitted, FALSE if not; note that the value FALSE
 +does not mean we should invalidate the query cache: invalidation is
 +called explicitly */
 +static
 +my_bool
 +innobase_query_caching_of_table_permitted(
 +/*======================================*/
 +	THD*	thd,		/*!< in: thd of the user who is trying to
 +				store a result to the query cache or
 +				retrieve it */
 +	char*	full_name,	/*!< in: normalized path to the table */
 +	uint	full_name_len,	/*!< in: length of the normalized path
 +				to the table */
 +	ulonglong *unused)	/*!< unused for this engine */
 +{
 +	ut_a(full_name_len < 999);
 +
 +	trx_t*	trx = check_trx_exists(thd);
 +
 +	/* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
 +	plain SELECT if AUTOCOMMIT is not on, so the cache is refused. */
 +	if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
 +		return((my_bool)FALSE);
 +	}
 +
 +	/* Holding the adaptive search latch here is unexpected: report
 +	it, then release the latch below. */
 +	if (UNIV_UNLIKELY(trx->has_search_latch)) {
 +		sql_print_error("The calling thread is holding the adaptive "
 +				"search, latch though calling "
 +				"innobase_query_caching_of_table_permitted.");
 +		trx_print(stderr, trx, 1024);
 +	}
 +
 +	trx_search_latch_release_if_reserved(trx);
 +
 +	innobase_srv_conc_force_exit_innodb(trx);
 +
 +	const bool	is_autocommit = !thd_test_options(
 +		thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
 +
 +	if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
 +		/* This is a retrieval from the query cache; it cannot be
 +		a store operation, because then MySQL would already hold
 +		locks on the tables.
 +
 +		TODO: if the user has used LOCK TABLES to lock the table,
 +		then we open a transaction in the call of row_.. below.
 +		That trx can stay open until UNLOCK TABLES. The same problem
 +		exists even if we do not use the query cache. MySQL should be
 +		modified so that it ALWAYS calls some cleanup function when
 +		the processing of a query ends!
 +
 +		We can imagine we instantaneously serialize this consistent
 +		read trx to the current trx id counter. If trx2 would have
 +		changed the tables of a query result stored in the cache, and
 +		trx2 would have already committed, making the result obsolete,
 +		then trx2 would have already invalidated the cache. Thus we
 +		can trust the result in the cache is ok for this query. */
 +
 +		return((my_bool)TRUE);
 +	}
 +
 +	/* Normalize the table name to InnoDB format */
 +	char	norm_name[1000];
 +
 +	normalize_table_name(norm_name, full_name);
 +
 +	innobase_register_trx(innodb_hton_ptr, thd, trx);
 +
 +	return(row_search_check_if_query_cache_permitted(trx, norm_name)
 +	       ? (my_bool)TRUE
 +	       : (my_bool)FALSE);
 +}
 +
 +/*****************************************************************//**
 +Invalidates the MySQL query cache for the table. The function body is
 +compiled only when HAVE_QUERY_CACHE is defined; otherwise it does
 +nothing. */
 +UNIV_INTERN
 +void
 +innobase_invalidate_query_cache(
 +/*============================*/
 +	trx_t*		trx,		/*!< in: transaction which
 +					modifies the table */
 +	const char*	full_name,	/*!< in: concatenation of
 +					database name, null char NUL,
 +					table name, null char NUL;
 +					NOTE that in Windows this is
 +					always in LOWER CASE! */
 +	ulint		full_name_len)	/*!< in: full name length where
 +					also the null chars count;
 +					not referenced in the body */
 +{
 +	/* Note that the sync0sync.h rank of the query cache mutex is just
 +	above the InnoDB trx_sys_t->lock. The caller of this function must
 +	not have latches of a lower rank. */
 +
 +#ifdef HAVE_QUERY_CACHE
 +	char	qcache_key_name[2 * (NAME_LEN + 1)];
 +	size_t	tabname_len;
 +	size_t	dbname_len;
 +
 +	/* Construct the key("db-name\0table$name\0") for the query cache using
 +	the path name("db@002dname\0table@0024name\0") of the table in its
 +        canonical form. */
 +	dbname_len = filename_to_tablename(full_name, qcache_key_name,
 +					   sizeof(qcache_key_name));
 +	/* The table name starts right after the NUL that ends the
 +	database name, and is written right after the converted database
 +	name and its NUL in the key buffer. */
 +	tabname_len = filename_to_tablename(full_name + strlen(full_name) + 1,
 +					    qcache_key_name + dbname_len + 1,
 +					    sizeof(qcache_key_name)
 +                                            - dbname_len - 1);
 +
 +	/* Argument TRUE below means we are using transactions */
 +	mysql_query_cache_invalidate4(trx->mysql_thd,
 +				      qcache_key_name,
 +				      (dbname_len + tabname_len + 2),
 +				      TRUE);
 +#endif
 +}
 +
 +/*****************************************************************//**
 +Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
 +and quote it if needed. The output is NOT NUL-terminated and is
 +truncated when it does not fit into buflen bytes.
 +@return	pointer to the end of buf */
 +static
 +char*
 +innobase_convert_identifier(
 +/*========================*/
 +	char*		buf,	/*!< out: buffer for converted identifier */
 +	ulint		buflen,	/*!< in: length of buf, in bytes */
 +	const char*	id,	/*!< in: identifier to convert */
 +	ulint		idlen,	/*!< in: length of id, in bytes */
 +	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 +	ibool		file_id)/*!< in: TRUE=id is a table or database name;
 +				FALSE=id is an UTF-8 string */
 +{
 +	/* Receives the output of explain_filename(); declared at function
 +	scope because 's' still points to it after the block below. */
 +	char nz2[MAX_TABLE_NAME_LEN + 1];
 +	const char*	s	= id;
 +	int		q;
 +
 +	if (file_id) {
 +
 +		char nz[MAX_TABLE_NAME_LEN + 1];
 +
 +		/* Decode the table name.  The MySQL function expects
 +		a NUL-terminated string.  The input and output strings
 +		buffers must not be shared. */
 +		ut_a(idlen <= MAX_TABLE_NAME_LEN);
 +		memcpy(nz, id, idlen);
 +		nz[idlen] = 0;
 +
 +		s = nz2;
 +		idlen = explain_filename(thd, nz, nz2, sizeof nz2,
 +					 EXPLAIN_PARTITIONS_AS_COMMENT);
 +		/* The decoded name is copied out without further quoting
 +		by this function. */
 +		goto no_quote;
 +	}
 +
 +	/* See if the identifier needs to be quoted. */
 +	if (UNIV_UNLIKELY(!thd)) {
 +		/* Without a thread handle, always quote with '"'. */
 +		q = '"';
 +	} else {
 +		q = get_quote_char_for_identifier(thd, s, (int) idlen);
 +	}
 +
 +	if (q == EOF) {
 +no_quote:
 +		/* Plain copy, truncated to the buffer size. */
 +		if (UNIV_UNLIKELY(idlen > buflen)) {
 +			idlen = buflen;
 +		}
 +		memcpy(buf, s, idlen);
 +		return(buf + idlen);
 +	}
 +
 +	/* Quote the identifier. */
 +	if (buflen < 2) {
 +		/* No room for even the two quote characters. */
 +		return(buf);
 +	}
 +
 +	*buf++ = q;
 +	buflen--;
 +
 +	/* Copy characters while always keeping one byte in reserve for
 +	the closing quote; a quote character inside the identifier is
 +	written twice. */
 +	for (; idlen; idlen--) {
 +		int	c = *s++;
 +		if (UNIV_UNLIKELY(c == q)) {
 +			if (UNIV_UNLIKELY(buflen < 3)) {
 +				break;
 +			}
 +
 +			*buf++ = c;
 +			*buf++ = c;
 +			buflen -= 2;
 +		} else {
 +			if (UNIV_UNLIKELY(buflen < 2)) {
 +				break;
 +			}
 +
 +			*buf++ = c;
 +			buflen--;
 +		}
 +	}
 +
 +	*buf++ = q;
 +	return(buf);
 +}
 +
 +/*****************************************************************//**
 +Convert a table or index name to the MySQL system_charset_info (UTF-8)
 +and quote it if needed. A table name of the form "db/table" is printed
 +as the two converted parts joined by '.'; an index name starting with
 +TEMP_INDEX_PREFIX gets the suffix "--temporary--" when it fits.
 +The output is not NUL-terminated by this function.
 +@return	pointer to the end of buf */
 +UNIV_INTERN
 +char*
 +innobase_convert_name(
 +/*==================*/
 +	char*		buf,	/*!< out: buffer for converted identifier */
 +	ulint		buflen,	/*!< in: length of buf, in bytes */
 +	const char*	id,	/*!< in: identifier to convert */
 +	ulint		idlen,	/*!< in: length of id, in bytes */
 +	THD*		thd,	/*!< in: MySQL connection thread, or NULL */
 +	ibool		table_id)/*!< in: TRUE=id is a table or database name;
 +				FALSE=id is an index name */
 +{
 +	char*		s	= buf;
 +	const char*	bufend	= buf + buflen;
 +
 +	if (table_id) {
 +		const char*	slash = (const char*) memchr(id, '/', idlen);
 +		if (!slash) {
 +
 +			/* No database part: convert the whole id at once. */
 +			goto no_db_name;
 +		}
 +
 +		/* Print the database name and table name separately. */
 +		s = innobase_convert_identifier(s, bufend - s, id, slash - id,
 +						thd, TRUE);
 +		/* Add the separator and the table name only if there is
 +		still room in the buffer. */
 +		if (UNIV_LIKELY(s < bufend)) {
 +			*s++ = '.';
 +			s = innobase_convert_identifier(s, bufend - s,
 +							slash + 1, idlen
 +							- (slash - id) - 1,
 +							thd, TRUE);
 +		}
 +	} else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
 +		/* Temporary index name (smart ALTER TABLE) */
 +		const char temp_index_suffix[]= "--temporary--";
 +
 +		/* Skip the prefix byte when converting the name. */
 +		s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1,
 +						thd, FALSE);
 +		/* Append the suffix (without its NUL) only if it fits. */
 +		if (s - buf + (sizeof temp_index_suffix - 1) < buflen) {
 +			memcpy(s, temp_index_suffix,
 +			       sizeof temp_index_suffix - 1);
 +			s += sizeof temp_index_suffix - 1;
 +		}
 +	} else {
 +no_db_name:
 +		s = innobase_convert_identifier(buf, buflen, id, idlen,
 +						thd, table_id);
 +	}
 +
 +	return(s);
 +}
 +
 +/*****************************************************************//**
 +A wrapper function of innobase_convert_name(), convert a table or
 +index name to the MySQL system_charset_info (UTF-8) and quote it if needed.
 +@return	pointer to the end of buf */
 +UNIV_INTERN
 +void
 +innobase_format_name(
 +/*==================*/
 +	char*		buf,	/*!< out: buffer for converted identifier */
 +	ulint		buflen,	/*!< in: length of buf, in bytes */
 +	const char*	name,	/*!< in: index or table name to format */
 +	ibool		is_index_name) /*!< in: index name */
 +{
 +	const char*     bufend;
 +
 +	bufend = innobase_convert_name(buf, buflen, name, strlen(name),
 +				       NULL, !is_index_name);
 +
 +	ut_ad((ulint) (bufend - buf) < buflen);
 +
 +	buf[bufend - buf] = '\0';
 +}
 +
 +/**********************************************************************//**
 +Determines if the currently running transaction has been interrupted.
 +@return	TRUE if interrupted */
 +UNIV_INTERN
 +ibool
 +trx_is_interrupted(
 +/*===============*/
 +	const trx_t*	trx)	/*!< in: transaction */
 +{
 +	return(trx && trx->mysql_thd && thd_kill_level((THD*) trx->mysql_thd));
 +}
 +
 +/**********************************************************************//**
 +Determines if the currently running transaction is in strict mode.
 +@return	TRUE if strict */
 +UNIV_INTERN
 +ibool
 +trx_is_strict(
 +/*==========*/
 +	trx_t*	trx)	/*!< in: transaction */
 +{
 +	return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode));
 +}
 +
 +/**************************************************************//**
 +Resets some fields of a prebuilt struct. The template is used in fast
 +retrieval of just those column values MySQL needs in its processing. */
 +inline
 +void
 +ha_innobase::reset_template(void)
 +/*=============================*/
 +{
 +	ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
 +	ut_ad(prebuilt->magic_n2 == prebuilt->magic_n);
 +
 +	/* Force table to be freed in close_thread_table(). */
 +	DBUG_EXECUTE_IF("free_table_in_fts_query",
 +		if (prebuilt->in_fts_query) {
 +			table->m_needs_reopen = true;
 +		}
 +	);
 +
 +	prebuilt->keep_other_fields_on_keyread = 0;
 +	prebuilt->read_just_key = 0;
 +	prebuilt->in_fts_query = 0;
 +	/* Reset index condition pushdown state. */
 +	if (prebuilt->idx_cond) {
 +		prebuilt->idx_cond = NULL;
 +		prebuilt->idx_cond_n_cols = 0;
 +		/* Invalidate prebuilt->mysql_template
 +		in ha_innobase::write_row(). */
 +		prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
 +	}
 +}
 +
 +/*****************************************************************//**
 +Call this when you have opened a new table handle in HANDLER, before you
 +call index_read_idx() etc. Actually, we can let the cursor stay open even
 +over a transaction commit! Then you should call this before every operation,
 +fetch next etc. This function inits the necessary things even after a
 +transaction commit. */
 +UNIV_INTERN
 +void
 +ha_innobase::init_table_handle_for_HANDLER(void)
 +/*============================================*/
 +{
 +	/* If current thd does not yet have a trx struct, create one.
 +	If the current handle does not yet have a prebuilt struct, create
 +	one. Update the trx pointers in the prebuilt struct. Normally
 +	this operation is done in external_lock. */
 +
 +	update_thd(ha_thd());
 +
 +	/* Initialize the prebuilt struct much like it would be inited in
 +	external_lock */
 +
 +	trx_search_latch_release_if_reserved(prebuilt->trx);
 +
 +	innobase_srv_conc_force_exit_innodb(prebuilt->trx);
 +
 +	/* If the transaction is not started yet, start it */
 +
 +	trx_start_if_not_started_xa(prebuilt->trx);
 +
 +	/* Assign a read view if the transaction does not have it yet */
 +
 +	trx_assign_read_view(prebuilt->trx);
 +
 +	innobase_register_trx(ht, user_thd, prebuilt->trx);
 +
 +	/* We did the necessary inits in this function, no need to repeat them
 +	in row_search_for_mysql */
 +
 +	prebuilt->sql_stat_start = FALSE;
 +
 +	/* We let HANDLER always to do the reads as consistent reads, even
 +	if the trx isolation level would have been specified as SERIALIZABLE */
 +
 +	prebuilt->select_lock_type = LOCK_NONE;
 +	prebuilt->stored_select_lock_type = LOCK_NONE;
 +
 +	/* Always fetch all columns in the index record */
 +
 +	prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
 +
 +	/* We want always to fetch all columns in the whole row? Or do
 +	we???? */
 +
 +	prebuilt->used_in_HANDLER = TRUE;
 +	reset_template();
 +}
 +
 +/****************************************************************//**
 +Gives the file extension of an InnoDB single-table tablespace. */
 +static const char* ha_innobase_exts[] = {
 +  ".ibd",
 +  ".isl",
 +  NullS
 +};
 +
 +/*********************************************************************//**
 +Opens an InnoDB database.
 +@return	0 on success, error code on failure */
 +static
 +int
 +innobase_init(
 +/*==========*/
 +	void	*p)	/*!< in: InnoDB handlerton */
 +{
 +	static char	current_dir[3];		/*!< Set if using current lib */
 +	int		err;
 +	bool		ret;
 +	char		*default_path;
 +	uint		format_id;
 +	ulong		num_pll_degree;
 +
 +	DBUG_ENTER("innobase_init");
 +	handlerton *innobase_hton= (handlerton*) p;
 +	innodb_hton_ptr = innobase_hton;
 +
 +	innobase_hton->state = SHOW_OPTION_YES;
 +	innobase_hton->db_type= DB_TYPE_INNODB;
 +	innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
 +	innobase_hton->close_connection = innobase_close_connection;
 +	innobase_hton->savepoint_set = innobase_savepoint;
 +	innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
 +	innobase_hton->savepoint_rollback_can_release_mdl =
 +				innobase_rollback_to_savepoint_can_release_mdl;
 +	innobase_hton->savepoint_release = innobase_release_savepoint;
 +	innobase_hton->commit_ordered=innobase_commit_ordered;
 +	innobase_hton->commit = innobase_commit;
 +	innobase_hton->rollback = innobase_rollback;
 +	innobase_hton->prepare = innobase_xa_prepare;
 +	innobase_hton->recover = innobase_xa_recover;
 +	innobase_hton->commit_by_xid = innobase_commit_by_xid;
 +	innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
 +        innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
 +        innobase_hton->checkpoint_state= innobase_checkpoint_state;
 +	innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
 +	innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
 +	innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
 +	innobase_hton->create = innobase_create_handler;
 +	innobase_hton->drop_database = innobase_drop_database;
 +	innobase_hton->panic = innobase_end;
 +
 +	innobase_hton->start_consistent_snapshot =
 +		innobase_start_trx_and_assign_read_view;
 +
 +	/*innobase_hton->store_binlog_info =
 +		innobase_store_binlog_info;*/
 +
 +	innobase_hton->flush_logs = innobase_flush_logs;
 +	innobase_hton->show_status = innobase_show_status;
 +	innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS |
 +		HTON_SUPPORTS_FOREIGN_KEYS;
 +
 +	innobase_hton->release_temporary_latches =
 +		innobase_release_temporary_latches;
 +
 +	innobase_hton->kill_query = innobase_kill_connection;
 +
 +        if (srv_file_per_table)
 +          innobase_hton->tablefile_extensions = ha_innobase_exts;
 +
 +#ifdef WITH_WSREP
 +        innobase_hton->wsrep_abort_transaction=wsrep_abort_transaction;
 +        innobase_hton->wsrep_set_checkpoint=innobase_wsrep_set_checkpoint;
 +        innobase_hton->wsrep_get_checkpoint=innobase_wsrep_get_checkpoint;
 +        innobase_hton->wsrep_fake_trx_id=wsrep_fake_trx_id;
 +#endif /* WITH_WSREP */
 +
 +	ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
 +
 +#ifndef DBUG_OFF
 +	static const char	test_filename[] = "-@";
 +	char			test_tablename[sizeof test_filename
 +				+ sizeof(srv_mysql50_table_name_prefix) - 1];
 +	if ((sizeof(test_tablename)) - 1
 +			!= filename_to_tablename(test_filename,
 +						 test_tablename,
 +						 sizeof(test_tablename), true)
 +			|| strncmp(test_tablename,
 +				   srv_mysql50_table_name_prefix,
 +				   sizeof(srv_mysql50_table_name_prefix) - 1)
 +			|| strcmp(test_tablename
 +				  + sizeof(srv_mysql50_table_name_prefix) - 1,
 +				  test_filename)) {
 +
 +		sql_print_error("tablename encoding has been changed");
 +
 +		goto error;
 +	}
 +#endif /* DBUG_OFF */
 +
 +	srv_log_block_size = 0;
 +	if (innobase_log_block_size != (1 << 9)) { /*!=512*/
 +		uint	n_shift;
 +
 +		fprintf(stderr,
 +			"InnoDB: Warning: innodb_log_block_size has been "
 +			"changed from default value 512. (###EXPERIMENTAL### "
 +			"operation)\n");
 +		for (n_shift = 9; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX;
 +		     n_shift++) {
 +			if (innobase_log_block_size == ((ulong)1 << n_shift)) {
 +				srv_log_block_size = (1 << n_shift);
 +				fprintf(stderr,
 +					"InnoDB: The log block size is set to "
 +					ULINTPF ".\n",srv_log_block_size);
 +				break;
 +			}
 +		}
 +	} else {
 +		srv_log_block_size = 512;
 +	}
 +	ut_ad (srv_log_block_size >= OS_MIN_LOG_BLOCK_SIZE);
 +
 +	if (!srv_log_block_size) {
 +		fprintf(stderr,
 +			"InnoDB: Error: %lu is not a valid value for "
 +			"innodb_log_block_size.\n"
 +			"InnoDB: Error: A valid value for "
 +			"innodb_log_block_size is\n"
 +			"InnoDB: Error: a power of 2 from 512 to 16384.\n",
 +			innobase_log_block_size);
 +		goto error;
 +	}
 +
 +	/* Check that values don't overflow on 32-bit systems. */
 +	if (sizeof(ulint) == 4) {
 +		if (innobase_buffer_pool_size > UINT_MAX32) {
 +			sql_print_error(
 +				"innobase_buffer_pool_size can't be over 4GB"
 +				" on 32-bit systems");
 +
 +			goto error;
 +		}
 +	}
 +
 +	os_innodb_umask = (ulint) my_umask;
 +
 +	/* First calculate the default path for innodb_data_home_dir etc.,
 +	in case the user has not given any value.
 +
 +	Note that when using the embedded server, the datadirectory is not
 +	necessarily the current directory of this program. */
 +
 +	if (mysqld_embedded) {
 +		default_path = mysql_real_data_home;
 +		fil_path_to_mysql_datadir = mysql_real_data_home;
 +	} else {
 +		/* It's better to use current lib, to keep paths short */
 +		current_dir[0] = FN_CURLIB;
 +		current_dir[1] = FN_LIBCHAR;
 +		current_dir[2] = 0;
 +		default_path = current_dir;
 +	}
 +
 +	ut_a(default_path);
 +
 +	/* Set InnoDB initialization parameters according to the values
 +	read from MySQL .cnf file */
 +
 +	/*--------------- Data files -------------------------*/
 +
 +	/* The default dir for data files is the datadir of MySQL */
 +
 +	srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
 +			 default_path);
 +
 +	/* Set default InnoDB data file size to 12 MB and let it be
 +	auto-extending. Thus users can use InnoDB in >= 4.0 without having
 +	to specify any startup options. */
 +
 +	if (!innobase_data_file_path) {
 +		innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
 +	}
 +
 +	/* Since InnoDB edits the argument in the next call, we make another
 +	copy of it: */
 +
 +	internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
 +						   MYF(MY_FAE));
 +
 +	ret = (bool) srv_parse_data_file_paths_and_sizes(
 +		internal_innobase_data_file_path);
 +	if (ret == FALSE) {
 +		sql_print_error(
 +			"InnoDB: syntax error in innodb_data_file_path"
 +			" or size specified is less than 1 megabyte");
 +mem_free_and_error:
 +		srv_free_paths_and_sizes();
 +		my_free(internal_innobase_data_file_path);
 +		goto error;
 +	}
 +
 +	/* -------------- All log files ---------------------------*/
 +
 +	/* The default dir for log files is the datadir of MySQL */
 +
 +	if (!srv_log_group_home_dir) {
 +		srv_log_group_home_dir = default_path;
 +	}
 +
 +#ifdef UNIV_LOG_ARCHIVE
 +	if (!innobase_log_arch_dir) {
 +		innobase_log_arch_dir = srv_log_group_home_dir;
 +	}
 +	srv_arch_dir = innobase_log_arch_dir;
 +#endif /* UNIG_LOG_ARCHIVE */
 +
 +	srv_normalize_path_for_win(srv_log_group_home_dir);
 +
 +	if (strchr(srv_log_group_home_dir, ';')) {
 +		sql_print_error("syntax error in innodb_log_group_home_dir");
 +		goto mem_free_and_error;
 +	}
 +
 +	if (innobase_mirrored_log_groups == 1) {
 +		sql_print_warning(
 +			"innodb_mirrored_log_groups is an unimplemented "
 +			"feature and the variable will be completely "
 +			"removed in a future version.");
 +	}
 +
 +	if (innobase_mirrored_log_groups > 1) {
 +		sql_print_error(
 +		"innodb_mirrored_log_groups is an unimplemented feature and "
 +		"the variable will be completely removed in a future version. "
 +		"Using values other than 1 is not supported.");
 +		goto mem_free_and_error;
 +	}
 +
 +	if (innobase_mirrored_log_groups == 0) {
 +		/* To throw a deprecation warning message when the option is
 +		passed, the default was changed to '0' (as a workaround). Since
 +		the only value accepted for this option is '1', reset it to 1 */
 +		innobase_mirrored_log_groups = 1;
 +	}
 +
 +	/* Validate the file format by animal name */
 +	if (innobase_file_format_name != NULL) {
 +
 +		format_id = innobase_file_format_name_lookup(
 +			innobase_file_format_name);
 +
 +		if (format_id > UNIV_FORMAT_MAX) {
 +
 +			sql_print_error("InnoDB: wrong innodb_file_format.");
 +
 +			goto mem_free_and_error;
 +		}
 +	} else {
 +		/* Set it to the default file format id. Though this
 +		should never happen. */
 +		format_id = 0;
 +	}
 +
 +	srv_file_format = format_id;
 +
 +	/* Given the type of innobase_file_format_name we have little
 +	choice but to cast away the constness from the returned name.
 +	innobase_file_format_name is used in the MySQL set variable
 +	interface and so can't be const. */
 +
 +	innobase_file_format_name =
 +		(char*) trx_sys_file_format_id_to_name(format_id);
 +
 +	/* Check innobase_file_format_check variable */
 +	if (!innobase_file_format_check) {
 +
 +		/* Set the value to disable checking. */
 +		srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
 +
 +	} else {
 +
 +		/* Set the value to the lowest supported format. */
 +		srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
 +	}
 +
 +	/* Did the user specify a format name that we support?
 +	As a side effect it will update the variable
 +	srv_max_file_format_at_startup */
 +	if (innobase_file_format_validate_and_set(
 +			innobase_file_format_max) < 0) {
 +
 +		sql_print_error("InnoDB: invalid "
 +				"innodb_file_format_max value: "
 +				"should be any value up to %s or its "
 +				"equivalent numeric id",
 +				trx_sys_file_format_id_to_name(
 +					UNIV_FORMAT_MAX));
 +
 +		goto mem_free_and_error;
 +	}
 +
 +	if (innobase_change_buffering) {
 +		ulint	use;
 +
 +		for (use = 0;
 +		     use < UT_ARR_SIZE(innobase_change_buffering_values);
 +		     use++) {
 +			if (!innobase_strcasecmp(
 +				    innobase_change_buffering,
 +				    innobase_change_buffering_values[use])) {
 +				ibuf_use = (ibuf_use_t) use;
 +				goto innobase_change_buffering_inited_ok;
 +			}
 +		}
 +
 +		sql_print_error("InnoDB: invalid value "
 +				"innodb_change_buffering=%s",
 +				innobase_change_buffering);
 +		goto mem_free_and_error;
 +	}
 +
 +innobase_change_buffering_inited_ok:
 +	ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
 +	innobase_change_buffering = (char*)
 +		innobase_change_buffering_values[ibuf_use];
 +
 +	/* Check that interdependent parameters have sane values. */
 +	if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
 +		sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
 +				  " cannot be set higher than"
 +				  " innodb_max_dirty_pages_pct.\n"
 +				  "InnoDB: Setting"
 +				  " innodb_max_dirty_pages_pct_lwm to %lf\n",
 +				  srv_max_buf_pool_modified_pct);
 +
 +		srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
 +	}
 +
 +	if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
 +
 +		if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
 +			/* Avoid overflow. */
 +			srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
 +		} else {
 +			/* The user has not set the value. We should
 +			set it based on innodb_io_capacity. */
 +			srv_max_io_capacity = static_cast<ulong>(
 +				ut_max(2 * srv_io_capacity, 2000));
 +		}
 +
 +	} else if (srv_max_io_capacity < srv_io_capacity) {
 +		sql_print_warning("InnoDB: innodb_io_capacity"
 +				  " cannot be set higher than"
 +				  " innodb_io_capacity_max.\n"
 +				  "InnoDB: Setting"
 +				  " innodb_io_capacity to %lu\n",
 +				  srv_max_io_capacity);
 +
 +		srv_io_capacity = srv_max_io_capacity;
 +	}
 +
 +	if (!is_filename_allowed(srv_buf_dump_filename,
 +				 strlen(srv_buf_dump_filename), FALSE)) {
 +		sql_print_error("InnoDB: innodb_buffer_pool_filename"
 +			" cannot have colon (:) in the file name.");
 +		goto mem_free_and_error;
 +	}
 +
 +	/* --------------------------------------------------*/
 +
 +	srv_file_flush_method_str = innobase_file_flush_method;
 +
 +	srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
 +
 +#ifdef UNIV_LOG_ARCHIVE
 +	srv_log_archive_on = (ulint) innobase_log_archive;
 +#endif /* UNIV_LOG_ARCHIVE */
 +
 +	/* Check that the value of system variable innodb_page_size was
 +	set correctly.  Its value was put into srv_page_size. If valid,
 +	return the associated srv_page_size_shift.*/
 +	srv_page_size_shift = innodb_page_size_validate(srv_page_size);
 +	if (!srv_page_size_shift) {
 +		sql_print_error("InnoDB: Invalid page size=%lu.\n",
 +				srv_page_size);
 +		goto mem_free_and_error;
 +	}
 +	if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: innodb-page-size has been changed"
 +			" from the default value %d to %lu.\n",
 +			UNIV_PAGE_SIZE_DEF, srv_page_size);
 +	}
 +
 +	srv_log_buffer_size = (ulint) innobase_log_buffer_size;
 +
 +	if (innobase_buffer_pool_instances == 0) {
 +		innobase_buffer_pool_instances = 8;
 +
 +#if defined(__WIN__) && !defined(_WIN64)
 +		if (innobase_buffer_pool_size > 1331 * 1024 * 1024) {
 +			innobase_buffer_pool_instances
 +				= ut_min(MAX_BUFFER_POOLS,
 +					(long) (innobase_buffer_pool_size
 +					/ (128 * 1024 * 1024)));
 +		}
 +#endif /* defined(__WIN__) && !defined(_WIN64) */
 +	}
 +	srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
 +	srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
 +
 +	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
 +
 +	if (innobase_additional_mem_pool_size
 +	    != 8*1024*1024L /* the default */ ) {
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Using "
 +			"innodb_additional_mem_pool_size is DEPRECATED. "
 +			"This option may be removed in future releases, "
 +			"together with the option innodb_use_sys_malloc "
 +			"and with the InnoDB's internal memory "
 +			"allocator.\n");
 +	}
 +
 +	if (!srv_use_sys_malloc ) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Setting "
 +			"innodb_use_sys_malloc to FALSE is DEPRECATED. "
 +			"This option may be removed in future releases, "
 +			"together with the InnoDB's internal memory "
 +			"allocator.\n");
 +	}
 +
 +	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
 +	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
 +	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
 +
 +	srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
 +
 +	if (!innobase_use_checksums) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Setting "
 +			"innodb_checksums to OFF is DEPRECATED. "
 +			"This option may be removed in future releases. "
 +			"You should set innodb_checksum_algorithm=NONE "
 +			"instead.\n");
 +		srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
 +	}
 +
 +	innodb_log_checksum_func_update(srv_log_checksum_algorithm);
 +
 +#ifdef HAVE_LARGE_PAGES
 +	if ((os_use_large_pages = (ibool) my_use_large_pages)) {
 +		os_large_page_size = (ulint) opt_large_page_size;
 +	}
 +#endif
 +
 +	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
 +
 +	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
 +	if (innobase_locks_unsafe_for_binlog) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			" InnoDB: Warning: Using "
 +			"innodb_locks_unsafe_for_binlog is DEPRECATED. "
 +			"This option may be removed in future releases. "
 +			"Please use READ COMMITTED transaction isolation "
 +			"level instead, see " REFMAN "set-transaction.html.\n");
 +	}
 +
 +	if (innobase_open_files < 10) {
 +		innobase_open_files = 300;
 +		if (srv_file_per_table && tc_size > 300) {
 +			innobase_open_files = tc_size;
 +		}
 +	}
 +
 +	if (innobase_open_files > (long) open_files_limit) {
 +		fprintf(stderr,
 +                       "innodb_open_files should not be greater"
 +                       " than the open_files_limit.\n");
 +		if (innobase_open_files > (long) tc_size) {
 +			innobase_open_files = tc_size;
 +		}
 +	}
 +
 +	srv_max_n_open_files = (ulint) innobase_open_files;
 +	srv_innodb_status = (ibool) innobase_create_status_file;
 +
 +	srv_print_verbose_log = mysqld_embedded ? 0 : 1;
 +
 +	/* Round up fts_sort_pll_degree to nearest power of 2 number */
 +	for (num_pll_degree = 1;
 +	     num_pll_degree < fts_sort_pll_degree;
 +	     num_pll_degree <<= 1) {
 +
 +		/* No op */
 +	}
 +
 +	fts_sort_pll_degree = num_pll_degree;
 +
 +	/* Store the default charset-collation number of this MySQL
 +	installation */
 +
 +	data_mysql_default_charset_coll = (ulint) default_charset_info->number;
 +
 +	ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
 +					my_charset_latin1.number);
 +	ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
 +
 +	/* Store the latin1_swedish_ci character ordering table to InnoDB. For
 +	non-latin1_swedish_ci charsets we use the MySQL comparison functions,
 +	and consequently we do not need to know the ordering internally in
 +	InnoDB. */
 +
 +	srv_latin1_ordering = my_charset_latin1.sort_order;
 +
 +	innobase_commit_concurrency_init_default();
 +
 +#ifdef HAVE_POSIX_FALLOCATE
 +	srv_use_posix_fallocate = (ibool) innobase_use_fallocate;
 +#endif
 +	/* Do not enable backoff algorithm for small buffer pool. */
 +	if (!innodb_empty_free_list_algorithm_allowed(
 +			static_cast<srv_empty_free_list_t>(
 +				srv_empty_free_list_algorithm))) {
 +		sql_print_information(
 +				"InnoDB: innodb_empty_free_list_algorithm "
 +				"has been changed to legacy "
 +				"because of small buffer pool size. "
 +				"In order to use backoff, "
 +				"increase buffer pool at least up to 20MB.\n");
 +			srv_empty_free_list_algorithm
 +				= SRV_EMPTY_FREE_LIST_LEGACY;
 +	}
 +
 +	srv_use_atomic_writes = (ibool) innobase_use_atomic_writes;
 +	if (innobase_use_atomic_writes) {
 +		ib_logf(IB_LOG_LEVEL_INFO, "using atomic writes.");
 +
 +		/* Force doublewrite buffer off, atomic writes replace it. */
 +		if (srv_use_doublewrite_buf) {
 +			ib_logf(IB_LOG_LEVEL_INFO, "switching off doublewrite "
 +				"buffer because of atomic writes.");
 +			innobase_use_doublewrite = FALSE;
 +			srv_use_doublewrite_buf	= FALSE;
 +		}
 +
 +		/* Force O_DIRECT on Unixes (on Windows writes are always
 +		unbuffered)*/
 +#ifndef _WIN32
 +		if(!innobase_file_flush_method ||
 +		   !strstr(innobase_file_flush_method, "O_DIRECT")) {
 +			innobase_file_flush_method =
 +				srv_file_flush_method_str = (char*)"O_DIRECT";
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"using O_DIRECT due to atomic writes.");
 +		}
 +#endif
 +#ifdef HAVE_POSIX_FALLOCATE
 +		/* Due to a bug in directFS, using atomics needs
 +		posix_fallocate() to extend the file, because pwrite() past the
 +		end of the file won't work */
 +		srv_use_posix_fallocate = TRUE;
 +#endif
 +	}
 +
 +#ifdef HAVE_PSI_INTERFACE
 +	/* Register keys with MySQL performance schema */
 +	int	count;
 +
 +	count = array_elements(all_pthread_mutexes);
 + 	mysql_mutex_register("innodb", all_pthread_mutexes, count);
 +
 +# ifdef UNIV_PFS_MUTEX
 +	count = array_elements(all_innodb_mutexes);
 +	mysql_mutex_register("innodb", all_innodb_mutexes, count);
 +# endif /* UNIV_PFS_MUTEX */
 +
 +# ifdef UNIV_PFS_RWLOCK
 +	count = array_elements(all_innodb_rwlocks);
 +	mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
 +# endif /* UNIV_PFS_MUTEX */
 +
 +# ifdef UNIV_PFS_THREAD
 +	count = array_elements(all_innodb_threads);
 +	mysql_thread_register("innodb", all_innodb_threads, count);
 +# endif /* UNIV_PFS_THREAD */
 +
 +# ifdef UNIV_PFS_IO
 +	count = array_elements(all_innodb_files);
 +	mysql_file_register("innodb", all_innodb_files, count);
 +# endif /* UNIV_PFS_IO */
 +
 +	count = array_elements(all_innodb_conds);
 +	mysql_cond_register("innodb", all_innodb_conds, count);
 +#endif /* HAVE_PSI_INTERFACE */
 +
 +	/* Since we in this module access directly the fields of a trx
 +	struct, and due to different headers and flags it might happen that
 +	ib_mutex_t has a different size in this module and in InnoDB
 +	modules, we check at run time that the size is the same in
 +	these compilation modules. */
 +
 +	err = innobase_start_or_create_for_mysql();
 +
 +	if (err != DB_SUCCESS) {
 +		goto mem_free_and_error;
 +	}
 +
 +	/* Adjust the innodb_undo_logs config object */
 +	innobase_undo_logs_init_default_max();
 +
 +	innobase_old_blocks_pct = static_cast<uint>(
 +		buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE));
 +
 +	ibuf_max_size_update(innobase_change_buffer_max_size);
 +
 +	innobase_open_tables = hash_create(200);
 +	mysql_mutex_init(innobase_share_mutex_key,
 +			 &innobase_share_mutex,
 +			 MY_MUTEX_INIT_FAST);
 +	mysql_mutex_init(commit_cond_mutex_key,
 +			 &commit_cond_m, MY_MUTEX_INIT_FAST);
 +	mysql_cond_init(commit_cond_key, &commit_cond, NULL);
 +	mysql_mutex_init(pending_checkpoint_mutex_key,
 +			 &pending_checkpoint_mutex,
 +			 MY_MUTEX_INIT_FAST);
 +	innodb_inited= 1;
 +#ifdef MYSQL_DYNAMIC_PLUGIN
 +	if (innobase_hton != p) {
 +		innobase_hton = reinterpret_cast<handlerton*>(p);
 +		*innobase_hton = *innodb_hton_ptr;
 +	}
 +#endif /* MYSQL_DYNAMIC_PLUGIN */
 +
 +	/* Get the current high water mark format. */
 +	innobase_file_format_max = (char*) trx_sys_file_format_max_get();
 +
 +	/* Currently, monitor counter information are not persistent. */
 +	memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
 +
 +	memset(innodb_counter_value, 0, sizeof innodb_counter_value);
 +
 +	/* Do this as late as possible so server is fully starts up,
 +	since  we might get some initial stats if user choose to turn
 +	on some counters from start up */
 +	if (innobase_enable_monitor_counter) {
 +		innodb_enable_monitor_at_startup(
 +			innobase_enable_monitor_counter);
 +	}
 +
 +	/* Turn on monitor counters that are default on */
 +	srv_mon_default_on();
 +
 +	DBUG_RETURN(FALSE);
 +error:
 +	DBUG_RETURN(TRUE);
 +}
 +
 +/** Shut down the InnoDB storage engine.
 +@return	0 */
 +static
 +int
 +innobase_end(handlerton*, ha_panic_function)
  {
 -	if (thd) {
 -		thd_storage_lock_wait((THD*)thd, value);
 +	DBUG_ENTER("innobase_end");
 +
 +	if (innodb_inited) {
 +
 +		THD *thd= current_thd;
 +		if (thd) { // may be UNINSTALL PLUGIN statement
 +		 	trx_t* trx = thd_to_trx(thd);
 +		 	if (trx) {
 +		 		trx_free_for_mysql(trx);
 +		 	}
 +		}
 +
 +		srv_fast_shutdown = (ulint) innobase_fast_shutdown;
 +
 +		innodb_inited = 0;
 +		hash_table_free(innobase_open_tables);
 +		innobase_open_tables = NULL;
 +		innodb_shutdown();
 +		srv_free_paths_and_sizes();
 +		my_free(internal_innobase_data_file_path);
 +		mysql_mutex_destroy(&innobase_share_mutex);
 +		mysql_mutex_destroy(&commit_cond_m);
 +		mysql_cond_destroy(&commit_cond);
 +		mysql_mutex_destroy(&pending_checkpoint_mutex);
  	}
 +
 +	DBUG_RETURN(0);
  }
  
 -/******************************************************************//**
 -*/
 -extern "C" UNIV_INTERN
 -ulong
 -thd_flush_log_at_trx_commit(
 -/*================================*/
 -	void*	thd)
 +/****************************************************************//**
 +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
 +the logs, and the name of this function should be innobase_checkpoint.
 +@return	TRUE if error */
 +static
 +bool
 +innobase_flush_logs(
 +/*================*/
 +	handlerton*	hton)	/*!< in/out: InnoDB handlerton */
  {
 -	return(THDVAR((THD*) thd, flush_log_at_trx_commit));
 +	bool	result = 0;
 +
 +	DBUG_ENTER("innobase_flush_logs");
 +	DBUG_ASSERT(hton == innodb_hton_ptr);
 +
 +	if (!srv_read_only_mode) {
 +		log_buffer_flush_to_disk();
 +	}
 +
 +	DBUG_RETURN(result);
  }
  
 -/******************************************************************//**
 -Returns the merge-sort block size used for the secondary index creation
 -for the current connection.
 -@return	the merge-sort block size, in bytes */
 -extern "C" UNIV_INTERN
 -ulong
 -thd_merge_sort_block_size(
 -/*================================*/
 -	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
 -+			the global merge_sort_block_size */
 +/************************************************************//**
 +Synchronously read and parse the redo log up to the last
 +checkpoint to write the changed page bitmap.
 +@return 0 to indicate success.  Current implementation cannot fail. */
 +static
 +my_bool
 +innobase_flush_changed_page_bitmaps()
 +/*=================================*/
  {
 -	return(THDVAR((THD*) thd, merge_sort_block_size));
 +	if (srv_track_changed_pages) {
 +		os_event_reset(srv_checkpoint_completed_event);
 +		log_online_follow_redo_log();
 +	}
 +	return FALSE;
  }
  
 -/********************************************************************//**
 -Obtain the InnoDB transaction of a MySQL thread.
 -@return	reference to transaction pointer */
 -static inline
 -trx_t*&
 -thd_to_trx(
 -/*=======*/
 -	THD*	thd)	/*!< in: MySQL thread */
 +/************************************************************//**
 +Delete all the bitmap files for data less than the specified LSN.
 +If called with lsn == IB_ULONGLONG_MAX (i.e. set by RESET request),
 +restart the bitmap file sequence, otherwise continue it.
 +@return 0 to indicate success, 1 for failure. */
 +static
 +my_bool
 +innobase_purge_changed_page_bitmaps(
 +/*================================*/
 +	ulonglong lsn)	/*!< in: LSN to purge files up to */
  {
 -	return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
 +	return (my_bool)log_online_purge_changed_page_bitmaps(lsn);
  }
 -#ifdef WITH_WSREP
 -ulonglong
 -thd_to_trx_id(
 -/*=======*/
 -	THD*	thd)	/*!< in: MySQL thread */
 +
 +/*****************************************************************//**
 +Commits a transaction in an InnoDB database. */
 +static
 +void
 +innobase_commit_low(
 +/*================*/
 +	trx_t*	trx)	/*!< in: transaction handle */
  {
 -	return(thd_to_trx(thd)->id);
 +#ifdef WITH_WSREP
 +	THD* thd = (THD*)trx->mysql_thd;
 +	const char* tmp = 0;
 +	if (wsrep_on((void*)thd)) {
 +#ifdef WSREP_PROC_INFO
 +		char info[64];
 +		info[sizeof(info) - 1] = '\0';
 +		snprintf(info, sizeof(info) - 1,
 +			 "innobase_commit_low():trx_commit_for_mysql(%lld)",
 +			 (long long) wsrep_thd_trx_seqno(thd));
 +		tmp = thd_proc_info(thd, info);
 +
 +#else
 +		tmp = thd_proc_info(thd, "innobase_commit_low()");
 +#endif /* WSREP_PROC_INFO */
 +	}
 +#endif /* WITH_WSREP */
 +	if (trx_is_started(trx)) {
 +
 +		trx_commit_for_mysql(trx);
 +	}
 +#ifdef WITH_WSREP
 +	if (wsrep_on((void*)thd)) { thd_proc_info(thd, tmp); }
 +#endif /* WITH_WSREP */
  }
 -#endif
  
 -my_bool
 -ha_innobase::is_fake_change_enabled(THD* thd)
 +#if NOT_USED
 +/*****************************************************************//**
 +Stores the current binlog coordinates in the trx system header. */
 +static
 +int
 +innobase_store_binlog_info(
 +/*=======================*/
 +	handlerton*	hton,	/*!< in: InnoDB handlerton */
 +	THD*		thd)	/*!< in: MySQL thread handle */
 +
  {
 -	trx_t*	trx	= thd_to_trx(thd);
 -	return(trx && UNIV_UNLIKELY(trx->fake_changes));
 +	const char*			file_name;
 +	unsigned long long 	pos;
 +	mtr_t			mtr;
 +
 +	DBUG_ENTER("innobase_store_binlog_info");
 +
 +	thd_binlog_pos(thd, &file_name, &pos);
 +
 +	mtr_start(&mtr);
 +
 +	trx_sys_update_mysql_binlog_offset(file_name, pos,
 +					   TRX_SYS_MYSQL_LOG_INFO, &mtr);
 +
 +	mtr_commit(&mtr);
 +
 +	innobase_flush_logs(hton);
 +
 +	DBUG_RETURN(0);
  }
 +#endif
  
 -/********************************************************************//**
 -Call this function when mysqld passes control to the client. That is to
 -avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
 -documentation, see handler.cc.
 +/*****************************************************************//**
 +Creates an InnoDB transaction struct for the thd if it does not yet have one.
 +Starts a new InnoDB transaction if a transaction is not yet started. And
 +assigns a new snapshot for a consistent read if the transaction does not yet
 +have one.
  @return	0 */
  static
  int
@@@ -8000,2081 -4788,1847 +8004,2079 @@@ ha_innobase::build_template
  		}
  	}
  
 -	dict_table_autoinc_initialize(prebuilt->table, auto_inc);
 -}
 +	clust_index = dict_table_get_first_index(prebuilt->table);
  
 -/*****************************************************************//**
 -Creates and opens a handle to a table which already exists in an InnoDB
 -database.
 -@return	1 if error, 0 if success */
 -UNIV_INTERN
 -int
 -ha_innobase::open(
 -/*==============*/
 -	const char*		name,		/*!< in: table name */
 -	int			mode,		/*!< in: not used */
 -	uint			test_if_locked)	/*!< in: not used */
 -{
 -	dict_table_t*		ib_table;
 -	char			norm_name[1000];
 -	THD*			thd;
 -	char*			is_part = NULL;
 -	ibool			par_case_name_set = FALSE;
 -	char			par_case_name[MAX_FULL_NAME_LEN + 1];
 -	dict_err_ignore_t	ignore_err = DICT_ERR_IGNORE_NONE;
 +	index = whole_row ? clust_index : prebuilt->index;
  
 -	DBUG_ENTER("ha_innobase::open");
 +	prebuilt->need_to_access_clustered = (index == clust_index);
  
 -	UT_NOT_USED(mode);
 -	UT_NOT_USED(test_if_locked);
 +	/* Either prebuilt->index should be a secondary index, or it
 +	should be the clustered index. */
 +	ut_ad(dict_index_is_clust(index) == (index == clust_index));
  
 -	thd = ha_thd();
 +	/* Below we check column by column if we need to access
 +	the clustered index. */
 +
 +	n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
  
 -	/* Under some cases MySQL seems to call this function while
 -	holding btr_search_latch. This breaks the latching order as
 -	we acquire dict_sys->mutex below and leads to a deadlock. */
 -	if (thd != NULL) {
 -		innobase_release_temporary_latches(ht, thd);
 +	if (!prebuilt->mysql_template) {
 +		prebuilt->mysql_template = (mysql_row_templ_t*)
 +			mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
  	}
  
 -	normalize_table_name(norm_name, name);
 +	prebuilt->template_type = whole_row
 +		? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
 +	prebuilt->null_bitmap_len = table->s->null_bytes;
  
 -	user_thd = NULL;
 +	/* Prepare to build prebuilt->mysql_template[]. */
 +	prebuilt->templ_contains_blob = FALSE;
 +	prebuilt->mysql_prefix_len = 0;
 +	prebuilt->n_template = 0;
 +	prebuilt->idx_cond_n_cols = 0;
  
 -	if (!(share=get_share(name))) {
 +	/* Note that in InnoDB, i is the column number in the table.
 +	MySQL calls columns 'fields'. */
  
 -		DBUG_RETURN(1);
 -	}
 +	if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
 +		/* Push down an index condition or an end_range check. */
 +		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
  
 -	if (UNIV_UNLIKELY(share->ib_table &&
 -			 share->ib_table->is_corrupt &&
 -			 srv_pass_corrupt_table <= 1)) {
 -		free_share(share);
 +			while (!table->field[sql_idx]->stored_in_db) {
 +				sql_idx++;
 +			}
  
 -		DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 -	}
 +			const ibool		index_contains
 +				= dict_index_contains_col_or_prefix(index, i);
  
 -	/* Will be allocated if it is needed in ::update_row() */
 -	upd_buf = NULL;
 -	upd_buf_size = 0;
 +			/* Test if an end_range or an index condition
 +			refers to the field. Note that "index" and
 +			"index_contains" may refer to the clustered index.
 +			Index condition pushdown is relative to prebuilt->index
 +			(the index that is being looked up first). */
  
 -	/* We look for pattern #P# to see if the table is partitioned
 -	MySQL table. */
 -#ifdef __WIN__
 -	is_part = strstr(norm_name, "#p#");
 -#else
 -	is_part = strstr(norm_name, "#P#");
 -#endif /* __WIN__ */
 +			/* When join_read_always_key() invokes this
 +			code via handler::ha_index_init() and
 +			ha_innobase::index_init(), end_range is not
 +			yet initialized. Because of that, we must
 +			always check for index_contains, instead of
 +			the subset
 +			field->part_of_key.is_set(active_index)
 +			which would be acceptable if end_range==NULL. */
 +			if (build_template_needs_field_in_icp(
 +				    index, prebuilt, index_contains, i)) {
 +				/* Needed in ICP */
 +				const Field*		field;
 +				mysql_row_templ_t*	templ;
  
 -	/* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table
 -	can be opened even if some FK indexes are missing. If not, the table
 -	can't be opened in the same situation */
 -	if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) {
 -		ignore_err = DICT_ERR_IGNORE_FK_NOKEY;
 -	}
 +				if (whole_row) {
 +					field = table->field[sql_idx];
 +				} else {
 +					field = build_template_needs_field(
 +						index_contains,
 +						prebuilt->read_just_key,
 +						fetch_all_in_key,
 +						fetch_primary_key_cols,
 +						index, table, i, sql_idx);
 +					if (!field) {
 +						continue;
 +					}
 +				}
 +
 +				templ = build_template_field(
 +					prebuilt, clust_index, index,
 +					table, field, i);
 +				prebuilt->idx_cond_n_cols++;
 +				ut_ad(prebuilt->idx_cond_n_cols
 +				      == prebuilt->n_template);
 +
 +				if (index == prebuilt->index) {
 +					templ->icp_rec_field_no
 +						= templ->rec_field_no;
 +				} else {
 +					templ->icp_rec_field_no
 +						= dict_index_get_nth_col_pos(
 +							prebuilt->index, i,
 +							NULL);
 +				}
 +
 +				if (dict_index_is_clust(prebuilt->index)) {
 +					ut_ad(templ->icp_rec_field_no
 +					      != ULINT_UNDEFINED);
 +					/* If the primary key includes
 +					a column prefix, use it in
 +					index condition pushdown,
 +					because the condition is
 +					evaluated before fetching any
 +					off-page (externally stored)
 +					columns. */
 +					if (templ->icp_rec_field_no
 +					    < prebuilt->index->n_uniq) {
 +						/* This is a key column;
 +						all set. */
 +						continue;
 +					}
 +				} else if (templ->icp_rec_field_no
 +					   != ULINT_UNDEFINED) {
 +					continue;
 +				}
 +
 +				/* This is a column prefix index.
 +				The column prefix can be used in
 +				an end_range comparison. */
 +
 +				templ->icp_rec_field_no
 +					= dict_index_get_nth_col_or_prefix_pos(
 +						prebuilt->index, i, TRUE, NULL);
 +				ut_ad(templ->icp_rec_field_no
 +				      != ULINT_UNDEFINED);
 +
 +				/* Index condition pushdown can be used on
 +				all columns of a secondary index, and on
 +				the PRIMARY KEY columns. On the clustered
 +				index, it must never be used on other than
 +				PRIMARY KEY columns, because those columns
 +				may be stored off-page, and we will not
 +				fetch externally stored columns before
 +				checking the index condition. */
 +				/* TODO: test the above with an assertion
 +				like this. Note that index conditions are
 +				currently pushed down as part of the
 +				"optimizer phase" while end_range is done
 +				as part of the execution phase. Therefore,
 +				we were unable to use an accurate condition
 +				for end_range in the "if" condition above,
 +				and the following assertion would fail.
 +				ut_ad(!dict_index_is_clust(prebuilt->index)
 +				      || templ->rec_field_no
 +				      < prebuilt->index->n_uniq);
 +				*/
 +			}
 +		}
 +
 +		ut_ad(prebuilt->idx_cond_n_cols > 0);
 +		ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
  
 -	/* Get pointer to a table object in InnoDB dictionary cache */
 -	ib_table = dict_table_get(norm_name, TRUE, ignore_err);
 +		/* Include the fields that are not needed in index condition
 +		pushdown. */
 +		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
  
 -	if (UNIV_UNLIKELY(ib_table &&
 -			 ib_table->is_corrupt &&
 -			 srv_pass_corrupt_table <= 1)) {
 -		free_share(share);
 -		my_free(upd_buf);
 -		upd_buf = NULL;
 -		upd_buf_size = 0;
 +			while (!table->field[sql_idx]->stored_in_db) {
 +				sql_idx++;
 +			}
  
 -		DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 -	}
 +			const ibool		index_contains
 +				= dict_index_contains_col_or_prefix(index, i);
  
 -	share->ib_table = ib_table;
 +			if (!build_template_needs_field_in_icp(
 +				    index, prebuilt, index_contains, i)) {
 +				/* Not needed in ICP */
 +				const Field*	field;
  
 -	if (NULL == ib_table) {
 -		if (is_part) {
 -			/* MySQL partition engine hard codes the file name
 -			separator as "#P#". The text case is fixed even if
 -			lower_case_table_names is set to 1 or 2. This is true
 -			for sub-partition names as well. InnoDB always
 -			normalises file names to lower case on Windows, this
 -			can potentially cause problems when copying/moving
 -			tables between platforms.
 +				if (whole_row) {
 +					field = table->field[sql_idx];
 +				} else {
 +					field = build_template_needs_field(
 +						index_contains,
 +						prebuilt->read_just_key,
 +						fetch_all_in_key,
 +						fetch_primary_key_cols,
 +						index, table, i, sql_idx);
 +					if (!field) {
 +						continue;
 +					}
 +				}
  
 -			1) If boot against an installation from Windows
 -			platform, then its partition table name could
 -			be all be in lower case in system tables. So we
 -			will need to check lower case name when load table.
 +				build_template_field(prebuilt,
 +						     clust_index, index,
 +						     table, field, i);
 +			}
 +		}
  
 -			2) If  we boot an installation from other case
 -			sensitive platform in Windows, we might need to
 -			check the existence of table name without lowering
 -			case them in the system table. */
 -			if (innobase_get_lower_case_table_names() == 1) {
 +		prebuilt->idx_cond = this;
 +	} else {
 +		/* No index condition pushdown */
 +		prebuilt->idx_cond = NULL;
  
 -				if (!par_case_name_set) {
 -#ifndef __WIN__
 -					/* Check for the table using lower
 -					case name, including the partition
 -					separator "P" */
 -					memcpy(par_case_name, norm_name,
 -					       strlen(norm_name));
 -					par_case_name[strlen(norm_name)] = 0;
 -					innobase_casedn_str(par_case_name);
 -#else
 -					/* On Windows platfrom, check
 -					whether there exists table name in
 -					system table whose name is
 -					not being normalized to lower case */
 -					normalize_table_name_low(
 -						par_case_name, name, FALSE);
 -#endif
 -					par_case_name_set = TRUE;
 -				}
 +                for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 +			const Field*	field;
  
 -				ib_table = dict_table_get(
 -					par_case_name, TRUE, ignore_err);
 +			while (!table->field[sql_idx]->stored_in_db) {
 +				sql_idx++;
  			}
  
 -			if (ib_table) {
 -#ifndef __WIN__
 -				sql_print_warning("Partition table %s opened "
 -						  "after converting to lower "
 -						  "case. The table may have "
 -						  "been moved from a case "
 -						  "in-sensitive file system. "
 -						  "Please recreate table in "
 -						  "the current file system\n",
 -						  norm_name);
 -#else
 -				sql_print_warning("Partition table %s opened "
 -						  "after skipping the step to "
 -						  "lower case the table name. "
 -						  "The table may have been "
 -						  "moved from a case sensitive "
 -						  "file system. Please "
 -						  "recreate table in the "
 -						  "current file system\n",
 -						  norm_name);
 -#endif
 -				/* We allow use of table if it is found.
 -				this is consistent to current behavior
 -				to innodb_plugin */
 -				share->ib_table = ib_table;
 -				goto table_opened;
 +			if (whole_row) {
 +				field = table->field[sql_idx];
 +			} else {
 +				field = build_template_needs_field(
 +					dict_index_contains_col_or_prefix(
 +						index, i),
 +					prebuilt->read_just_key,
 +					fetch_all_in_key,
 +					fetch_primary_key_cols,
 +					index, table, i, sql_idx);
 +				if (!field) {
 +					continue;
 +				}
  			}
 -		}
  
 -		if (is_part) {
 -			sql_print_error("Failed to open table %s.\n",
 -					norm_name);
 +			build_template_field(prebuilt, clust_index, index,
 +					     table, field, i);
  		}
 -
 -		sql_print_error("Cannot find or open table %s from\n"
 -				"the internal data dictionary of InnoDB "
 -				"though the .frm file for the\n"
 -				"table exists. Maybe you have deleted and "
 -				"recreated InnoDB data\n"
 -				"files but have forgotten to delete the "
 -				"corresponding .frm files\n"
 -				"of InnoDB tables, or you have moved .frm "
 -				"files to another database?\n"
 -				"or, the table contains indexes that this "
 -				"version of the engine\n"
 -				"doesn't support.\n"
 -				"See " REFMAN "innodb-troubleshooting.html\n"
 -				"how you can resolve the problem.\n",
 -				norm_name);
 -		free_share(share);
 -		my_errno = ENOENT;
 -
 -		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
  	}
  
 -table_opened:
 +	if (index != clust_index && prebuilt->need_to_access_clustered) {
 +		/* Change rec_field_no's to correspond to the clustered index
 +		record */
 +		for (i = 0; i < prebuilt->n_template; i++) {
  
 -	if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
 -		sql_print_error("MySQL is trying to open a table handle but "
 -				"the .ibd file for\ntable %s does not exist.\n"
 -				"Have you deleted the .ibd file from the "
 -				"database directory under\nthe MySQL datadir, "
 -				"or have you used DISCARD TABLESPACE?\n"
 -				"See " REFMAN "innodb-troubleshooting.html\n"
 -				"how you can resolve the problem.\n",
 -				norm_name);
 -		free_share(share);
 -		my_errno = ENOENT;
 +			mysql_row_templ_t*	templ
 +				= &prebuilt->mysql_template[i];
  
 -		dict_table_decrement_handle_count(ib_table, FALSE);
 -		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
 +			templ->rec_field_no = templ->clust_rec_field_no;
 +		}
  	}
 +}
  
 -	prebuilt = row_create_prebuilt(ib_table, table->s->stored_rec_length);
 -
 -	prebuilt->default_rec = table->s->default_values;
 -	ut_ad(prebuilt->default_rec);
 +/********************************************************************//**
 +This special handling is really to overcome the limitations of MySQL's
 +binlogging. We need to eliminate the non-determinism that will arise in
 +INSERT ... SELECT type of statements, since MySQL binlog only stores the
 +min value of the autoinc interval. Once that is fixed we can get rid of
 +the special lock handling.
 +@return	DB_SUCCESS if all OK else error code */
 +UNIV_INTERN
 +dberr_t
 +ha_innobase::innobase_lock_autoinc(void)
 +/*====================================*/
 +{
 +	DBUG_ENTER("ha_innobase::innobase_lock_autoinc");
 +	dberr_t		error = DB_SUCCESS;
  
 -	/* Looks like MySQL-3.23 sometimes has primary key number != 0 */
 +	ut_ad(!srv_read_only_mode);
  
 -	primary_key = table->s->primary_key;
 -	key_used_on_scan = primary_key;
 +	switch (innobase_autoinc_lock_mode) {
 +	case AUTOINC_NO_LOCKING:
 +		/* Acquire only the AUTOINC mutex. */
 +		dict_table_autoinc_lock(prebuilt->table);
 +		break;
  
 -	if (!innobase_build_index_translation(table, ib_table, share)) {
 -		  sql_print_error("Build InnoDB index translation table for"
 -				  " Table %s failed", name);
 -	}
 +	case AUTOINC_NEW_STYLE_LOCKING:
 +		/* For simple (single/multi) row INSERTs/REPLACEs and RBR
 +		events, we fall back to the old style only if another
 +		transaction has already acquired the AUTOINC lock on
 +		behalf of a LOAD FILE or INSERT ... SELECT etc. type of
 +		statement. */
 +		if (thd_sql_command(user_thd) == SQLCOM_INSERT
 +		    || thd_sql_command(user_thd) == SQLCOM_REPLACE
 +		    || thd_sql_command(user_thd) == SQLCOM_END // RBR event
 +		) {
 +			dict_table_t*	ib_table = prebuilt->table;
  
 -	/* Allocate a buffer for a 'row reference'. A row reference is
 -	a string of bytes of length ref_length which uniquely specifies
 -	a row in our table. Note that MySQL may also compare two row
 -	references for equality by doing a simple memcmp on the strings
 -	of length ref_length! */
 +			/* Acquire the AUTOINC mutex. */
 +			dict_table_autoinc_lock(ib_table);
  
 -	if (!row_table_got_default_clust_index(ib_table)) {
 +			/* We need to check that another transaction isn't
 +			already holding the AUTOINC lock on the table. */
 +			if (ib_table->n_waiting_or_granted_auto_inc_locks) {
 +				/* Release the mutex to avoid deadlocks and
 +				fall back to old style locking. */
 +				dict_table_autoinc_unlock(ib_table);
 +			} else {
 +				/* Do not fall back to old style locking. */
 +				break;
 +			}
 +		}
 +		/* Use old style locking. */
 +		/* fall through */
 +	case AUTOINC_OLD_STYLE_LOCKING:
 +		DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used",
 +				ut_ad(0););
 +		error = row_lock_table_autoinc_for_mysql(prebuilt);
  
 -		prebuilt->clust_index_was_generated = FALSE;
 +		if (error == DB_SUCCESS) {
  
 -		if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
 -			sql_print_error("Table %s has a primary key in "
 -					"InnoDB data dictionary, but not "
 -					"in MySQL!", name);
 +			/* Acquire the AUTOINC mutex. */
 +			dict_table_autoinc_lock(prebuilt->table);
 +		}
 +		break;
  
 -			/* This mismatch could cause further problems
 -			if not attended, bring this to the user's attention
 -			by printing a warning in addition to log a message
 -			in the errorlog */
 -			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
 -					    ER_NO_SUCH_INDEX,
 -					    "InnoDB: Table %s has a "
 -					    "primary key in InnoDB data "
 -					    "dictionary, but not in "
 -					    "MySQL!", name);
 +	default:
 +		ut_error;
 +	}
  
 -			/* If primary_key >= MAX_KEY, its (primary_key)
 -			value could be out of bound if continue to index
 -			into key_info[] array. Find InnoDB primary index,
 -			and assign its key_length to ref_length.
 -			In addition, since MySQL indexes are sorted starting
 -			with primary index, unique index etc., initialize
 -			ref_length to the first index key length in
 -			case we fail to find InnoDB cluster index.
 +	DBUG_RETURN(error);
 +}
  
 -			Please note, this will not resolve the primary
 -			index mismatch problem, other side effects are
 -			possible if users continue to use the table.
 -			However, we allow this table to be opened so
 -			that user can adopt necessary measures for the
 -			mismatch while still being accessible to the table
 -			date. */
 -			ref_length = table->key_info[0].key_length;
 +/********************************************************************//**
 +Reset the autoinc value in the table.
 +@return	DB_SUCCESS if all went well else error code */
 +UNIV_INTERN
 +dberr_t
 +ha_innobase::innobase_reset_autoinc(
 +/*================================*/
 +	ulonglong	autoinc)	/*!< in: value to store */
 +{
 +	dberr_t		error;
  
 -			/* Find correspoinding cluster index
 -			key length in MySQL's key_info[] array */
 -			for (ulint i = 0; i < table->s->keys; i++) {
 -				dict_index_t*	index;
 -				index = innobase_get_index(i);
 -				if (dict_index_is_clust(index)) {
 -					ref_length =
 -						 table->key_info[i].key_length;
 -				}
 -			}
 -		} else {
 -			/* MySQL allocates the buffer for ref.
 -			key_info->key_length includes space for all key
 -			columns + one byte for each column that may be
 -			NULL. ref_length must be as exact as possible to
 -			save space, because all row reference buffers are
 -			allocated based on ref_length. */
 +	error = innobase_lock_autoinc();
  
 -			ref_length = table->key_info[primary_key].key_length;
 -		}
 -	} else {
 -		if (primary_key != MAX_KEY) {
 -			sql_print_error(
 -				"Table %s has no primary key in InnoDB data "
 -				"dictionary, but has one in MySQL! If you "
 -				"created the table with a MySQL version < "
 -				"3.23.54 and did not define a primary key, "
 -				"but defined a unique key with all non-NULL "
 -				"columns, then MySQL internally treats that "
 -				"key as the primary key. You can fix this "
 -				"error by dump + DROP + CREATE + reimport "
 -				"of the table.", name);
 +	if (error == DB_SUCCESS) {
  
 -			/* This mismatch could cause further problems
 -			if not attended, bring this to the user attention
 -			by printing a warning in addition to log a message
 -			in the errorlog */
 -			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
 -					    ER_NO_SUCH_INDEX,
 -					    "InnoDB: Table %s has no "
 -					    "primary key in InnoDB data "
 -					    "dictionary, but has one in "
 -					    "MySQL!", name);
 -		}
 +		dict_table_autoinc_initialize(prebuilt->table, autoinc);
  
 -		prebuilt->clust_index_was_generated = TRUE;
 +		dict_table_autoinc_unlock(prebuilt->table);
 +	}
  
 -		ref_length = DATA_ROW_ID_LEN;
 +	return(error);
 +}
  
 -		/* If we automatically created the clustered index, then
 -		MySQL does not know about it, and MySQL must NOT be aware
 -		of the index used on scan, to make it avoid checking if we
 -		update the column of the index. That is why we assert below
 -		that key_used_on_scan is the undefined value MAX_KEY.
 -		The column is the row id in the automatical generation case,
 -		and it will never be updated anyway. */
 +/********************************************************************//**
 +Store the autoinc value in the table. The autoinc value is only set if
 +it's greater than the existing autoinc value in the table.
 +@return	DB_SUCCESS if all went well else error code */
 +UNIV_INTERN
 +dberr_t
 +ha_innobase::innobase_set_max_autoinc(
 +/*==================================*/
 +	ulonglong	auto_inc)	/*!< in: value to store */
 +{
 +	dberr_t		error;
  
 -		if (key_used_on_scan != MAX_KEY) {
 -			sql_print_warning(
 -				"Table %s key_used_on_scan is %lu even "
 -				"though there is no primary key inside "
 -				"InnoDB.", name, (ulong) key_used_on_scan);
 -		}
 +	error = innobase_lock_autoinc();
 +
 +	if (error == DB_SUCCESS) {
 +
 +		dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
 +
 +		dict_table_autoinc_unlock(prebuilt->table);
  	}
  
 -	/* Index block size in InnoDB: used by MySQL in query optimization */
 -	stats.block_size = 16 * 1024;
 +	return(error);
 +}
  
 -	/* Init table lock structure */
 -	thr_lock_data_init(&share->lock,&lock,(void*) 0);
 +/********************************************************************//**
 +Stores a row in an InnoDB database, to the table specified in this
 +handle.
 +@return	error code */
 +UNIV_INTERN
 +int
 +ha_innobase::write_row(
 +/*===================*/
 +	uchar*	record)	/*!< in: a row in MySQL format */
 +{
 +	dberr_t		error;
 +	int		error_result= 0;
 +	ibool		auto_inc_used= FALSE;
 +#ifdef WITH_WSREP
 +	ibool           auto_inc_inserted= FALSE; /* if NULL was inserted */
 +#endif
 +	ulint		sql_command;
 +	trx_t*		trx = thd_to_trx(user_thd);
  
 -	if (prebuilt->table) {
 -		/* We update the highest file format in the system table
 -		space, if this table has higher file format setting. */
 +	DBUG_ENTER("ha_innobase::write_row");
  
 -		trx_sys_file_format_max_upgrade(
 -			(const char**) &innobase_file_format_max,
 -			dict_table_get_format(prebuilt->table));
 +	if (high_level_read_only) {
 +		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
 +		DBUG_RETURN(HA_ERR_TABLE_READONLY);
 +	} else if (prebuilt->trx != trx) {
 +		sql_print_error("The transaction object for the table handle "
 +				"is at %p, but for the current thread it is at "
 +				"%p",
 +				(const void*) prebuilt->trx, (const void*) trx);
 +
 +		fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
 +		ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
 +		fputs("\n"
 +			"InnoDB: Dump of 200 bytes around ha_data: ",
 +			stderr);
 +		ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
 +		putc('\n', stderr);
 +		ut_error;
 +	} else if (!trx_is_started(trx)) {
 +		++trx->will_lock;
  	}
  
 -	/* Only if the table has an AUTOINC column. */
 -	if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
 -		dict_table_autoinc_lock(prebuilt->table);
 +	ha_statistic_increment(&SSV::ha_write_count);
  
 -		/* Since a table can already be "open" in InnoDB's internal
 -		data dictionary, we only init the autoinc counter once, the
 -		first time the table is loaded. We can safely reuse the
 -		autoinc value from a previous MySQL open. */
 -		if (dict_table_autoinc_read(prebuilt->table) == 0) {
 +	if (share->ib_table != prebuilt->table) {
 +		fprintf(stderr,
 +			"InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
 +			share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
 +	}
  
 -			innobase_initialize_autoinc();
 +	if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
 +
 +	sql_command = thd_sql_command(user_thd);
 +
 +	if ((sql_command == SQLCOM_ALTER_TABLE
 +	     || sql_command == SQLCOM_OPTIMIZE
 +	     || sql_command == SQLCOM_CREATE_INDEX
 +#ifdef WITH_WSREP
 +	     || (wsrep_on(user_thd) && wsrep_load_data_splitting &&
 +		 sql_command == SQLCOM_LOAD                      &&
 +		 !thd_test_options(
 +			user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
 +#endif /* WITH_WSREP */
 +	     || sql_command == SQLCOM_DROP_INDEX)
 +	    && num_write_row >= 10000) {
 +#ifdef WITH_WSREP
 +		if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) {
 +			WSREP_DEBUG("forced trx split for LOAD: %s", 
 +				    wsrep_thd_query(user_thd));
  		}
 +#endif /* WITH_WSREP */
 +		/* ALTER TABLE is COMMITted at every 10000 copied rows.
 +		The IX table lock for the original table has to be re-issued.
 +		As this method will be called on a temporary table where the
 +		contents of the original table is being copied to, it is
 +		a bit tricky to determine the source table.  The cursor
 +		position in the source table need not be adjusted after the
 +		intermediate COMMIT, since writes by other transactions are
 +		being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
  
 -		dict_table_autoinc_unlock(prebuilt->table);
 -	}
 +		dict_table_t*	src_table;
 +		enum lock_mode	mode;
  
 -	info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
 +		num_write_row = 0;
  
 -	DBUG_RETURN(0);
 -}
 +		/* Commit the transaction.  This will release the table
 +		locks, so they have to be acquired again. */
  
 -UNIV_INTERN
 -handler*
 -ha_innobase::clone(
 -/*===============*/
 -	const char*	name,		/*!< in: table name */
 -	MEM_ROOT*	mem_root)	/*!< in: memory context */
 -{
 -	ha_innobase* new_handler;
 +		/* Altering an InnoDB table */
 +		/* Get the source table. */
 +		src_table = lock_get_src_table(
 +				prebuilt->trx, prebuilt->table, &mode);
 +		if (!src_table) {
 +no_commit:
 +			/* Unknown situation: do not commit */
 +			/*
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				"  InnoDB: ALTER TABLE is holding lock"
 +				" on %lu tables!\n",
 +				prebuilt->trx->mysql_n_tables_locked);
 +			*/
 +			;
 +		} else if (src_table == prebuilt->table) {
 +#ifdef WITH_WSREP
 +			if (wsrep_on(user_thd)                              &&
 +			    wsrep_load_data_splitting                       &&
 +			    sql_command == SQLCOM_LOAD                      &&
 +			    !thd_test_options(user_thd,
 +			                      OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
 +			{
 +				switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
 +				{
 +				case WSREP_TRX_OK:
 +				  break;
 +				case WSREP_TRX_SIZE_EXCEEDED:
 +				case WSREP_TRX_CERT_FAIL:
 +				case WSREP_TRX_ERROR:
 +				  DBUG_RETURN(1);
 +				}
  
 -	DBUG_ENTER("ha_innobase::clone");
 +				if (binlog_hton->commit(binlog_hton, user_thd, 1))
 +					DBUG_RETURN(1);
 +				wsrep_post_commit(user_thd, TRUE);
 +			}
 +#endif /* WITH_WSREP */
 +			/* Source table is not in InnoDB format:
 +			no need to re-acquire locks on it. */
  
 -	new_handler = static_cast<ha_innobase*>(handler::clone(name,
 -							       mem_root));
 -	if (new_handler) {
 -		new_handler->prebuilt->select_lock_type
 -			= prebuilt->select_lock_type;
 +			/* Altering to InnoDB format */
 +			innobase_commit(ht, user_thd, 1);
 +			/* Note that this transaction is still active. */
 +			trx_register_for_2pc(prebuilt->trx);
 +			/* We will need an IX lock on the destination table. */
 +			prebuilt->sql_stat_start = TRUE;
 +		} else {
 +#ifdef WITH_WSREP
 +			if (wsrep_on(user_thd)                              &&
 +			    wsrep_load_data_splitting                       &&
 +			    sql_command == SQLCOM_LOAD                      &&
 +			    !thd_test_options(user_thd,
 +			                      OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
 +			{
 +				switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1))
 +				{
 +				case WSREP_TRX_OK:
 +				  break;
 +				case WSREP_TRX_SIZE_EXCEEDED:
 +				case WSREP_TRX_CERT_FAIL:
 +				case WSREP_TRX_ERROR:
 +				  DBUG_RETURN(1);
 +				}
 +
 +				if (binlog_hton->commit(binlog_hton, user_thd, 1))
 +					DBUG_RETURN(1);
 +				wsrep_post_commit(user_thd, TRUE);
 +			}
 +#endif /* WITH_WSREP */
 +			/* Ensure that there are no other table locks than
 +			LOCK_IX and LOCK_AUTO_INC on the destination table. */
 +
 +			if (!lock_is_table_exclusive(prebuilt->table,
 +							prebuilt->trx)) {
 +				goto no_commit;
 +			}
 +
 +			/* Commit the transaction.  This will release the table
 +			locks, so they have to be acquired again. */
 +			innobase_commit(ht, user_thd, 1);
 +			/* Note that this transaction is still active. */
 +			trx_register_for_2pc(prebuilt->trx);
 +			/* Re-acquire the table lock on the source table. */
 +			row_lock_table_for_mysql(prebuilt, src_table, mode);
 +			/* We will need an IX lock on the destination table. */
 +			prebuilt->sql_stat_start = TRUE;
 +		}
  	}
  
 -	DBUG_RETURN(new_handler);
 -}
 +	num_write_row++;
  
 -UNIV_INTERN
 -uint
 -ha_innobase::max_supported_key_part_length() const
 -{
 -	/* A table format specific index column length check will be performed
 -	at ha_innobase::add_index() and row_create_index_for_mysql() */
 -	return(innobase_large_prefix
 -		? REC_VERSION_56_MAX_INDEX_COL_LEN
 -		: REC_ANTELOPE_MAX_INDEX_COL_LEN - 1);
 -}
 +	/* This is the case where the table has an auto-increment column */
 +	if (table->next_number_field && record == table->record[0]) {
  
 -/******************************************************************//**
 -Closes a handle to an InnoDB table.
 -@return	0 */
 -UNIV_INTERN
 -int
 -ha_innobase::close(void)
 -/*====================*/
 -{
 -	THD*	thd;
 +		/* Reset the error code before calling
 +		innobase_get_auto_increment(). */
 +		prebuilt->autoinc_error = DB_SUCCESS;
 +
 +#ifdef WITH_WSREP
 +		auto_inc_inserted= (table->next_number_field->val_int() == 0);
 +#endif
 +
 +		if ((error_result = update_auto_increment())) {
 +			/* We don't want to mask autoinc overflow errors. */
  
 -	DBUG_ENTER("ha_innobase::close");
 +			/* Handle the case where the AUTOINC sub-system
 +			failed during initialization. */
 +			if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
 +				error_result = ER_AUTOINC_READ_FAILED;
 +				/* Set the error message to report too. */
 +				my_error(ER_AUTOINC_READ_FAILED, MYF(0));
 +				goto func_exit;
 +			} else if (prebuilt->autoinc_error != DB_SUCCESS) {
 +				error = prebuilt->autoinc_error;
 +				goto report_error;
 +			}
  
 -	thd = ha_thd();
 -	if (thd != NULL) {
 -		innobase_release_temporary_latches(ht, thd);
 +			/* MySQL errors are passed straight back, except for
 +                           ER_AUTOINC_READ_FAILED. This can only happen
 +                           for values out of range.
 +                         */
 +			goto func_exit;
 +		}
 +
 +		auto_inc_used = TRUE;
  	}
  
 -	row_prebuilt_free(prebuilt, FALSE);
 +	if (prebuilt->mysql_template == NULL
 +	    || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
  
 -	if (upd_buf != NULL) {
 -		ut_ad(upd_buf_size != 0);
 -		my_free(upd_buf);
 -		upd_buf = NULL;
 -		upd_buf_size = 0;
 +		/* Build the template used in converting quickly between
 +		the two database formats */
 +
 +		build_template(true);
  	}
  
 -	free_share(share);
 +	innobase_srv_conc_enter_innodb(prebuilt->trx);
  
 -	/* Tell InnoDB server that there might be work for
 -	utility threads: */
 +	error = row_insert_for_mysql((byte*) record, prebuilt);
 +	DEBUG_SYNC(user_thd, "ib_after_row_insert");
  
 -	srv_active_wake_master_thread();
 +	/* Handle duplicate key errors */
 +	if (auto_inc_used) {
 +		ulonglong	auto_inc;
 +		ulonglong	col_max_value;
  
 -	DBUG_RETURN(0);
 -}
 +		/* Note the number of rows processed for this statement, used
 +		by get_auto_increment() to determine the number of AUTO-INC
 +		values to reserve. This is only useful for a multi-value INSERT
 +		and is a statement level counter.*/
 +		if (trx->n_autoinc_rows > 0) {
 +			--trx->n_autoinc_rows;
 +		}
  
 -/* The following accessor functions should really be inside MySQL code! */
 +		/* We need the upper limit of the col type to check for
 +		whether we update the table autoinc counter or not. */
 +		col_max_value = innobase_get_int_col_max_value(
 +			table->next_number_field);
  
 -/**************************************************************//**
 -Gets field offset for a field in a table.
 -@return	offset */
 -static inline
 -uint
 -get_field_offset(
 -/*=============*/
 -	const TABLE*	table,	/*!< in: MySQL table object */
 -	const Field*	field)	/*!< in: MySQL field object */
 -{
 -	return((uint) (field->ptr - table->record[0]));
 -}
 +		/* Get the value that MySQL attempted to store in the table.*/
 +		auto_inc = table->next_number_field->val_uint();
  
 -/**************************************************************//**
 -Checks if a field in a record is SQL NULL. Uses the record format
 -information in table to track the null bit in record.
 -@return	1 if NULL, 0 otherwise */
 -static inline
 -uint
 -field_in_record_is_null(
 -/*====================*/
 -	TABLE*	table,	/*!< in: MySQL table object */
 -	Field*	field,	/*!< in: MySQL field object */
 -	char*	record)	/*!< in: a row in MySQL format */
 -{
 -	int	null_offset;
 +		switch (error) {
 +		case DB_DUPLICATE_KEY:
  
 -	if (!field->null_ptr) {
 +			/* A REPLACE command and LOAD DATA INFILE REPLACE
 +			handle a duplicate key error themselves, but we
 +			must update the autoinc counter if we are performing
 +			those statements. */
  
 -		return(0);
 -	}
 +			switch (sql_command) {
 +			case SQLCOM_LOAD:
 +				if (trx->duplicates) {
  
 -	null_offset = (uint) ((char*) field->null_ptr
 -					- (char*) table->record[0]);
 +					goto set_max_autoinc;
 +				}
 +				break;
  
 -	if (record[null_offset] & field->null_bit) {
 +			case SQLCOM_REPLACE:
 +			case SQLCOM_INSERT_SELECT:
 +			case SQLCOM_REPLACE_SELECT:
 +				goto set_max_autoinc;
  
 -		return(1);
 -	}
 +#ifdef WITH_WSREP
 +			/* workaround for LP bug #355000, retrying the insert */
 +			case SQLCOM_INSERT:
  
 -	return(0);
 -}
 +				WSREP_DEBUG("DUPKEY error for autoinc\n"
 +				      "THD %ld, value %llu, off %llu inc %llu",
 +				      wsrep_thd_thread_id(current_thd),
 +				      auto_inc,
 +				      prebuilt->autoinc_offset,
 +				      prebuilt->autoinc_increment);
  
 -/*************************************************************//**
 -InnoDB uses this function to compare two data fields for which the data type
 -is such that we must use MySQL code to compare them. NOTE that the prototype
 -of this function is in rem0cmp.c in InnoDB source code! If you change this
 -function, remember to update the prototype there!
 -@return	1, 0, -1, if a is greater, equal, less than b, respectively */
 -extern "C" UNIV_INTERN
 -int
 -innobase_mysql_cmp(
 -/*===============*/
 -	int		mysql_type,	/*!< in: MySQL type */
 -	uint		charset_number,	/*!< in: number of the charset */
 -	const unsigned char* a,		/*!< in: data field */
 -	unsigned int	a_length,	/*!< in: data field length,
 -					not UNIV_SQL_NULL */
 -	const unsigned char* b,		/*!< in: data field */
 -	unsigned int	b_length)	/*!< in: data field length,
 -					not UNIV_SQL_NULL */
 -{
 -	CHARSET_INFO*		charset;
 -	enum_field_types	mysql_tp;
 -	int			ret;
 +                               if (wsrep_on(current_thd)                     &&
 +                                   auto_inc_inserted                         &&
 +                                   wsrep_drupal_282555_workaround            &&
 +                                   wsrep_thd_retry_counter(current_thd) == 0 &&
 +				    !thd_test_options(current_thd, 
 +						      OPTION_NOT_AUTOCOMMIT | 
 +						      OPTION_BEGIN)) {
 +					WSREP_DEBUG(
 +					    "retrying insert: %s",
 +					    (*wsrep_thd_query(current_thd)) ? 
 +						wsrep_thd_query(current_thd) : 
 +						(char *)"void");
 +					error= DB_SUCCESS;
 +					wsrep_thd_set_conflict_state(
 +						current_thd, MUST_ABORT);
 +                                        innobase_srv_conc_exit_innodb(prebuilt->trx);
 +                                        /* jump straight to func exit over
 +                                         * later wsrep hooks */
 +                                        goto func_exit;
 +				}
 +                                break;
 +#endif /* WITH_WSREP */
  
 -	DBUG_ASSERT(a_length != UNIV_SQL_NULL);
 -	DBUG_ASSERT(b_length != UNIV_SQL_NULL);
 +			default:
 +				break;
 +			}
  
 -	mysql_tp = (enum_field_types) mysql_type;
 +			break;
  
 -	switch (mysql_tp) {
 +		case DB_SUCCESS:
 +			/* If the actual value inserted is greater than
 +			the upper limit of the interval, then we try and
 +			update the table upper limit. Note: last_value
 +			will be 0 if get_auto_increment() was not called.*/
  
 -	case MYSQL_TYPE_BIT:
 -	case MYSQL_TYPE_STRING:
 -	case MYSQL_TYPE_VAR_STRING:
 -	case MYSQL_TYPE_TINY_BLOB:
 -	case MYSQL_TYPE_MEDIUM_BLOB:
 -	case MYSQL_TYPE_BLOB:
 -	case MYSQL_TYPE_LONG_BLOB:
 -	case MYSQL_TYPE_VARCHAR:
 -		/* Use the charset number to pick the right charset struct for
 -		the comparison. Since the MySQL function get_charset may be
 -		slow before Bar removes the mutex operation there, we first
 -		look at 2 common charsets directly. */
 +			if (auto_inc >= prebuilt->autoinc_last_value) {
 +set_max_autoinc:
 +				/* This should filter out the negative
 +				values set explicitly by the user. */
 +				if (auto_inc <= col_max_value) {
 +					ut_a(prebuilt->autoinc_increment > 0);
  
 -		if (charset_number == default_charset_info->number) {
 -			charset = default_charset_info;
 -		} else if (charset_number == my_charset_latin1.number) {
 -			charset = &my_charset_latin1;
 -		} else {
 -			charset = get_charset(charset_number, MYF(MY_WME));
 +					ulonglong	offset;
 +					ulonglong	increment;
 +					dberr_t		err;
  
 -			if (charset == NULL) {
 -			  sql_print_error("InnoDB needs charset %lu for doing "
 -					  "a comparison, but MySQL cannot "
 -					  "find that charset.",
 -					  (ulong) charset_number);
 -				ut_a(0);
 +					offset = prebuilt->autoinc_offset;
 +					increment = prebuilt->autoinc_increment;
 +
 +					auto_inc = innobase_next_autoinc(
 +						auto_inc,
 +						1, increment, offset,
 +						col_max_value);
 +
 +					err = innobase_set_max_autoinc(
 +						auto_inc);
 +
 +					if (err != DB_SUCCESS) {
 +						error = err;
 +					}
 +				}
  			}
 +			break;
 +		default:
 +			break;
  		}
 +	}
  
 -		/* Starting from 4.1.3, we use strnncollsp() in comparisons of
 -		non-latin1_swedish_ci strings. NOTE that the collation order
 -		changes then: 'b\0\0...' is ordered BEFORE 'b  ...'. Users
 -		having indexes on such data need to rebuild their tables! */
 +	innobase_srv_conc_exit_innodb(prebuilt->trx);
  
 -		ret = charset->coll->strnncollsp(charset,
 -				  a, a_length,
 -						 b, b_length, 0);
 -		if (ret < 0) {
 -			return(-1);
 -		} else if (ret > 0) {
 -			return(1);
 -		} else {
 -			return(0);
 -		}
 -	default:
 -		ut_error;
 +report_error:
 +	if (error == DB_TABLESPACE_DELETED) {
 +		ib_senderrf(
 +			trx->mysql_thd, IB_LOG_LEVEL_ERROR,
 +			ER_TABLESPACE_DISCARDED,
 +			table->s->table_name.str);
  	}
  
 -	return(0);
 -}
 +	error_result = convert_error_code_to_mysql(error,
 +						   prebuilt->table->flags,
 +						   user_thd);
 +
  #ifdef WITH_WSREP
- 	if (!error_result                                &&
- 	    wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
- 	    wsrep_on(user_thd)                           &&
- 	    !wsrep_consistency_check(user_thd)           &&
- 	    !wsrep_thd_skip_append_keys(user_thd))
- 	{
- 		if (wsrep_append_keys(user_thd, false, record, NULL))
- 		{
 -extern "C" UNIV_INTERN
 -int
 -wsrep_innobase_mysql_sort(
 -/*===============*/
 -					/* out: str contains sort string */
 -	int		mysql_type,	/* in: MySQL type */
 -	uint		charset_number,	/* in: number of the charset */
 -	unsigned char*	str,		/* in: data field */
 -	unsigned int	str_length,	/* in: data field length,
 -					not UNIV_SQL_NULL */
 -	unsigned int	buf_length)	/* in: total str buffer length */
++	if (!error_result
++	    && wsrep_on(user_thd)
++ 	    && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE
++	    && !wsrep_consistency_check(user_thd)
++	    && !wsrep_thd_skip_append_keys(user_thd)) {
++		if (wsrep_append_keys(user_thd, false, record, NULL)) {
 +			DBUG_PRINT("wsrep", ("row key failed"));
 +			error_result = HA_ERR_INTERNAL_ERROR;
 +			goto wsrep_error;
 +		}
 +	}
 +wsrep_error:
 +#endif /* WITH_WSREP */
 +
 +	if (error_result == HA_FTS_INVALID_DOCID) {
 +		my_error(HA_FTS_INVALID_DOCID, MYF(0));
 +	}
 +
 +func_exit:
 +	innobase_active_small();
 +
 +	if (share->ib_table != prebuilt->table) {
 +		fprintf(stderr,
 +			"InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
 +			share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
 +	}
 +
 +	if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
 +
 +	DBUG_RETURN(error_result);
 +}
  
 +/**********************************************************************//**
 +Checks which fields have changed in a row and stores information
 +of them to an update vector.
 +@return	DB_SUCCESS or error code */
 +static
 +dberr_t
 +calc_row_difference(
 +/*================*/
 +	upd_t*		uvect,		/*!< in/out: update vector */
 +	uchar*		old_row,	/*!< in: old row in MySQL format */
 +	uchar*		new_row,	/*!< in: new row in MySQL format */
 +	TABLE*		table,		/*!< in: table in MySQL data
 +					dictionary */
 +	uchar*		upd_buff,	/*!< in: buffer to use */
 +	ulint		buff_len,	/*!< in: buffer length */
 +	row_prebuilt_t*	prebuilt,	/*!< in: InnoDB prebuilt struct */
 +	THD*		thd)		/*!< in: user thread */
  {
 -	CHARSET_INFO*		charset;
 -	enum_field_types	mysql_tp;
 -	int ret_length =	str_length;
 +	uchar*		original_upd_buff = upd_buff;
 +	Field*		field;
 +	enum_field_types field_mysql_type;
 +	uint		n_fields;
 +	ulint		o_len;
 +	ulint		n_len;
 +	ulint		col_pack_len;
 +	const byte*	new_mysql_row_col;
 +	const byte*	o_ptr;
 +	const byte*	n_ptr;
 +	byte*		buf;
 +	upd_field_t*	ufield;
 +	ulint		col_type;
 +	ulint		n_changed = 0;
 +	dfield_t	dfield;
 +	dict_index_t*	clust_index;
 +	uint		sql_idx, innodb_idx= 0;
 +	ibool		changes_fts_column = FALSE;
 +	ibool		changes_fts_doc_col = FALSE;
 +	trx_t*          trx = thd_to_trx(thd);
 +	doc_id_t	doc_id = FTS_NULL_DOC_ID;
  
 -	DBUG_ASSERT(str_length != UNIV_SQL_NULL);
 +	ut_ad(!srv_read_only_mode);
  
 -	mysql_tp = (enum_field_types) mysql_type;
 +	n_fields = table->s->fields;
 +	clust_index = dict_table_get_first_index(prebuilt->table);
  
 -	switch (mysql_tp) {
 +	/* We use upd_buff to convert changed fields */
 +	buf = (byte*) upd_buff;
  
 -	case MYSQL_TYPE_BIT:
 -	case MYSQL_TYPE_STRING:
 -	case MYSQL_TYPE_VAR_STRING:
 -	case MYSQL_TYPE_TINY_BLOB:
 -	case MYSQL_TYPE_MEDIUM_BLOB:
 -	case MYSQL_TYPE_BLOB:
 -	case MYSQL_TYPE_LONG_BLOB:
 -	case MYSQL_TYPE_VARCHAR:
 -	{
 -		uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN];
 -		uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
 +	for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
 +		field = table->field[sql_idx];
 +		if (!field->stored_in_db)
 +		  continue;
  
 -		/* Use the charset number to pick the right charset struct for
 -		the comparison. Since the MySQL function get_charset may be
 -		slow before Bar removes the mutex operation there, we first
 -		look at 2 common charsets directly. */
 +		o_ptr = (const byte*) old_row + get_field_offset(table, field);
 +		n_ptr = (const byte*) new_row + get_field_offset(table, field);
  
 -		if (charset_number == default_charset_info->number) {
 -			charset = default_charset_info;
 -		} else if (charset_number == my_charset_latin1.number) {
 -			charset = &my_charset_latin1;
 -		} else {
 -			charset = get_charset(charset_number, MYF(MY_WME));
 +		/* Use new_mysql_row_col and col_pack_len to save the values */
  
 -			if (charset == NULL) {
 -			  sql_print_error("InnoDB needs charset %lu for doing "
 -					  "a comparison, but MySQL cannot "
 -					  "find that charset.",
 -					  (ulong) charset_number);
 -				ut_a(0);
 -			}
 -		}
 +		new_mysql_row_col = n_ptr;
 +		col_pack_len = field->pack_length();
  
 -		ut_a(str_length <= tmp_length);
 -		memcpy(tmp_str, str, str_length);
 +		o_len = col_pack_len;
 +		n_len = col_pack_len;
  
 -		if (wsrep_protocol_version < 3) {
 -			tmp_length = charset->coll->strnxfrm(
 -				charset, str, str_length,
 -				tmp_str, str_length);
 -			DBUG_ASSERT(tmp_length <= str_length);
 -		} else {
 -			/* strnxfrm will expand the destination string,
 -			   protocols < 3 truncated the sorted sring
 -			   protocols > 3 gets full sorted sring
 -			*/
 -		  	/* 5.5 strnxfrm pads the tail with spaces and
 -			   always returns the full destination buffer lenght
 -			   we cannot know how many characters were converted
 -			   using 2 * str length here as best guess
 -			*/
 -			uint dst_length = (str_length * 2 < tmp_length) ? 
 -				(str_length * 2) : tmp_length; 
 -			tmp_length = charset->coll->strnxfrm(
 -				charset, str, dst_length,
 -				tmp_str, str_length);
 -			DBUG_ASSERT(tmp_length <= buf_length);
 -			ret_length = tmp_length;
 -		}
 - 
 -		break;
 -	}
 -	case MYSQL_TYPE_DECIMAL :
 -	case MYSQL_TYPE_TINY :
 -	case MYSQL_TYPE_SHORT :
 -	case MYSQL_TYPE_LONG :
 -	case MYSQL_TYPE_FLOAT :
 -	case MYSQL_TYPE_DOUBLE :
 -	case MYSQL_TYPE_NULL :
 -	case MYSQL_TYPE_TIMESTAMP :
 -	case MYSQL_TYPE_LONGLONG :
 -	case MYSQL_TYPE_INT24 :
 -	case MYSQL_TYPE_DATE :
 -	case MYSQL_TYPE_TIME :
 -	case MYSQL_TYPE_DATETIME :
 -	case MYSQL_TYPE_YEAR :
 -	case MYSQL_TYPE_NEWDATE :
 -	case MYSQL_TYPE_NEWDECIMAL :
 -	case MYSQL_TYPE_ENUM :
 -	case MYSQL_TYPE_SET :
 -	case MYSQL_TYPE_GEOMETRY :
 -		break;
 -	default:
 -		break;
 -	}
 +		/* We use o_ptr and n_ptr to dig up the actual data for
 +		comparison. */
  
 -	return ret_length;
 -}
 -#endif // WITH_WSREP
 -/**************************************************************//**
 -Converts a MySQL type to an InnoDB type. Note that this function returns
 -the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
 -VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
 -@return	DATA_BINARY, DATA_VARCHAR, ... */
 -extern "C" UNIV_INTERN
 -ulint
 -get_innobase_type_from_mysql_type(
 -/*==============================*/
 -	ulint*		unsigned_flag,	/*!< out: DATA_UNSIGNED if an
 -					'unsigned type';
 -					at least ENUM and SET,
 -					and unsigned integer
 -					types are 'unsigned types' */
 -	const void*	f)		/*!< in: MySQL Field */
 -{
 -	const class Field* field = reinterpret_cast<const class Field*>(f);
 +		field_mysql_type = field->type();
  
 -	/* The following asserts try to check that the MySQL type code fits in
 -	8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
 -	the type */
 +		col_type = prebuilt->table->cols[innodb_idx].mtype;
  
 -	DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256);
 -	DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256);
 -	DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256);
 -	DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256);
 -	DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
 +		switch (col_type) {
  
 -	if (field->flags & UNSIGNED_FLAG) {
 +		case DATA_BLOB:
 +			/* Do not compress blob column while comparing*/
 +			o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
 +			n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
  
 -		*unsigned_flag = DATA_UNSIGNED;
 -	} else {
 -		*unsigned_flag = 0;
 -	}
 +			break;
  
 -	if (field->real_type() == MYSQL_TYPE_ENUM
 -		|| field->real_type() == MYSQL_TYPE_SET) {
 +		case DATA_VARCHAR:
 +		case DATA_BINARY:
 +		case DATA_VARMYSQL:
 +			if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
 +				/* This is a >= 5.0.3 type true VARCHAR where
 +				the real payload data length is stored in
 +				1 or 2 bytes */
  
 -		/* MySQL has field->type() a string type for these, but the
 -		data is actually internally stored as an unsigned integer
 -		code! */
 +				o_ptr = row_mysql_read_true_varchar(
 +					&o_len, o_ptr,
 +					(ulint)
 +					(((Field_varstring*) field)->length_bytes));
  
 -		*unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned
 -						flag set to zero, even though
 -						internally this is an unsigned
 -						integer type */
 -		return(DATA_INT);
 -	}
 +				n_ptr = row_mysql_read_true_varchar(
 +					&n_len, n_ptr,
 +					(ulint)
 +					(((Field_varstring*) field)->length_bytes));
 +			}
  
 -	switch (field->type()) {
 -		/* NOTE that we only allow string types in DATA_MYSQL and
 -		DATA_VARMYSQL */
 -	case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
 -	case MYSQL_TYPE_VARCHAR:    /* new >= 5.0.3 true VARCHAR */
 -		if (field->binary()) {
 -			return(DATA_BINARY);
 -		} else if (strcmp(
 -				   field->charset()->name,
 -				   "latin1_swedish_ci") == 0) {
 -			return(DATA_VARCHAR);
 -		} else {
 -			return(DATA_VARMYSQL);
 +			break;
 +		default:
 +			;
  		}
 -	case MYSQL_TYPE_BIT:
 -	case MYSQL_TYPE_STRING: if (field->binary()) {
  
 -			return(DATA_FIXBINARY);
 -		} else if (strcmp(
 -				   field->charset()->name,
 -				   "latin1_swedish_ci") == 0) {
 -			return(DATA_CHAR);
 -		} else {
 -			return(DATA_MYSQL);
 +		if (field_mysql_type == MYSQL_TYPE_LONGLONG
 +		    && prebuilt->table->fts
 +		    && innobase_strcasecmp(
 +			field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
 +			doc_id = (doc_id_t) mach_read_from_n_little_endian(
 +				n_ptr, 8);
 +			if (doc_id == 0) {
 +				return(DB_FTS_INVALID_DOCID);
 +			}
  		}
 -	case MYSQL_TYPE_NEWDECIMAL:
 -		return(DATA_FIXBINARY);
 -	case MYSQL_TYPE_LONG:
 -	case MYSQL_TYPE_LONGLONG:
 -	case MYSQL_TYPE_TINY:
 -	case MYSQL_TYPE_SHORT:
 -	case MYSQL_TYPE_INT24:
 -	case MYSQL_TYPE_DATE:
 -	case MYSQL_TYPE_YEAR:
 -	case MYSQL_TYPE_NEWDATE:
 -           return(DATA_INT);
 -
 -	case MYSQL_TYPE_TIME:
 -	case MYSQL_TYPE_DATETIME:
 -	case MYSQL_TYPE_TIMESTAMP:
 -          /*
 -            XtraDB should ideally just check field->keytype() and never
 -            field->type().  The following check is here to only
 -            change the new hires datetime/timestamp/time fields to
 -            use DATA_FIXBINARY.  We can't convert this function to
 -            just test for field->keytype() as then the check if a
 -            table is compatible will fail for old tables.
 -          */
 -           if (field->key_type() == HA_KEYTYPE_BINARY)
 -             return(DATA_FIXBINARY);
 -           return(DATA_INT);
 -	case MYSQL_TYPE_FLOAT:
 -		return(DATA_FLOAT);
 -	case MYSQL_TYPE_DOUBLE:
 -		return(DATA_DOUBLE);
 -	case MYSQL_TYPE_DECIMAL:
 -		return(DATA_DECIMAL);
 -	case MYSQL_TYPE_GEOMETRY:
 -	case MYSQL_TYPE_TINY_BLOB:
 -	case MYSQL_TYPE_MEDIUM_BLOB:
 -	case MYSQL_TYPE_BLOB:
 -	case MYSQL_TYPE_LONG_BLOB:
 -		return(DATA_BLOB);
 -	case MYSQL_TYPE_NULL:
 -		return(DATA_FIXBINARY);
 -	default:
 -		ut_error;
 -	}
 -
 -	return(0);
 -}
 -
 -/*******************************************************************//**
 -Writes an unsigned integer value < 64k to 2 bytes, in the little-endian
 -storage format. */
 -static inline
 -void
 -innobase_write_to_2_little_endian(
 -/*==============================*/
 -	byte*	buf,	/*!< in: where to store */
 -	ulint	val)	/*!< in: value to write, must be < 64k */
 -{
 -	ut_a(val < 256 * 256);
  
 -	buf[0] = (byte)(val & 0xFF);
 -	buf[1] = (byte)(val / 256);
 -}
  
 -/*******************************************************************//**
 -Reads an unsigned integer value < 64k from 2 bytes, in the little-endian
 -storage format.
 -@return	value */
 -static inline
 -uint
 -innobase_read_from_2_little_endian(
 -/*===============================*/
 -	const uchar*	buf)	/*!< in: from where to read */
 -{
 -	return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])));
 -}
 +		if (field->real_maybe_null()) {
 +			if (field->is_null_in_record(old_row)) {
 +				o_len = UNIV_SQL_NULL;
 +			}
  
 -/*******************************************************************//**
 -Stores a key value for a row to a buffer.
 -@return	key value length as stored in buff */
 -#ifdef WITH_WSREP
 -UNIV_INTERN
 -uint
 -wsrep_store_key_val_for_row(
 -/*===============================*/
 -	TABLE*		table,
 -	uint		keynr,	/*!< in: key number */
 -	char*		buff,	/*!< in/out: buffer for the key value (in MySQL
 -				format) */
 -	uint		buff_len,/*!< in: buffer length */
 -	const uchar*	record,
 -	ibool*          key_is_null)/*!< out: full key was null */
 -{
 -	KEY*		key_info	= table->key_info + keynr;
 -	KEY_PART_INFO*	key_part	= key_info->key_part;
 -	KEY_PART_INFO*	end		= key_part + key_info->key_parts;
 -	char*		buff_start	= buff;
 -	enum_field_types mysql_type;
 -	Field*		field;
 -	
 -	DBUG_ENTER("store_key_val_for_row");
 +			if (field->is_null_in_record(new_row)) {
 +				n_len = UNIV_SQL_NULL;
 +			}
 +		}
  
 -	bzero(buff, buff_len);
 -	*key_is_null = TRUE;
 +		if (o_len != n_len || (o_len != 0 && o_len != UNIV_SQL_NULL
 +				       && 0 != memcmp(o_ptr, n_ptr, o_len))) {
 +			/* The field has changed */
  
 -	for (; key_part != end; key_part++) {
 -		uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
 -		ibool part_is_null = FALSE;
 +			ufield = uvect->fields + n_changed;
 +			UNIV_MEM_INVALID(ufield, sizeof *ufield);
  
 -		if (key_part->null_bit) {
 -			if (record[key_part->null_offset] & 
 -			    key_part->null_bit) {
 -				*buff = 1;
 -				part_is_null = TRUE;
 +			/* Let us use a dummy dfield to make the conversion
 +			from the MySQL column format to the InnoDB format */
 +
 +			if (n_len != UNIV_SQL_NULL) {
 +				dict_col_copy_type(prebuilt->table->cols + innodb_idx,
 +						   dfield_get_type(&dfield));
 +
 +				buf = row_mysql_store_col_in_innobase_format(
 +					&dfield,
 +					(byte*) buf,
 +					TRUE,
 +					new_mysql_row_col,
 +					col_pack_len,
 +					dict_table_is_comp(prebuilt->table));
 +				dfield_copy(&ufield->new_val, &dfield);
  			} else {
 -				*buff = 0;
 +				dfield_set_null(&ufield->new_val);
  			}
 -			buff++;
 -		}
 -		if (!part_is_null)  *key_is_null = FALSE;
  
 -		field = key_part->field;
 -		mysql_type = field->type();
 +			ufield->exp = NULL;
 +			ufield->orig_len = 0;
 +			ufield->field_no = dict_col_get_clust_pos(
 +				&prebuilt->table->cols[innodb_idx], clust_index);
 +			n_changed++;
  
 -		if (mysql_type == MYSQL_TYPE_VARCHAR) {
 -						/* >= 5.0.3 true VARCHAR */
 -			ulint		lenlen;
 -			ulint		len;
 -			const byte*	data;
 -			ulint		key_len;
 -			ulint		true_len;
 -			CHARSET_INFO*	cs;
 -			int		error=0;
 +			/* If an FTS indexed column was changed by this
 +			UPDATE then we need to inform the FTS sub-system.
  
 -			key_len = key_part->length;
 +			NOTE: Currently we re-index all FTS indexed columns
 +			even if only a subset of the FTS indexed columns
 +			have been updated. That is the reason we are
 +			checking only once here. Later we will need to
 +			note which columns have been updated and do
 +			selective processing. */
 +			if (prebuilt->table->fts != NULL) {
 +				ulint           offset;
 +				dict_table_t*   innodb_table;
  
 -			if (part_is_null) {
 -				buff += key_len + 2;
 +				innodb_table = prebuilt->table;
  
 -				continue;
 +				if (!changes_fts_column) {
 +					offset = row_upd_changes_fts_column(
 +						innodb_table, ufield);
 +
 +					if (offset != ULINT_UNDEFINED) {
 +						changes_fts_column = TRUE;
 +					}
 +				}
 +
 +				if (!changes_fts_doc_col) {
 +					changes_fts_doc_col =
 +					row_upd_changes_doc_id(
 +						innodb_table, ufield);
 +				}
 +			}
 +		}
 +		if (field->stored_in_db)
 +			innodb_idx++;
 +	}
 +
 +	/* If the update changes a column with an FTS index on it, we
 +	then add an update column node with a new document id to the
 +	other changes. We piggy back our changes on the normal UPDATE
 +	to reduce processing and IO overhead. */
 +	if (!prebuilt->table->fts) {
 +			trx->fts_next_doc_id = 0;
 +	} else if (changes_fts_column || changes_fts_doc_col) {
 +		dict_table_t*   innodb_table = prebuilt->table;
 +
 +		ufield = uvect->fields + n_changed;
 +
 +		if (!DICT_TF2_FLAG_IS_SET(
 +			innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) {
 +
 +			/* If Doc ID is managed by user, and if any
 +			FTS indexed column has been updated, its corresponding
 +			Doc ID must also be updated. Otherwise, return
 +			error */
 +			if (changes_fts_column && !changes_fts_doc_col) {
 +				ut_print_timestamp(stderr);
 +				fprintf(stderr, " InnoDB: A new Doc ID"
 +					" must be supplied while updating"
 +					" FTS indexed columns.\n");
 +				return(DB_FTS_INVALID_DOCID);
  			}
 -			cs = field->charset();
  
 -			lenlen = (ulint)
 -				(((Field_varstring*)field)->length_bytes);
 +			/* Doc ID must monotonically increase */
 +			ut_ad(innodb_table->fts->cache);
 +			if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
 +				fprintf(stderr,
 +					"InnoDB: FTS Doc ID must be larger than"
 +					" " IB_ID_FMT " for table",
 +					innodb_table->fts->cache->next_doc_id
 +					- 1);
 +				ut_print_name(stderr, trx,
 +					      TRUE, innodb_table->name);
 +				putc('\n', stderr);
 +
 +				return(DB_FTS_INVALID_DOCID);
 +			} else if ((doc_id
 +				    - prebuilt->table->fts->cache->next_doc_id)
 +				   >= FTS_DOC_ID_MAX_STEP) {
 +				fprintf(stderr,
 +					"InnoDB: Doc ID " UINT64PF " is too"
 +					" big. Its difference with largest"
 +					" Doc ID used " UINT64PF " cannot"
 +					" exceed or equal to %d\n",
 +					doc_id,
 +					prebuilt->table->fts->cache->next_doc_id - 1,
 +					FTS_DOC_ID_MAX_STEP);
 +			}
  
 -			data = row_mysql_read_true_varchar(&len,
 -				(byte*) (record
 -				+ (ulint)get_field_offset(table, field)),
 -				lenlen);
  
 -			true_len = len;
 +			trx->fts_next_doc_id = doc_id;
 +		} else {
 +			/* If the Doc ID is a hidden column, it can't be
 +			changed by user */
 +			ut_ad(!changes_fts_doc_col);
  
 -			/* For multi byte character sets we need to calculate
 -			the true length of the key */
 +			/* Doc ID column is hidden, a new Doc ID will be
 +			generated by following fts_update_doc_id() call */
 +			trx->fts_next_doc_id = 0;
 +		}
  
 -			if (len > 0 && cs->mbmaxlen > 1) {
 -				true_len = (ulint) cs->cset->well_formed_len(cs,
 -						(const char *) data,
 -						(const char *) data + len,
 -                                                (uint) (key_len /
 -                                                        cs->mbmaxlen),
 -						&error);
 -			}
 +		fts_update_doc_id(
 +			innodb_table, ufield, &trx->fts_next_doc_id);
  
 -			/* In a column prefix index, we may need to truncate
 -			the stored value: */
 +		++n_changed;
 +	} else {
 +		/* We have a Doc ID column, but none of FTS indexed
 +		columns are touched, nor the Doc ID column, so set
 +		fts_next_doc_id to UINT64_UNDEFINED, which means do not
 +		update the Doc ID column */
 +		trx->fts_next_doc_id = UINT64_UNDEFINED;
 +	}
  
 -			if (true_len > key_len) {
 -				true_len = key_len;
 -			}
 +	uvect->n_fields = n_changed;
 +	uvect->info_bits = 0;
  
 -			memcpy(sorted, data, true_len);
 -			true_len = wsrep_innobase_mysql_sort(
 -				mysql_type, cs->number, sorted, true_len, 
 -				REC_VERSION_56_MAX_INDEX_COL_LEN);
 +	ut_a(buf <= (byte*) original_upd_buff + buff_len);
  
 -			if (wsrep_protocol_version > 1) {
 -				memcpy(buff, sorted, true_len);
 -                        /* Note that we always reserve the maximum possible
 -			length of the true VARCHAR in the key value, though
 -			only len first bytes after the 2 length bytes contain
 -			actual data. The rest of the space was reset to zero
 -			in the bzero() call above. */
 -                                buff += true_len;
 -                        } else {
 -                                buff += key_len;
 -                        }
 -		} else if (mysql_type == MYSQL_TYPE_TINY_BLOB
 -			|| mysql_type == MYSQL_TYPE_MEDIUM_BLOB
 -			|| mysql_type == MYSQL_TYPE_BLOB
 -			|| mysql_type == MYSQL_TYPE_LONG_BLOB
 -			/* MYSQL_TYPE_GEOMETRY data is treated
 -			as BLOB data in innodb. */
 -			|| mysql_type == MYSQL_TYPE_GEOMETRY) {
 +	return(DB_SUCCESS);
 +}
  
 -			CHARSET_INFO*	cs;
 -			ulint		key_len;
 -			ulint		true_len;
 -			int		error=0;
 -			ulint		blob_len;
 -			const byte*	blob_data;
 +#ifdef WITH_WSREP
 +static
 +int
 +wsrep_calc_row_hash(
 +/*================*/
 +	byte*		digest,		/*!< in/out: md5 sum */
 +	const uchar*	row,		/*!< in: row in MySQL format */
 +	TABLE*		table,		/*!< in: table in MySQL data
 +					dictionary */
 +	row_prebuilt_t*	prebuilt,	/*!< in: InnoDB prebuilt struct */
 +	THD*		thd)		/*!< in: user thread */
 +{
 +	Field*		field;
 +	enum_field_types field_mysql_type;
 +	uint		n_fields;
 +	ulint		len;
 +	const byte*	ptr;
 +	ulint		col_type;
 +	uint		i;
  
 -			ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
 +	void *ctx = wsrep_md5_init();
  
 -			key_len = key_part->length;
 +	n_fields = table->s->fields;
  
 -			if (part_is_null) {
 -				buff += key_len + 2;
 +	for (i = 0; i < n_fields; i++) {
 +		byte null_byte=0;
 +		byte true_byte=1;
  
 -				continue;
 -			}
 +		field = table->field[i];
  
 -			cs = field->charset();
 +		ptr = (const byte*) row + get_field_offset(table, field);
 +		len = field->pack_length();
  
 -			blob_data = row_mysql_read_blob_ref(&blob_len,
 -				(byte*) (record
 -				+ (ulint)get_field_offset(table, field)),
 -					(ulint) field->pack_length());
 +		field_mysql_type = field->type();
  
 -			true_len = blob_len;
 +		col_type = prebuilt->table->cols[i].mtype;
  
 -			ut_a(get_field_offset(table, field)
 -				== key_part->offset);
 +		switch (col_type) {
  
 -			/* For multi byte character sets we need to calculate
 -			the true length of the key */
 +		case DATA_BLOB:
 +			ptr = row_mysql_read_blob_ref(&len, ptr, len);
 +			break;
  
 -			if (blob_len > 0 && cs->mbmaxlen > 1) {
 -				true_len = (ulint) cs->cset->well_formed_len(cs,
 -						(const char *) blob_data,
 -						(const char *) blob_data
 -							+ blob_len,
 -                                                (uint) (key_len /
 -                                                        cs->mbmaxlen),
 -						&error);
 -			}
 +		case DATA_VARCHAR:
 +		case DATA_BINARY:
 +		case DATA_VARMYSQL:
 +			if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
 +				/* This is a >= 5.0.3 type true VARCHAR where
 +				the real payload data length is stored in
 +				1 or 2 bytes */
  
 -			/* All indexes on BLOB and TEXT are column prefix
 -			indexes, and we may need to truncate the data to be
 -			stored in the key value: */
 +				ptr = row_mysql_read_true_varchar(
 +					&len, ptr,
 +					(ulint)
 +					(((Field_varstring*)field)->length_bytes));
  
 -			if (true_len > key_len) {
 -				true_len = key_len;
  			}
  
 -			memcpy(sorted, blob_data, true_len);
 -			true_len = wsrep_innobase_mysql_sort(
 -				mysql_type, cs->number, sorted, true_len,
 -				REC_VERSION_56_MAX_INDEX_COL_LEN);
 +			break;
 +		default:
 +			;
 +		}
 +		/*
 +		if (field->null_ptr &&
 +		    field_in_record_is_null(table, field, (char*) row)) {
 +		*/
 +
 +		if (field->is_null_in_record(row)) {
 +			wsrep_md5_update(ctx, (char*)&null_byte, 1);
 +		} else {
 +			wsrep_md5_update(ctx, (char*)&true_byte, 1);
 +			wsrep_md5_update(ctx, (char*)ptr, len);
 +		}
 +	}
 +
 +	wsrep_compute_md5_hash((char*)digest, ctx);
 +
 +	return(0);
 +}
 +#endif /* WITH_WSREP */
 +/**********************************************************************//**
 +Updates a row given as a parameter to a new value. Note that we are given
 +whole rows, not just the fields which are updated: this incurs some
 +overhead for CPU when we check which fields are actually updated.
 +TODO: currently InnoDB does not prevent the 'Halloween problem':
 +in a searched update a single row can get updated several times
 +if its index columns are updated!
 +@return	error number or 0 */
 +UNIV_INTERN
 +int
 +ha_innobase::update_row(
 +/*====================*/
 +	const uchar*	old_row,	/*!< in: old row in MySQL format */
 +	uchar*		new_row)	/*!< in: new row in MySQL format */
 +{
 +	upd_t*		uvect;
 +	dberr_t		error;
 +	trx_t*		trx = thd_to_trx(user_thd);
  
 -			memcpy(buff, sorted, true_len);
 +	DBUG_ENTER("ha_innobase::update_row");
  
 -			/* Note that we always reserve the maximum possible
 -			length of the BLOB prefix in the key value. */
 -                        if (wsrep_protocol_version > 1) {
 -                                buff += true_len;
 -                        } else {
 -                                buff += key_len;
 -                        }
 -		} else {
 -			/* Here we handle all other data types except the
 -			true VARCHAR, BLOB and TEXT. Note that the column
 -			value we store may be also in a column prefix
 -			index. */
 +	ut_a(prebuilt->trx == trx);
  
 -			CHARSET_INFO*		cs;
 -			ulint			true_len;
 -			ulint			key_len;
 -			const uchar*		src_start;
 -			int			error=0;
 -			enum_field_types	real_type;
 +	if (high_level_read_only) {
 +		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
 +		DBUG_RETURN(HA_ERR_TABLE_READONLY);
 +	} else if (!trx_is_started(trx)) {
 +		++trx->will_lock;
 +	}
  
 -			key_len = key_part->length;
 +	if (upd_buf == NULL) {
 +		ut_ad(upd_buf_size == 0);
  
 -			if (part_is_null) {
 -				 buff += key_len;
 +		/* Create a buffer for packing the fields of a record. Why did
 +		table->stored_rec_length not work here? Obviously, because char
 +		fields when packed actually became 1 byte longer, when we also
 +		stored the string length as the first byte. */
  
 -				 continue;
 -			}
 +		upd_buf_size = table->s->stored_rec_length + table->s->max_key_length
 +			+ MAX_REF_PARTS * 3;
 +		upd_buf = (uchar*) my_malloc(upd_buf_size, MYF(MY_WME));
 +		if (upd_buf == NULL) {
 +			upd_buf_size = 0;
 +			DBUG_RETURN(HA_ERR_OUT_OF_MEM);
 +		}
 +	}
  
 -			src_start = record + key_part->offset;
 -			real_type = field->real_type();
 -			true_len = key_len;
 +	ha_statistic_increment(&SSV::ha_update_count);
  
 -			/* Character set for the field is defined only
 -			to fields whose type is string and real field
 -			type is not enum or set. For these fields check
 -			if character set is multi byte. */
 +	if (share->ib_table != prebuilt->table) {
 +		fprintf(stderr,
 +			"InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
 +			share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
 +	}
  
 -			if (real_type != MYSQL_TYPE_ENUM
 -				&& real_type != MYSQL_TYPE_SET
 -				&& ( mysql_type == MYSQL_TYPE_VAR_STRING
 -					|| mysql_type == MYSQL_TYPE_STRING)) {
 +	if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
  
 -				cs = field->charset();
 +	if (prebuilt->upd_node) {
 +		uvect = prebuilt->upd_node->update;
 +	} else {
 +		uvect = row_get_prebuilt_update_vector(prebuilt);
 +	}
  
 -				/* For multi byte character sets we need to
 -				calculate the true length of the key */
 +	/* Build an update vector from the modified fields in the rows
 +	(uses upd_buf of the handle) */
  
 -				if (key_len > 0 && cs->mbmaxlen > 1) {
 +	error = calc_row_difference(uvect, (uchar*) old_row, new_row, table,
 +				    upd_buf, upd_buf_size, prebuilt, user_thd);
  
 -					true_len = (ulint)
 -						cs->cset->well_formed_len(cs,
 -							(const char *)src_start,
 -							(const char *)src_start
 -								+ key_len,
 -                                                        (uint) (key_len /
 -                                                                cs->mbmaxlen),
 -							&error);
 -				}
 -				memcpy(sorted, src_start, true_len);
 -				true_len = wsrep_innobase_mysql_sort(
 -					mysql_type, cs->number, sorted, true_len,
 -					REC_VERSION_56_MAX_INDEX_COL_LEN);
 +	if (error != DB_SUCCESS) {
 +		goto func_exit;
 +	}
  
 -				memcpy(buff, sorted, true_len);
 -			} else {
 -				memcpy(buff, src_start, true_len);
 -			}
 -			buff += true_len;
 +	/* This is not a delete */
 +	prebuilt->upd_node->is_delete = FALSE;
  
 -			/* Pad the unused space with spaces. */
 +	ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
  
 -#ifdef REMOVED
 -			if (true_len < key_len) {
 -				ulint	pad_len = key_len - true_len;
 -				ut_a(!(pad_len % cs->mbminlen));
 +	innobase_srv_conc_enter_innodb(trx);
  
 -				cs->cset->fill(cs, buff, pad_len,
 -					       0x20 /* space */);
 -				buff += pad_len;
 -			}
 -#endif /* REMOVED */
 -		}
 -	}
 +	error = row_update_for_mysql((byte*) old_row, prebuilt);
  
 -	ut_a(buff <= buff_start + buff_len);
 +	/* We need to do some special AUTOINC handling for the following case:
  
 -	DBUG_RETURN((uint)(buff - buff_start));
 -}
 -#endif /* WITH_WSREP */
 -UNIV_INTERN
 -uint
 -ha_innobase::store_key_val_for_row(
 -/*===============================*/
 -	uint		keynr,	/*!< in: key number */
 -	char*		buff,	/*!< in/out: buffer for the key value (in MySQL
 -				format) */
 -	uint		buff_len,/*!< in: buffer length */
 -	const uchar*	record)/*!< in: row in MySQL format */
 -{
 -	KEY*		key_info	= table->key_info + keynr;
 -	KEY_PART_INFO*	key_part	= key_info->key_part;
 -	KEY_PART_INFO*	end		= key_part + key_info->key_parts;
 -	char*		buff_start	= buff;
 -	enum_field_types mysql_type;
 -	Field*		field;
 -	ibool		is_null;
 +	INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ...
  
 -	DBUG_ENTER("store_key_val_for_row");
 +	We need to use the AUTOINC counter that was actually used by
 +	MySQL in the UPDATE statement, which can be different from the
 +	value used in the INSERT statement.*/
  
 -	/* The format for storing a key field in MySQL is the following:
 +	if (error == DB_SUCCESS
 +	    && table->next_number_field
 +	    && new_row == table->record[0]
 +	    && thd_sql_command(user_thd) == SQLCOM_INSERT
 +	    && trx->duplicates)  {
  
 -	1. If the column can be NULL, then in the first byte we put 1 if the
 -	field value is NULL, 0 otherwise.
 +		ulonglong	auto_inc;
 +		ulonglong	col_max_value;
  
 -	2. If the column is of a BLOB type (it must be a column prefix field
 -	in this case), then we put the length of the data in the field to the
 -	next 2 bytes, in the little-endian format. If the field is SQL NULL,
 -	then these 2 bytes are set to 0. Note that the length of data in the
 -	field is <= column prefix length.
 +		auto_inc = table->next_number_field->val_uint();
  
 -	3. In a column prefix field, prefix_len next bytes are reserved for
 -	data. In a normal field the max field length next bytes are reserved
 -	for data. For a VARCHAR(n) the max field length is n. If the stored
 -	value is the SQL NULL then these data bytes are set to 0.
 +		/* We need the upper limit of the col type to check for
 +		whether we update the table autoinc counter or not. */
 +		col_max_value = innobase_get_int_col_max_value(
 +			table->next_number_field);
  
 -	4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that
 -	in the MySQL row format, the length is stored in 1 or 2 bytes,
 -	depending on the maximum allowed length. But in the MySQL key value
 -	format, the length always takes 2 bytes.
 +		if (auto_inc <= col_max_value && auto_inc != 0) {
  
 -	We have to zero-fill the buffer so that MySQL is able to use a
 -	simple memcmp to compare two key values to determine if they are
 -	equal. MySQL does this to compare contents of two 'ref' values. */
 +			ulonglong	offset;
 +			ulonglong	increment;
  
 -	bzero(buff, buff_len);
 +			offset = prebuilt->autoinc_offset;
 +			increment = prebuilt->autoinc_increment;
  
 -	for (; key_part != end; key_part++) {
 -		is_null = FALSE;
 +			auto_inc = innobase_next_autoinc(
 +				auto_inc, 1, increment, offset, col_max_value);
  
 -		if (key_part->null_bit) {
 -			if (record[key_part->null_offset]
 -						& key_part->null_bit) {
 -				*buff = 1;
 -				is_null = TRUE;
 -			} else {
 -				*buff = 0;
 -			}
 -			buff++;
 +			error = innobase_set_max_autoinc(auto_inc);
  		}
 +	}
  
 -		field = key_part->field;
 -		mysql_type = field->type();
 +	innobase_srv_conc_exit_innodb(trx);
  
 -		if (mysql_type == MYSQL_TYPE_VARCHAR) {
 -						/* >= 5.0.3 true VARCHAR */
 -			ulint		lenlen;
 -			ulint		len;
 -			const byte*	data;
 -			ulint		key_len;
 -			ulint		true_len;
 -			CHARSET_INFO*	cs;
 -			int		error=0;
 +func_exit:
 +	int err = convert_error_code_to_mysql(error,
 +					    prebuilt->table->flags, user_thd);
  
 -			key_len = key_part->length;
 +	/* If success and no columns were updated. */
 +	if (err == 0 && uvect->n_fields == 0) {
  
 -			if (is_null) {
 -				buff += key_len + 2;
 +		/* This is the same as success, but instructs
 +		MySQL that the row is not really updated and it
 +		should not increase the count of updated rows.
 +		This is a fix for http://bugs.mysql.com/29157 */
 +		err = HA_ERR_RECORD_IS_THE_SAME;
 +	} else if (err == HA_FTS_INVALID_DOCID) {
 +		my_error(HA_FTS_INVALID_DOCID, MYF(0));
 +	}
  
 -				continue;
 -			}
 -			cs = field->charset();
 +	/* Tell InnoDB server that there might be work for
 +	utility threads: */
  
 -			lenlen = (ulint)
 -				(((Field_varstring*)field)->length_bytes);
 +	innobase_active_small();
  
 -			data = row_mysql_read_true_varchar(&len,
 -				(byte*) (record
 -				+ (ulint)get_field_offset(table, field)),
 -				lenlen);
 +#ifdef WITH_WSREP
 +	if (error == DB_SUCCESS                          &&
 +	    wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
 +	    wsrep_on(user_thd)                           &&
 +	    !wsrep_thd_skip_append_keys(user_thd))
 +        {
 +		DBUG_PRINT("wsrep", ("update row key"));
  
 -			true_len = len;
 +		if (wsrep_append_keys(user_thd, false, old_row, new_row)) {
 +			WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED");
 +			DBUG_PRINT("wsrep", ("row key failed"));
 +			err = HA_ERR_INTERNAL_ERROR;
 +			goto wsrep_error;
 +		}
 +	}
 +wsrep_error:
 +#endif /* WITH_WSREP */
  
 -			/* For multi byte character sets we need to calculate
 -			the true length of the key */
 +	if (share->ib_table != prebuilt->table) {
 +		fprintf(stderr,
 +			"InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
 +			share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt);
 +	}
  
 -			if (len > 0 && cs->mbmaxlen > 1) {
 -				true_len = (ulint) cs->cset->well_formed_len(cs,
 -						(const char *) data,
 -						(const char *) data + len,
 -                                                (uint) (key_len /
 -                                                        cs->mbmaxlen),
 -						&error);
 -			}
 +	if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
  
 -			/* In a column prefix index, we may need to truncate
 -			the stored value: */
 +	DBUG_RETURN(err);
 +}
  
 -			if (true_len > key_len) {
 -				true_len = key_len;
 -			}
 +/**********************************************************************//**
 +Deletes a row given as the parameter.
 +@return	error number or 0 */
 +UNIV_INTERN
 +int
 +ha_innobase::delete_row(
 +/*====================*/
 +	const uchar*	record)	/*!< in: a row in MySQL format */
 +{
 +	dberr_t		error;
 +	trx_t*		trx = thd_to_trx(user_thd);
  
 -			/* The length in a key value is always stored in 2
 -			bytes */
 +	DBUG_ENTER("ha_innobase::delete_row");
  
 -			row_mysql_store_true_var_len((byte*)buff, true_len, 2);
 -			buff += 2;
 +	ut_a(prebuilt->trx == trx);
  
 -			memcpy(buff, data, true_len);
 +	if (high_level_read_only) {
 +		ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
 +		DBUG_RETURN(HA_ERR_TABLE_READONLY);
 +	} else if (!trx_is_started(trx)) {
 +		++trx->will_lock;
 +	}
  
 -			/* Note that we always reserve the maximum possible
 -			length of the true VARCHAR in the key value, though
 -			only len first bytes after the 2 length bytes contain
 -			actual data. The rest of the space was reset to zero
 -			in the bzero() call above. */
 +	ha_statistic_increment(&SSV::ha_delete_count);
  
 -			buff += key_len;
 +	if (UNIV_UNLIKELY(share && share->ib_table
 +			  && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
  
 -		} else if (mysql_type == MYSQL_TYPE_TINY_BLOB
 -			|| mysql_type == MYSQL_TYPE_MEDIUM_BLOB
 -			|| mysql_type == MYSQL_TYPE_BLOB
 -			|| mysql_type == MYSQL_TYPE_LONG_BLOB
 -			/* MYSQL_TYPE_GEOMETRY data is treated
 -			as BLOB data in innodb. */
 -			|| mysql_type == MYSQL_TYPE_GEOMETRY) {
 +	if (!prebuilt->upd_node) {
 +		row_get_prebuilt_update_vector(prebuilt);
 +	}
  
 -			CHARSET_INFO*	cs;
 -			ulint		key_len;
 -			ulint		true_len;
 -			int		error=0;
 -			ulint		blob_len;
 -			const byte*	blob_data;
 +	/* This is a delete */
  
 -			ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
 +	prebuilt->upd_node->is_delete = TRUE;
  
 -			key_len = key_part->length;
 +	innobase_srv_conc_enter_innodb(trx);
  
 -			if (is_null) {
 -				buff += key_len + 2;
 +	error = row_update_for_mysql((byte*) record, prebuilt);
  
 -				continue;
 -			}
 +	innobase_srv_conc_exit_innodb(trx);
  
 -			cs = field->charset();
 +	/* Tell the InnoDB server that there might be work for
 +	utility threads: */
  
 -			blob_data = row_mysql_read_blob_ref(&blob_len,
 -				(byte*) (record
 -				+ (ulint)get_field_offset(table, field)),
 -					(ulint) field->pack_length());
 +	innobase_active_small();
  
 -			true_len = blob_len;
 +#ifdef WITH_WSREP
 +	if (error == DB_SUCCESS                          &&
 +            wsrep_thd_exec_mode(user_thd) == LOCAL_STATE &&
 +            wsrep_on(user_thd)                           &&
 +            !wsrep_thd_skip_append_keys(user_thd))
 +        {
 +		if (wsrep_append_keys(user_thd, false, record, NULL)) {
 +			DBUG_PRINT("wsrep", ("delete fail"));
 +			error = DB_ERROR;
 +			goto wsrep_error;
 +		}
 +	}
 +wsrep_error:
 +#endif /* WITH_WSREP */
  
 -			ut_a(get_field_offset(table, field)
 -				== key_part->offset);
 +	if (UNIV_UNLIKELY(share && share->ib_table
 +			  && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
  
 -			/* For multi byte character sets we need to calculate
 -			the true length of the key */
 +	DBUG_RETURN(convert_error_code_to_mysql(
 +			    error, prebuilt->table->flags, user_thd));
 +}
  
 -			if (blob_len > 0 && cs->mbmaxlen > 1) {
 -				true_len = (ulint) cs->cset->well_formed_len(cs,
 -						(const char *) blob_data,
 -						(const char *) blob_data
 -							+ blob_len,
 -                                                (uint) (key_len /
 -                                                        cs->mbmaxlen),
 -						&error);
 -			}
 +/**********************************************************************//**
 +Removes a new lock set on a row, if it was not read optimistically. This can
 +be called after a row has been read in the processing of an UPDATE or a DELETE
 +query, if the option innodb_locks_unsafe_for_binlog is set. */
 +UNIV_INTERN
 +void
 +ha_innobase::unlock_row(void)
 +/*=========================*/
 +{
 +	DBUG_ENTER("ha_innobase::unlock_row");
  
 -			/* All indexes on BLOB and TEXT are column prefix
 -			indexes, and we may need to truncate the data to be
 -			stored in the key value: */
 +	/* Consistent read does not take any locks, thus there is
 +	nothing to unlock. */
  
 -			if (true_len > key_len) {
 -				true_len = key_len;
 -			}
 +	if (prebuilt->select_lock_type == LOCK_NONE) {
 +		DBUG_VOID_RETURN;
 +	}
  
 -			/* MySQL reserves 2 bytes for the length and the
 -			storage of the number is little-endian */
 +	/* Ideally, this assert must be in the beginning of the function.
 +	But there are some calls to this function from the SQL layer when the
 +	transaction is in state TRX_STATE_NOT_STARTED.  The check on
 +	prebuilt->select_lock_type above gets around this issue. */
 +	ut_ad(trx_state_eq(prebuilt->trx, TRX_STATE_ACTIVE));
  
 -			innobase_write_to_2_little_endian(
 -					(byte*)buff, true_len);
 -			buff += 2;
 +	switch (prebuilt->row_read_type) {
 +	case ROW_READ_WITH_LOCKS:
 +		if (!srv_locks_unsafe_for_binlog
 +		    && prebuilt->trx->isolation_level
 +		    > TRX_ISO_READ_COMMITTED) {
 +			break;
 +		}
 +		/* fall through */
 +	case ROW_READ_TRY_SEMI_CONSISTENT:
 +		row_unlock_for_mysql(prebuilt, FALSE);
 +		break;
 +	case ROW_READ_DID_SEMI_CONSISTENT:
 +		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
 +		break;
 +	}
  
 -			memcpy(buff, blob_data, true_len);
 +	DBUG_VOID_RETURN;
 +}
  
 -			/* Note that we always reserve the maximum possible
 -			length of the BLOB prefix in the key value. */
 +/* See handler.h and row0mysql.h for docs on this function. */
 +UNIV_INTERN
 +bool
 +ha_innobase::was_semi_consistent_read(void)
 +/*=======================================*/
 +{
 +	return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT);
 +}
  
 -			buff += key_len;
 -		} else {
 -			/* Here we handle all other data types except the
 -			true VARCHAR, BLOB and TEXT. Note that the column
 -			value we store may be also in a column prefix
 -			index. */
 +/* See handler.h and row0mysql.h for docs on this function. */
 +UNIV_INTERN
 +void
 +ha_innobase::try_semi_consistent_read(bool yes)
 +/*===========================================*/
 +{
 +	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
  
 -			CHARSET_INFO*		cs;
 -			ulint			true_len;
 -			ulint			key_len;
 -			const uchar*		src_start;
 -			int			error=0;
 -			enum_field_types	real_type;
 +	/* Row read type is set to semi consistent read if this was
 +	requested by the MySQL and either innodb_locks_unsafe_for_binlog
 +	option is used or this session is using READ COMMITTED isolation
 +	level. */
  
 -			key_len = key_part->length;
 +	if (yes
 +	    && (srv_locks_unsafe_for_binlog
 +		|| prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) {
 +		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
 +	} else {
 +		prebuilt->row_read_type = ROW_READ_WITH_LOCKS;
 +	}
 +}
  
 -			if (is_null) {
 -				 buff += key_len;
 +/******************************************************************//**
 +Initializes a handle to use an index.
 +@return	0 or error number */
 +UNIV_INTERN
 +int
 +ha_innobase::index_init(
 +/*====================*/
 +	uint	keynr,	/*!< in: key (index) number */
 +	bool sorted)	/*!< in: 1 if result MUST be sorted according to index */
 +{
 +	DBUG_ENTER("index_init");
  
 -				 continue;
 -			}
 +	DBUG_RETURN(change_active_index(keynr));
 +}
  
 -			src_start = record + key_part->offset;
 -			real_type = field->real_type();
 -			true_len = key_len;
 +/******************************************************************//**
 +Resets active_index and in_range_check_pushed_down; closes ds_mrr.
 +@return	0 */
 +UNIV_INTERN
 +int
 +ha_innobase::index_end(void)
 +/*========================*/
 +{
 +	int	error	= 0;
 +	DBUG_ENTER("index_end");
 +	active_index = MAX_KEY;
 +	in_range_check_pushed_down = FALSE;
 +	ds_mrr.dsmrr_close();
 +	DBUG_RETURN(error);
 +}
  
 -			/* Character set for the field is defined only
 -			to fields whose type is string and real field
 -			type is not enum or set. For these fields check
 -			if character set is multi byte. */
 +/*********************************************************************//**
 +Converts a search mode flag understood by MySQL to a flag understood
 +by InnoDB. */
 +static inline
 +ulint
 +convert_search_mode_to_innobase(
 +/*============================*/
 +	enum ha_rkey_function	find_flag)
 +{
 +	switch (find_flag) {
 +	case HA_READ_KEY_EXACT:
 +		/* this does not require the index to be UNIQUE */
 +		return(PAGE_CUR_GE);
 +	case HA_READ_KEY_OR_NEXT:
 +		return(PAGE_CUR_GE);
 +	case HA_READ_KEY_OR_PREV:
 +		return(PAGE_CUR_LE);
 +	case HA_READ_AFTER_KEY:
 +		return(PAGE_CUR_G);
 +	case HA_READ_BEFORE_KEY:
 +		return(PAGE_CUR_L);
 +	case HA_READ_PREFIX:
 +		return(PAGE_CUR_GE);
 +	case HA_READ_PREFIX_LAST:
 +		return(PAGE_CUR_LE);
 +	case HA_READ_PREFIX_LAST_OR_PREV:
 +		return(PAGE_CUR_LE);
 +		/* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always
 +		pass a complete-field prefix of a key value as the search
 +		tuple. I.e., it is not allowed that the last field would
 +		just contain n first bytes of the full field value.
 +		MySQL uses a 'padding' trick to convert LIKE 'abc%'
 +		type queries so that it can use as a search tuple
 +		a complete-field-prefix of a key value. Thus, the InnoDB
 +		search mode PAGE_CUR_LE_OR_EXTENDS is never used.
 +		TODO: when/if MySQL starts to use also partial-field
 +		prefixes, we have to deal with stripping of spaces
 +		and comparison of non-latin1 char type fields in
 +		innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to
 +		work correctly. */
 +	case HA_READ_MBR_CONTAIN:
 +	case HA_READ_MBR_INTERSECT:
 +	case HA_READ_MBR_WITHIN:
 +	case HA_READ_MBR_DISJOINT:
 +	case HA_READ_MBR_EQUAL:
 +		return(PAGE_CUR_UNSUPP);
 +	/* do not use "default:" in order to produce a gcc warning:
 +	enumeration value '...' not handled in switch
 +	(if -Wswitch or -Wall is used) */
 +	}
  
 -			if (real_type != MYSQL_TYPE_ENUM
 -				&& real_type != MYSQL_TYPE_SET
 -				&& ( mysql_type == MYSQL_TYPE_VAR_STRING
 -					|| mysql_type == MYSQL_TYPE_STRING)) {
 +	my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
  
 -				cs = field->charset();
 +	return(PAGE_CUR_UNSUPP);
 +}
 +
 +/*
 +   BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED
 +   ---------------------------------------------------
 +The following does not cover all the details, but explains how we determine
 +the start of a new SQL statement, and what is associated with it.
  
 -				/* For multi byte character sets we need to
 -				calculate the true length of the key */
 +For each table in the database the MySQL interpreter may have several
 +table handle instances in use, also in a single SQL query. For each table
 +handle instance there is an InnoDB  'prebuilt' struct which contains most
 +of the InnoDB data associated with this table handle instance.
  
 -				if (key_len > 0 && cs->mbmaxlen > 1) {
 +  A) if the user has not explicitly set any MySQL table level locks:
  
 -					true_len = (ulint)
 -						cs->cset->well_formed_len(cs,
 -							(const char *)src_start,
 -							(const char *)src_start
 -								+ key_len,
 -                                                        (uint) (key_len /
 -                                                                cs->mbmaxlen),
 -							&error);
 -				}
 -			}
 +  1) MySQL calls ::external_lock to set an 'intention' table level lock on
 +the table of the handle instance. There we set
 +prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
 +true if we are taking this table handle instance to use in a new SQL
 +statement issued by the user. We also increment trx->n_mysql_tables_in_use.
  
 -			memcpy(buff, src_start, true_len);
 -			buff += true_len;
 +  2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
 +instructions to prebuilt->template of the table handle instance in
 +::index_read. The template is used to save CPU time in large joins.
  
 -			/* Pad the unused space with spaces. */
 +  3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we
 +allocate a new consistent read view for the trx if it does not yet have one,
 +or in the case of a locking read, set an InnoDB 'intention' table level
 +lock on the table.
  
 -			if (true_len < key_len) {
 -				ulint	pad_len = key_len - true_len;
 -				ut_a(!(pad_len % cs->mbminlen));
 +  4) We do the SELECT. MySQL may repeatedly call ::index_read for the
 +same table handle instance, if it is a join.
  
 -				cs->cset->fill(cs, buff, pad_len,
 -					       0x20 /* space */);
 -				buff += pad_len;
 -			}
 -		}
 -	}
 +  5) When the SELECT ends, MySQL removes its intention table level locks
 +in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
 + (a) we execute a COMMIT there if the autocommit is on,
 + (b) we also release possible 'SQL statement level resources' InnoDB may
 +have for this SQL statement. The MySQL interpreter does NOT execute
 +autocommit for pure read transactions, though it should. That is why the
 +table handler in that case has to execute the COMMIT in ::external_lock.
  
 -	ut_a(buff <= buff_start + buff_len);
 +  B) If the user has explicitly set MySQL table level locks, then MySQL
 +does NOT call ::external_lock at the start of the statement. To determine
 +when we are at the start of a new SQL statement, we also compare, at the
 +start of ::index_read, the query id to the latest query id where the
 +table handle instance was used. If it has changed, we know we are at the
 +start of a new SQL statement. Since the query id can theoretically
 +wrap around, we use this test only as a secondary way of determining the
 +start of a new SQL statement. */
  
 -	DBUG_RETURN((uint)(buff - buff_start));
 -}
  
 -/**************************************************************//**
 -Determines if a field is needed in a prebuilt struct 'template'.
 -@return field to use, or NULL if the field is not needed */
 -static
 -const Field*
 -build_template_needs_field(
 -/*=======================*/
 -	ibool		index_contains,	/*!< in:
 -					dict_index_contains_col_or_prefix(
 -					index, i) */
 -	ibool		read_just_key,	/*!< in: TRUE when MySQL calls
 -					ha_innobase::extra with the
 -					argument HA_EXTRA_KEYREAD; it is enough
 -					to read just columns defined in
 -					the index (i.e., no read of the
 -					clustered index record necessary) */
 -	ibool		fetch_all_in_key,
 -					/*!< in: true=fetch all fields in
 -					the index */
 -	ibool		fetch_primary_key_cols,
 -					/*!< in: true=fetch the
 -					primary key columns */
 -	dict_index_t*	index,		/*!< in: InnoDB index to use */
 -	const TABLE*	table,		/*!< in: MySQL table object */
 -	ulint		i,		/*!< in: field index in InnoDB table */
 -	ulint		sql_idx)	/*!< in: field index in SQL table */
 +/**********************************************************************//**
 +Positions an index cursor to the index specified in the handle. Fetches the
 +row if any.
 +@return	0, HA_ERR_KEY_NOT_FOUND, or error number */
 +UNIV_INTERN
 +int
 +ha_innobase::index_read(
 +/*====================*/
 +	uchar*		buf,		/*!< in/out: buffer for the returned
 +					row */
 +	const uchar*	key_ptr,	/*!< in: key value; if this is NULL
 +					we position the cursor at the
 +					start or end of index; this can
 +					also contain an InnoDB row id, in
 +					which case key_len is the InnoDB
 +					row id length; the key value can
 +					also be a prefix of a full key value,
 +					and the last column can be a prefix
 +					of a full column */
 +	uint			key_len,/*!< in: key value length */
 +	enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
  {
 -	const Field*	field	= table->field[sql_idx];
 +	ulint		mode;
 +	dict_index_t*	index;
 +	ulint		match_mode	= 0;
 +	int		error;
 +	dberr_t		ret;
  
 -	ut_ad(index_contains == dict_index_contains_col_or_prefix(index, i));
 +	DBUG_ENTER("index_read");
 +	DEBUG_SYNC_C("ha_innobase_index_read_begin");
  
 -	if (!index_contains) {
 -		if (read_just_key) {
 -			/* If this is a 'key read', we do not need
 -			columns that are not in the key */
 +	ut_a(prebuilt->trx == thd_to_trx(user_thd));
 +	ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
  
 -			return(NULL);
 -		}
 -	} else if (fetch_all_in_key) {
 -		/* This field is needed in the query */
 +	ha_statistic_increment(&SSV::ha_read_key_count);
  
 -		return(field);
 +	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
 +			  && share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
  	}
  
 -	if (bitmap_is_set(table->read_set, sql_idx)
 -	    || bitmap_is_set(table->write_set, sql_idx)) {
 -		/* This field is needed in the query */
 +	index = prebuilt->index;
  
 -		return(field);
 +	if (UNIV_UNLIKELY(index == NULL) || dict_index_is_corrupted(index)) {
 +		prebuilt->index_usable = FALSE;
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
 +	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
 +		DBUG_RETURN(dict_index_is_corrupted(index)
 +			    ? HA_ERR_INDEX_CORRUPT
 +			    : HA_ERR_TABLE_DEF_CHANGED);
  	}
  
 -	if (fetch_primary_key_cols
 -	    && dict_table_col_in_clustered_key(index->table, i)) {
 -		/* This field is needed in the query */
 +	if (index->type & DICT_FTS) {
 +		DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
 +	}
  
 -		return(field);
 +	/* Note that the index for which the search template is built is not
 +	necessarily prebuilt->index, but can also be the clustered index */
 +
 +	if (prebuilt->sql_stat_start) {
 +		build_template(false);
  	}
  
 -	/* This field is not needed in the query, skip it */
 +	if (key_ptr) {
 +		/* Convert the search key value to InnoDB format into
 +		prebuilt->search_tuple */
  
 -	return(NULL);
 -}
 +		row_sel_convert_mysql_key_to_innobase(
 +			prebuilt->search_tuple,
 +			prebuilt->srch_key_val1,
 +			prebuilt->srch_key_val_len,
 +			index,
 +			(byte*) key_ptr,
 +			(ulint) key_len,
 +			prebuilt->trx);
 +		DBUG_ASSERT(prebuilt->search_tuple->n_fields > 0);
 +	} else {
 +		/* We position the cursor to the last or the first entry
 +		in the index */
  
 -/**************************************************************//**
 -Adds a field is to a prebuilt struct 'template'.
 -@return the field template */
 -static
 -mysql_row_templ_t*
 -build_template_field(
 -/*=================*/
 -	row_prebuilt_t*	prebuilt,	/*!< in/out: template */
 -	dict_index_t*	clust_index,	/*!< in: InnoDB clustered index */
 -	dict_index_t*	index,		/*!< in: InnoDB index to use */
 -	TABLE*		table,		/*!< in: MySQL table object */
 -	const Field*	field,		/*!< in: field in MySQL table */
 -	ulint		i)		/*!< in: field index in InnoDB table */
 -{
 -	mysql_row_templ_t*	templ;
 -	const dict_col_t*	col;
 +		dtuple_set_n_fields(prebuilt->search_tuple, 0);
 +	}
  
 -	//ut_ad(field == table->field[i]);
 -	ut_ad(clust_index->table == index->table);
 +	mode = convert_search_mode_to_innobase(find_flag);
  
 -	col = dict_table_get_nth_col(index->table, i);
 +	match_mode = 0;
  
 -	templ = prebuilt->mysql_template + prebuilt->n_template++;
 -	UNIV_MEM_INVALID(templ, sizeof *templ);
 -	templ->col_no = i;
 -	templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index);
 -	ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
 +	if (find_flag == HA_READ_KEY_EXACT) {
  
 -	if (dict_index_is_clust(index)) {
 -		templ->rec_field_no = templ->clust_rec_field_no;
 -	} else {
 -		templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
 -	}
 +		match_mode = ROW_SEL_EXACT;
  
 -	if (field->null_ptr) {
 -		templ->mysql_null_byte_offset =
 -			(ulint) ((char*) field->null_ptr
 -				 - (char*) table->record[0]);
 +	} else if (find_flag == HA_READ_PREFIX
 +		   || find_flag == HA_READ_PREFIX_LAST) {
  
 -		templ->mysql_null_bit_mask = (ulint) field->null_bit;
 -	} else {
 -		templ->mysql_null_bit_mask = 0;
 +		match_mode = ROW_SEL_EXACT_PREFIX;
  	}
  
 -	templ->mysql_col_offset = (ulint) get_field_offset(table, field);
 +	last_match_mode = (uint) match_mode;
  
 -	templ->mysql_col_len = (ulint) field->pack_length();
 -	templ->type = col->mtype;
 -	templ->mysql_type = (ulint)field->type();
 +	if (mode != PAGE_CUR_UNSUPP) {
  
 -	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
 -		templ->mysql_length_bytes = (ulint)
 -			(((Field_varstring*)field)->length_bytes);
 -	}
 +		innobase_srv_conc_enter_innodb(prebuilt->trx);
  
 -	templ->charset = dtype_get_charset_coll(col->prtype);
 -	templ->mbminlen = col->mbminlen;
 -	templ->mbmaxlen = col->mbmaxlen;
 -	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
 +		ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
 +					   match_mode, 0);
  
 -	if (!dict_index_is_clust(index)
 -	    && templ->rec_field_no == ULINT_UNDEFINED) {
 -		prebuilt->need_to_access_clustered = TRUE;
 +		innobase_srv_conc_exit_innodb(prebuilt->trx);
 +	} else {
 +
 +		ret = DB_UNSUPPORTED;
  	}
  
 -	if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
 -	    + templ->mysql_col_len) {
 -		prebuilt->mysql_prefix_len = templ->mysql_col_offset
 -			+ templ->mysql_col_len;
 +	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
 +			  && share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
  	}
  
 -	if (templ->type == DATA_BLOB) {
 -		prebuilt->templ_contains_blob = TRUE;
 +	switch (ret) {
 +	case DB_SUCCESS:
 +		error = 0;
 +		table->status = 0;
 +		if (prebuilt->table->is_system_db) {
 +			srv_stats.n_system_rows_read.add(
 +				(size_t) prebuilt->trx->id, 1);
 +		} else {
 +			srv_stats.n_rows_read.add(
 +				(size_t) prebuilt->trx->id, 1);
 +		}
 +		break;
 +	case DB_RECORD_NOT_FOUND:
 +		error = HA_ERR_KEY_NOT_FOUND;
 +		table->status = STATUS_NOT_FOUND;
 +		break;
 +	case DB_END_OF_INDEX:
 +		error = HA_ERR_KEY_NOT_FOUND;
 +		table->status = STATUS_NOT_FOUND;
 +		break;
 +	case DB_TABLESPACE_DELETED:
 +
 +		ib_senderrf(
 +			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
 +			ER_TABLESPACE_DISCARDED,
 +			table->s->table_name.str);
 +
 +		table->status = STATUS_NOT_FOUND;
 +		error = HA_ERR_NO_SUCH_TABLE;
 +		break;
 +	case DB_TABLESPACE_NOT_FOUND:
 +
 +		ib_senderrf(
 +			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
 +			ER_TABLESPACE_MISSING, MYF(0),
 +			table->s->table_name.str);
 +
 +		table->status = STATUS_NOT_FOUND;
 +		error = HA_ERR_NO_SUCH_TABLE;
 +		break;
 +	default:
 +		error = convert_error_code_to_mysql(
 +			ret, prebuilt->table->flags, user_thd);
 +
 +		table->status = STATUS_NOT_FOUND;
 +		break;
  	}
  
 -	return(templ);
 +	DBUG_RETURN(error);
  }
  
 -/**************************************************************//**
 -Builds a 'template' to the prebuilt struct. The template is used in fast
 -retrieval of just those column values MySQL needs in its processing. */
 +/*******************************************************************//**
 +The following function works like index_read, but it finds the last
 +row with the current key value or prefix.
 +@return	0, HA_ERR_KEY_NOT_FOUND, or an error code */
  UNIV_INTERN
 -void
 -ha_innobase::build_template(
 -/*========================*/
 -	bool		whole_row)	/*!< in: true=ROW_MYSQL_WHOLE_ROW,
 -					false=ROW_MYSQL_REC_FIELDS */
 +int
 +ha_innobase::index_read_last(
 +/*=========================*/
 +	uchar*		buf,	/*!< out: fetched row */
 +	const uchar*	key_ptr,/*!< in: key value, or a prefix of a full
 +				key value */
 +	uint		key_len)/*!< in: length of the key val or prefix
 +				in bytes */
  {
 -	dict_index_t*	index;
 -	dict_index_t*	clust_index;
 -	ulint		n_stored_fields;
 -	ibool		fetch_all_in_key	= FALSE;
 -	ibool		fetch_primary_key_cols	= FALSE;
 -	ulint		i, sql_idx;
 -       
 -	if (prebuilt->select_lock_type == LOCK_X) {
 -		/* We always retrieve the whole clustered index record if we
 -		use exclusive row level locks, for example, if the read is
 -		done in an UPDATE statement. */
 +	return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
 +}
  
 -		whole_row = true;
 -	} else if (!whole_row) {
 -		if (prebuilt->hint_need_to_fetch_extra_cols
 -			== ROW_RETRIEVE_ALL_COLS) {
 +/********************************************************************//**
 +Get the index for a handle. Does not change active index.
 +@return	NULL or index instance. */
 +UNIV_INTERN
 +dict_index_t*
 +ha_innobase::innobase_get_index(
 +/*============================*/
 +	uint		keynr)	/*!< in: use this index; MAX_KEY means always
 +				clustered index, even if it was internally
 +				generated by InnoDB */
 +{
 +	KEY*		key = 0;
 +	dict_index_t*	index = 0;
  
 -			/* We know we must at least fetch all columns in the
 -			key, or all columns in the table */
 +	DBUG_ENTER("innobase_get_index");
  
 -			if (prebuilt->read_just_key) {
 -				/* MySQL has instructed us that it is enough
 -				to fetch the columns in the key; looks like
 -				MySQL can set this flag also when there is
 -				only a prefix of the column in the key: in
 -				that case we retrieve the whole column from
 -				the clustered index */
 +	if (keynr != MAX_KEY && table->s->keys > 0) {
 +		key = table->key_info + keynr;
  
 -				fetch_all_in_key = TRUE;
 -			} else {
 -				whole_row = true;
 -			}
 -		} else if (prebuilt->hint_need_to_fetch_extra_cols
 -			== ROW_RETRIEVE_PRIMARY_KEY) {
 -			/* We must at least fetch all primary key cols. Note
 -			   that if the clustered index was internally generated
 -			   by InnoDB on the row id (no primary key was
 -			   defined), then row_search_for_mysql() will always
 -			   retrieve the row id to a special buffer in the
 -			   prebuilt struct. */
 +		index = innobase_index_lookup(share, keynr);
  
 -			fetch_primary_key_cols = TRUE;
 -		}
 -	}
 +		if (index) {
  
 -	clust_index = dict_table_get_first_index(prebuilt->table);
 +			if (!key || ut_strcmp(index->name, key->name) != 0) {
 +				fprintf(stderr, "InnoDB: [Error] Index for key no %u"
 +					" mysql name %s , InnoDB name %s for table %s\n",
 +					keynr, key ? key->name : "NULL",
 +					index->name,
 +					prebuilt->table->name);
  
 -	index = whole_row ? clust_index : prebuilt->index;
 +				for(ulint i=0; i < table->s->keys; i++) {
 +					index = innobase_index_lookup(share, i);
 +					key = table->key_info + keynr;
  
 -	prebuilt->need_to_access_clustered = (index == clust_index);
 +					if (index) {
  
 -	/* Below we check column by column if we need to access
 -	the clustered index. */
 +						fprintf(stderr, "InnoDB: [Note] Index for key no %u"
 +							" mysql name %s , InnoDB name %s for table %s\n",
 +							keynr, key ? key->name : "NULL",
 +							index->name,
 +							prebuilt->table->name);
 +					}
 +				}
 +			}
  
 -	n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
 +			ut_a(ut_strcmp(index->name, key->name) == 0);
 +		} else {
 +			/* Can't find index with keynr in the translation
 +			table. Only print message if the index translation
 +			table exists */
 +			if (share->idx_trans_tbl.index_mapping) {
 +				sql_print_warning("InnoDB could not find "
 +						  "index %s key no %u for "
 +						  "table %s through its "
 +						  "index translation table",
 +						  key ? key->name : "NULL",
 +						  keynr,
 +						  prebuilt->table->name);
 +			}
  
 -	if (!prebuilt->mysql_template) {
 -		prebuilt->mysql_template = (mysql_row_templ_t*)
 -			mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
 +			index = dict_table_get_index_on_name(prebuilt->table,
 +							     key->name);
 +		}
 +	} else {
 +		index = dict_table_get_first_index(prebuilt->table);
  	}
  
 -	prebuilt->template_type = whole_row
 -		? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
 -	prebuilt->null_bitmap_len = table->s->null_bytes;
 -
 -	/* Prepare to build prebuilt->mysql_template[]. */
 -	prebuilt->templ_contains_blob = FALSE;
 -	prebuilt->mysql_prefix_len = 0;
 -	prebuilt->n_template = 0;
 -	prebuilt->idx_cond_n_cols = 0;
 +	if (!index) {
 +		sql_print_error(
 +			"Innodb could not find key n:o %u with name %s "
 +			"from dict cache for table %s",
 +			keynr, key ? key->name : "NULL",
 +			prebuilt->table->name);
 +	}
  
 -	/* Note that in InnoDB, i is the column number in the table.
 -	MySQL calls columns 'fields'. */
 +	DBUG_RETURN(index);
 +}
  
 -	if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
 -		/* Push down an index condition or an end_range check. */
 -	  for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 +/********************************************************************//**
 +Changes the active index of a handle.
 +@return	0 or error code */
 +UNIV_INTERN
 +int
 +ha_innobase::change_active_index(
 +/*=============================*/
 +	uint	keynr)	/*!< in: use this index; MAX_KEY means always clustered
 +			index, even if it was internally generated by
 +			InnoDB */
 +{
 +	DBUG_ENTER("change_active_index");
  
 -	                while (!table->field[sql_idx]->stored_in_db) {
 -			        sql_idx++;     
 -                        }
 -                       
 -			const ibool		index_contains
 -				= dict_index_contains_col_or_prefix(index, i);
 +	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
 +			  && share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
  
 -			/* Test if an end_range or an index condition
 -			refers to the field. Note that "index" and
 -			"index_contains" may refer to the clustered index.
 -			Index condition pushdown is relative to prebuilt->index
 -			(the index that is being looked up first). */
 +	ut_ad(user_thd == ha_thd());
 +	ut_a(prebuilt->trx == thd_to_trx(user_thd));
  
 -			/* When join_read_always_key() invokes this
 -			code via handler::ha_index_init() and
 -			ha_innobase::index_init(), end_range is not
 -			yet initialized. Because of that, we must
 -			always check for index_contains, instead of
 -			the subset
 -			field->part_of_key.is_set(active_index)
 -			which would be acceptable if end_range==NULL. */
 -			if (index == prebuilt->index
 -				? index_contains
 -				: dict_index_contains_col_or_prefix(
 -					prebuilt->index, i)) {
 -				/* Needed in ICP */
 -				const Field*		field;
 -				mysql_row_templ_t*	templ;
 +	active_index = keynr;
  
 -				if (whole_row) {
 -					field = table->field[sql_idx];
 -				} else {
 -					field = build_template_needs_field(
 -						index_contains,
 -						prebuilt->read_just_key,
 -						fetch_all_in_key,
 -						fetch_primary_key_cols,
 -						index, table, i, sql_idx);
 -					if (!field) {
 -						continue;
 -					}
 -				}
 +	prebuilt->index = innobase_get_index(keynr);
  
 -				templ = build_template_field(
 -					prebuilt, clust_index, index,
 -					table, field, i);
 -				prebuilt->idx_cond_n_cols++;
 -				ut_ad(prebuilt->idx_cond_n_cols
 -				      == prebuilt->n_template);
 +	if (UNIV_UNLIKELY(!prebuilt->index)) {
 +		sql_print_warning("InnoDB: change_active_index(%u) failed",
 +				  keynr);
 +		prebuilt->index_usable = FALSE;
 +		DBUG_RETURN(1);
 +	}
  
 -				if (index == prebuilt->index) {
 -					templ->icp_rec_field_no
 -						= templ->rec_field_no;
 -				} else {
 -					templ->icp_rec_field_no
 -						= dict_index_get_nth_col_pos(
 -							prebuilt->index, i);
 -				}
 +	prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx,
 +							   prebuilt->index);
  
 -				if (dict_index_is_clust(prebuilt->index)) {
 -					ut_ad(templ->icp_rec_field_no
 -					      != ULINT_UNDEFINED);
 -					/* If the primary key includes
 -					a column prefix, use it in
 -					index condition pushdown,
 -					because the condition is
 -					evaluated before fetching any
 -					off-page (externally stored)
 -					columns. */
 -					if (templ->icp_rec_field_no
 -					    < prebuilt->index->n_uniq) {
 -						/* This is a key column;
 -						all set. */
 -						continue;
 -					}
 -				} else if (templ->icp_rec_field_no
 -					   != ULINT_UNDEFINED) {
 -					continue;
 -				}
 +	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
 +		if (dict_index_is_corrupted(prebuilt->index)) {
 +			char index_name[MAX_FULL_NAME_LEN + 1];
 +			char table_name[MAX_FULL_NAME_LEN + 1];
  
 -				/* This is a column prefix index.
 -				The column prefix can be used in
 -				an end_range comparison. */
 +			innobase_format_name(
 +				index_name, sizeof index_name,
 +				prebuilt->index->name, TRUE);
  
 -				templ->icp_rec_field_no
 -					= dict_index_get_nth_col_or_prefix_pos(
 -						prebuilt->index, i, TRUE);
 -				ut_ad(templ->icp_rec_field_no
 -				      != ULINT_UNDEFINED);
 +			innobase_format_name(
 +				table_name, sizeof table_name,
 +				prebuilt->index->table->name, FALSE);
  
 -				/* Index condition pushdown can be used on
 -				all columns of a secondary index, and on
 -				the PRIMARY KEY columns. */
 -				/* TODO: enable this assertion
 -				(but first ensure that end_range is
 -				valid here and use an accurate condition
 -				for end_range) 
 -				ut_ad(!dict_index_is_clust(prebuilt->index)
 -				      || templ->rec_field_no
 -				      < prebuilt->index->n_uniq);
 -				*/
 -			}
 +			push_warning_printf(
 +				user_thd, Sql_condition::WARN_LEVEL_WARN,
 +				HA_ERR_INDEX_CORRUPT,
 +				"InnoDB: Index %s for table %s is"
 +				" marked as corrupted",
 +				index_name, table_name);
 +			DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
 +		} else {
 +			push_warning_printf(
 +				user_thd, Sql_condition::WARN_LEVEL_WARN,
 +				HA_ERR_TABLE_DEF_CHANGED,
 +				"InnoDB: insufficient history for index %u",
 +				keynr);
  		}
  
 -		ut_ad(prebuilt->idx_cond_n_cols > 0);
 -		ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
 -
 -		/* Include the fields that are not needed in index condition
 -		pushdown. */
 -                for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 -
 -		        while (!table->field[sql_idx]->stored_in_db) {
 -			        sql_idx++;     
 -                        }
 -                       
 -			const ibool		index_contains
 -				= dict_index_contains_col_or_prefix(index, i);
 -
 -			if (index == prebuilt->index
 -				? !index_contains
 -				: !dict_index_contains_col_or_prefix(
 -					prebuilt->index, i)) {
 -				/* Not needed in ICP */
 -				const Field*	field;
 +		/* The caller seems to ignore this.  Thus, we must check
 +		this again in row_search_for_mysql(). */
 +		DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
 +                                                        0, NULL));
 +	}
  
 -				if (whole_row) {
 -					field = table->field[sql_idx];
 -				} else {
 -					field = build_template_needs_field(
 -						index_contains,
 -						prebuilt->read_just_key,
 -						fetch_all_in_key,
 -						fetch_primary_key_cols,
 -						index, table, i, sql_idx);
 -					if (!field) {
 -						continue;
 -					}
 -				}
 +	ut_a(prebuilt->search_tuple != 0);
  
 -				build_template_field(prebuilt,
 -						     clust_index, index,
 -						     table, field, i);
 -			}
 -		}
 +	dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields);
  
 -		prebuilt->idx_cond = this;
 - 	} else {
 -		/* No index condition pushdown */
 -		prebuilt->idx_cond = NULL;
 +	dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
 +			      prebuilt->index->n_fields);
  
 -                for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 -			const Field*	field;
 +	/* MySQL changes the active index for a handle also during some
 +	queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
 +	and then calculates the sum. Previously we played safe and used
 +	the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
 +	copying. Starting from MySQL-4.1 we use a more efficient flag here. */
  
 -	                while (!table->field[sql_idx]->stored_in_db) {
 -			        sql_idx++;     
 -                        }
 +	build_template(false);
  
 -			if (whole_row) {
 -				field = table->field[sql_idx];
 -			} else {
 -				field = build_template_needs_field(
 -					dict_index_contains_col_or_prefix(
 -						index, i),
 -					prebuilt->read_just_key,
 -					fetch_all_in_key,
 -					fetch_primary_key_cols,
 -					index, table, i, sql_idx);
 -				if (!field) {
 -					continue;
 -				}
 -			}
 +	DBUG_RETURN(0);
 +}
  
 -			build_template_field(prebuilt, clust_index, index,
 -					     table, field, i);
 -		}
 -        }
 +/**********************************************************************//**
 +Positions an index cursor to the index specified in keynr. Fetches the
 +row if any.
 +??? This is only used to read whole keys ???
 +@return	error number or 0 */
 +UNIV_INTERN
 +int
 +ha_innobase::index_read_idx(
 +/*========================*/
 +	uchar*		buf,		/*!< in/out: buffer for the returned
 +					row */
 +	uint		keynr,		/*!< in: use this index */
 +	const uchar*	key,		/*!< in: key value; if this is NULL
 +					we position the cursor at the
 +					start or end of index */
 +	uint		key_len,	/*!< in: key value length */
 +	enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
 +{
 +	if (change_active_index(keynr)) {
  
 -	if (index != clust_index && prebuilt->need_to_access_clustered) {
 -		/* Change rec_field_no's to correspond to the clustered index
 -		record */
 -		for (i = 0; i < prebuilt->n_template; i++) {
 -			mysql_row_templ_t*	templ
 -				= &prebuilt->mysql_template[i];
 -			templ->rec_field_no = templ->clust_rec_field_no;
 -		}
 +		return(1);
  	}
 +
 +	return(index_read(buf, key, key_len, find_flag));
  }
  
 -/********************************************************************//**
 -This special handling is really to overcome the limitations of MySQL's
 -binlogging. We need to eliminate the non-determinism that will arise in
 -INSERT ... SELECT type of statements, since MySQL binlog only stores the
 -min value of the autoinc interval. Once that is fixed we can get rid of
 -the special lock handling.
 -@return	DB_SUCCESS if all OK else error code */
 +/***********************************************************************//**
 +Reads the next or previous row from a cursor, which must have previously been
 +positioned using index_read.
 +@return	0, HA_ERR_END_OF_FILE, or error number */
  UNIV_INTERN
 -ulint
 -ha_innobase::innobase_lock_autoinc(void)
 -/*====================================*/
 +int
 +ha_innobase::general_fetch(
 +/*=======================*/
 +	uchar*	buf,		/*!< in/out: buffer for next row in MySQL
 +				format */
 +	uint	direction,	/*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */
 +	uint	match_mode)	/*!< in: 0, ROW_SEL_EXACT, or
 +				ROW_SEL_EXACT_PREFIX */
  {
 -	ulint		error = DB_SUCCESS;
 +	dberr_t	ret;
 +	int	error;
  
 -	switch (innobase_autoinc_lock_mode) {
 -	case AUTOINC_NO_LOCKING:
 -		/* Acquire only the AUTOINC mutex. */
 -		dict_table_autoinc_lock(prebuilt->table);
 -		break;
 +	DBUG_ENTER("general_fetch");
  
 -	case AUTOINC_NEW_STYLE_LOCKING:
 -		/* For simple (single/multi) row INSERTs/REPLACEs and RBR
 -		events, we fallback to the old style only if another
 -		transaction has already acquired the AUTOINC lock on
 -		behalf of a LOAD FILE or INSERT ... SELECT etc. type of
 -		statement. */
 -		if (thd_sql_command(user_thd) == SQLCOM_INSERT
 -		    || thd_sql_command(user_thd) == SQLCOM_REPLACE
 -		    || thd_sql_command(user_thd) == SQLCOM_END // RBR event
 -		) {
 -			dict_table_t*	table = prebuilt->table;
 +	/* If the transaction is not started, do not continue; instead return an error code. */
 +	if(!(prebuilt->sql_stat_start || (prebuilt->trx && prebuilt->trx->state == 1))) {
 +		DBUG_RETURN(HA_ERR_END_OF_FILE);
 +	}
  
 -			/* Acquire the AUTOINC mutex. */
 -			dict_table_autoinc_lock(table);
 +	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
 +			  && share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
  
 -			/* We need to check that another transaction isn't
 -			already holding the AUTOINC lock on the table. */
 -			if (table->n_waiting_or_granted_auto_inc_locks) {
 -				/* Release the mutex to avoid deadlocks and
 -				fall back to old style locking. */
 -				dict_table_autoinc_unlock(table);
 -			} else {
 -				/* Do not fall back to old style locking. */
 -				break;
 -			}
 -		}
 -		/* fall through */
 +	ut_a(prebuilt->trx == thd_to_trx(user_thd));
  
 -	case AUTOINC_OLD_STYLE_LOCKING:
 -		error = row_lock_table_autoinc_for_mysql(prebuilt);
 +	innobase_srv_conc_enter_innodb(prebuilt->trx);
  
 -		if (error == DB_SUCCESS) {
 +	ret = row_search_for_mysql(
 +		(byte*) buf, 0, prebuilt, match_mode, direction);
  
 -			/* Acquire the AUTOINC mutex. */
 -			dict_table_autoinc_lock(prebuilt->table);
 -		}
 +	innobase_srv_conc_exit_innodb(prebuilt->trx);
 +
 +	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
 +			  && share->ib_table && share->ib_table->is_corrupt)) {
 +		DBUG_RETURN(HA_ERR_CRASHED);
 +	}
 +
 +	switch (ret) {
 +	case DB_SUCCESS:
 +		error = 0;
 +		table->status = 0;
 +		srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
 +		break;
 +	case DB_RECORD_NOT_FOUND:
 +		error = HA_ERR_END_OF_FILE;
 +		table->status = STATUS_NOT_FOUND;
 +		break;
 +	case DB_END_OF_INDEX:
 +		error = HA_ERR_END_OF_FILE;
 +		table->status = STATUS_NOT_FOUND;
 +		break;
 +	case DB_TABLESPACE_DELETED:
 +
 +		ib_senderrf(
 +			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
 +			ER_TABLESPACE_DISCARDED,
 +			table->s->table_name.str);
 +
 +		table->status = STATUS_NOT_FOUND;
 +		error = HA_ERR_NO_SUCH_TABLE;
  		break;
 +	case DB_TABLESPACE_NOT_FOUND:
 +
 +		ib_senderrf(
 +			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
 +			ER_TABLESPACE_MISSING,
 +			table->s->table_name.str);
  
 +		table->status = STATUS_NOT_FOUND;
 +		error = HA_ERR_NO_SUCH_TABLE;
 +		break;
  	default:
 -		ut_error;
 +		error = convert_error_code_to_mysql(
 +			ret, prebuilt->table->flags, user_thd);
 +
 +		table->status = STATUS_NOT_FOUND;
 +		break;
  	}
  
 -	return(ulong(error));
 +	DBUG_RETURN(error);
 +}
 +
 +/***********************************************************************//**
 +Reads the next row from a cursor, which must have previously been
 +positioned using index_read.
 +@return	0, HA_ERR_END_OF_FILE, or error number */
 +UNIV_INTERN
 +int
 +ha_innobase::index_next(
 +/*====================*/
 +	uchar*		buf)	/*!< in/out: buffer for next row in MySQL
 +				format */
 +{
 +	return(general_fetch(buf, ROW_SEL_NEXT, 0));
 +}
 +
 +/*******************************************************************//**
 +Reads the next row matching to the key value given as the parameter.
 +@return	0, HA_ERR_END_OF_FILE, or error number */
 +UNIV_INTERN
 +int
 +ha_innobase::index_next_same(
 +/*=========================*/
 +	uchar*		buf,	/*!< in/out: buffer for the row */
 +	const uchar*	key,	/*!< in: key value */
 +	uint		keylen)	/*!< in: key value length */
 +{
 +	return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode));
 +}
 +
 +/***********************************************************************//**
 +Reads the previous row from a cursor, which must have previously been
 +positioned using index_read.
 +@return	0, HA_ERR_END_OF_FILE, or error number */
 +UNIV_INTERN
 +int
 +ha_innobase::index_prev(
 +/*====================*/
 +	uchar*	buf)	/*!< in/out: buffer for previous row in MySQL format */
 +{
 +	return(general_fetch(buf, ROW_SEL_PREV, 0));
  }
  
  /********************************************************************//**
diff --cc storage/xtradb/log/log0online.cc
index 1a30501f266,00000000000..1e373c8345f
mode 100644,000000..100644
--- a/storage/xtradb/log/log0online.cc
+++ b/storage/xtradb/log/log0online.cc
@@@ -1,1924 -1,0 +1,1921 @@@
 +/*****************************************************************************
 +
 +Copyright (c) 2011-2012 Percona Inc. All Rights Reserved.
 +
 +This program is free software; you can redistribute it and/or modify it under
 +the terms of the GNU General Public License as published by the Free Software
 +Foundation; version 2 of the License.
 +
 +This program is distributed in the hope that it will be useful, but WITHOUT
 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 +
 +You should have received a copy of the GNU General Public License along with
 +this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
 +Street, Fifth Floor, Boston, MA 02110-1301, USA
 +
 +*****************************************************************************/
 +
 +/**************************************************//**
 +@file log/log0online.cc
 +Online database log parsing for changed page tracking
 +
 +*******************************************************/
 +
 +#include "log0online.h"
 +
 +#include "my_dbug.h"
 +
 +#include "log0recv.h"
 +#include "mach0data.h"
 +#include "mtr0log.h"
 +#include "srv0srv.h"
 +#include "srv0start.h"
 +#include "trx0sys.h"
 +#include "ut0rbt.h"
 +
 +enum { FOLLOW_SCAN_SIZE = 4 * (UNIV_PAGE_SIZE_MAX) };
 +
 +#ifdef UNIV_PFS_MUTEX
 +/* Key to register log_bmp_sys->mutex with PFS */
 +UNIV_INTERN mysql_pfs_key_t	log_bmp_sys_mutex_key;
 +#endif /* UNIV_PFS_MUTEX */
 +
 +/** Log parsing and bitmap output data structure */
 +struct log_bitmap_struct {
 +	byte*		read_buf_ptr;	/*!< Unaligned log read buffer */
 +	byte*		read_buf;	/*!< log read buffer */
 +	byte		parse_buf[RECV_PARSING_BUF_SIZE];
 +					/*!< log parse buffer */
 +	byte*		parse_buf_end;  /*!< parse buffer position where the
 +					next read log data should be copied to.
 +					If the previous log records were fully
 +					parsed, it points to the start,
 +					otherwise points immediately past the
 +					end of the incomplete log record. */
 +	char		bmp_file_home[FN_REFLEN];
 +					/*!< directory for bitmap files */
 +	log_online_bitmap_file_t out;	/*!< The current bitmap file */
 +	ulint		out_seq_num;	/*!< the bitmap file sequence number */
 +	lsn_t		start_lsn;	/*!< the LSN of the next unparsed
 +					record and the start of the next LSN
 +					interval to be parsed.  */
 +	lsn_t		end_lsn;	/*!< the end of the LSN interval to be
 +					parsed, equal to the next checkpoint
 +					LSN at the time of parse */
 +	lsn_t		next_parse_lsn;	/*!< the LSN of the next unparsed
 +					record in the current parse */
 +	ib_rbt_t*	modified_pages; /*!< the current modified page set,
 +					organized as the RB-tree with the keys
 +					of (space, 4KB-block-start-page-id)
 +					pairs */
 +	ib_rbt_node_t*	page_free_list; /*!< Singly-linked list of freed nodes
 +					of modified_pages tree for later
 +					reuse.  Nodes are linked through
 +					ib_rbt_node_t.left as this field has
 +					both the correct type and the tree does
 +					not mind its overwrite during
 +					rbt_next() tree traversal. */
 +};
 +
 +/* The log parsing and bitmap output struct instance */
 +static struct log_bitmap_struct* log_bmp_sys;
 +
 +/* Mutex protecting log_bmp_sys */
 +static ib_mutex_t	log_bmp_sys_mutex;
 +
 +/** File name stem for bitmap files. */
 +static const char* bmp_file_name_stem = "ib_modified_log_";
 +
 +/** File name template for bitmap files.  The 1st format tag is a directory
 +name, the 2nd tag is the stem, the 3rd tag is a file sequence number, the 4th
 +tag is the start LSN for the file. */
 +static const char* bmp_file_name_template = "%s%s%lu_%llu.xdb";
 +
 +/* On server startup with empty database srv_start_lsn == 0, in
 +which case the first LSN of actual log records will be this. */
 +#define MIN_TRACKED_LSN ((LOG_START_LSN) + (LOG_BLOCK_HDR_SIZE))
 +
 +/* Tests if num bit of bitmap is set */
 +#define IS_BIT_SET(bitmap, num) \
 +	(*((bitmap) + ((num) >> 3)) & (1UL << ((num) & 7UL)))
 +
 +/** The bitmap file block size in bytes.  All writes will be multiples of this.
 + */
 +enum {
 +	MODIFIED_PAGE_BLOCK_SIZE = 4096
 +};
 +
 +
 +/** Offsets in a file bitmap block */
 +enum {
 +	MODIFIED_PAGE_IS_LAST_BLOCK = 0,/* 1 if last block in the current
 +					write, 0 otherwise. */
 +	MODIFIED_PAGE_START_LSN = 4,	/* The starting tracked LSN of this and
 +					other blocks in the same write */
 +	MODIFIED_PAGE_END_LSN = 12,	/* The ending tracked LSN of this and
 +					other blocks in the same write */
 +	MODIFIED_PAGE_SPACE_ID = 20,	/* The space ID of tracked pages in
 +					this block */
 +	MODIFIED_PAGE_1ST_PAGE_ID = 24,	/* The page ID of the first tracked
 +					page in this block */
 +	MODIFIED_PAGE_BLOCK_UNUSED_1 = 28,/* Unused in order to align the start
 +					of bitmap at 8 byte boundary */
 +	MODIFIED_PAGE_BLOCK_BITMAP = 32,/* Start of the bitmap itself */
 +	MODIFIED_PAGE_BLOCK_UNUSED_2 = MODIFIED_PAGE_BLOCK_SIZE - 8,
 +					/* Unused in order to align the end of
 +					bitmap at 8 byte boundary */
 +	MODIFIED_PAGE_BLOCK_CHECKSUM = MODIFIED_PAGE_BLOCK_SIZE - 4
 +					/* The checksum of the current block */
 +};
 +
 +/** Length of the bitmap data in a block in bytes */
 +enum { MODIFIED_PAGE_BLOCK_BITMAP_LEN
 +       = MODIFIED_PAGE_BLOCK_UNUSED_2 - MODIFIED_PAGE_BLOCK_BITMAP };
 +
 +/** Length of the bitmap data in a block in page ids */
 +enum { MODIFIED_PAGE_BLOCK_ID_COUNT = MODIFIED_PAGE_BLOCK_BITMAP_LEN * 8 };
 +
 +/****************************************************************//**
 +Provide a comparison function for the RB-tree of (space,
 +block_start_page) pairs.  The actual implementation does not matter as
 +long as the ordering is total.
 +@return -1 if p1 < p2, 0 if p1 == p2, 1 if p1 > p2
 +*/
 +static
 +int
 +log_online_compare_bmp_keys(
 +/*========================*/
 +	const void* p1,	/*!<in: 1st key to compare */
 +	const void* p2)	/*!<in: 2nd key to compare */
 +{
 +	const byte *k1 = (const byte *)p1;
 +	const byte *k2 = (const byte *)p2;
 +
 +	ulint k1_space = mach_read_from_4(k1 + MODIFIED_PAGE_SPACE_ID);
 +	ulint k2_space = mach_read_from_4(k2 + MODIFIED_PAGE_SPACE_ID);
 +	if (k1_space == k2_space) {
 +		ulint k1_start_page
 +			= mach_read_from_4(k1 + MODIFIED_PAGE_1ST_PAGE_ID);
 +		ulint k2_start_page
 +			= mach_read_from_4(k2 + MODIFIED_PAGE_1ST_PAGE_ID);
 +		return k1_start_page < k2_start_page
 +			? -1 : k1_start_page > k2_start_page ? 1 : 0;
 +	}
 +	return k1_space < k2_space ? -1 : 1;
 +}
 +
 +/****************************************************************//**
 +Set a bit for tracked page in the bitmap. Expand the bitmap tree as
 +necessary. */
 +static
 +void
 +log_online_set_page_bit(
 +/*====================*/
 +	ulint	space,	/*!<in: log record space id */
 +	ulint	page_no)/*!<in: log record page id */
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	ut_a(space != ULINT_UNDEFINED);
 +	ut_a(page_no != ULINT_UNDEFINED);
 +
 +	ulint block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT
 +		* MODIFIED_PAGE_BLOCK_ID_COUNT;
 +	ulint block_pos = block_start_page ? (page_no % block_start_page / 8)
 +		: (page_no / 8);
 +	uint bit_pos = page_no % 8;
 +
 +	byte search_page[MODIFIED_PAGE_BLOCK_SIZE];
 +	mach_write_to_4(search_page + MODIFIED_PAGE_SPACE_ID, space);
 +	mach_write_to_4(search_page + MODIFIED_PAGE_1ST_PAGE_ID,
 +			block_start_page);
 +
 +	byte	       *page_ptr;
 +	ib_rbt_bound_t  tree_search_pos;
 +	if (!rbt_search(log_bmp_sys->modified_pages, &tree_search_pos,
 +			search_page)) {
 +		page_ptr = rbt_value(byte, tree_search_pos.last);
 +	}
 +	else {
 +		ib_rbt_node_t *new_node;
 +
 +		if (log_bmp_sys->page_free_list) {
 +			new_node = log_bmp_sys->page_free_list;
 +			log_bmp_sys->page_free_list = new_node->left;
 +		}
 +		else {
 +			new_node = static_cast<ib_rbt_node_t *>
 +				(ut_malloc
 +				 (SIZEOF_NODE(log_bmp_sys->modified_pages)));
 +		}
 +		memset(new_node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages));
 +
 +		page_ptr = rbt_value(byte, new_node);
 +		mach_write_to_4(page_ptr + MODIFIED_PAGE_SPACE_ID, space);
 +		mach_write_to_4(page_ptr + MODIFIED_PAGE_1ST_PAGE_ID,
 +				block_start_page);
 +
 +		rbt_add_preallocated_node(log_bmp_sys->modified_pages,
 +					  &tree_search_pos, new_node);
 +	}
 +	page_ptr[MODIFIED_PAGE_BLOCK_BITMAP + block_pos] |= (1U << bit_pos);
 +}
 +
 +/****************************************************************//**
 +Calculate a bitmap block checksum.  Algorithm borrowed from
 +log_block_calc_checksum.
 +@return checksum */
 +UNIV_INLINE
 +ulint
 +log_online_calc_checksum(
 +/*=====================*/
 +	const byte*	block)	/*!<in: bitmap block */
 +{
 +	ulint	sum;
 +	ulint	sh;
 +	ulint	i;
 +
 +	sum = 1;
 +	sh = 0;
 +
 +	for (i = 0; i < MODIFIED_PAGE_BLOCK_CHECKSUM; i++) {
 +
 +		ulint	b = block[i];
 +		sum &= 0x7FFFFFFFUL;
 +		sum += b;
 +		sum += b << sh;
 +		sh++;
 +		if (sh > 24) {
 +			sh = 0;
 +		}
 +	}
 +
 +	return sum;
 +}
 +
 +/****************************************************************//**
 +Read one bitmap data page and check it for corruption.
 +
 +@return TRUE if page read OK, FALSE if I/O error */
 +static
 +ibool
 +log_online_read_bitmap_page(
 +/*========================*/
 +	log_online_bitmap_file_t	*bitmap_file,	/*!<in/out: bitmap
 +							file */
 +	byte				*page,	       /*!<out: read page.
 +						       Must be at least
 +						       MODIFIED_PAGE_BLOCK_SIZE
 +						       bytes long */
 +	ibool				*checksum_ok)	/*!<out: TRUE if page
 +							checksum OK */
 +{
 +	ulint	checksum;
 +	ulint	actual_checksum;
 +	ibool	success;
 +
 +	ut_a(bitmap_file->size >= MODIFIED_PAGE_BLOCK_SIZE);
 +	ut_a(bitmap_file->offset
 +	     <= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
 +	ut_a(bitmap_file->offset % MODIFIED_PAGE_BLOCK_SIZE == 0);
 +
 +	success = os_file_read(bitmap_file->file, page, bitmap_file->offset,
 +			       MODIFIED_PAGE_BLOCK_SIZE);
 +
 +	if (UNIV_UNLIKELY(!success)) {
 +
 +		/* The following call prints an error message */
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_WARN,
 +			"failed reading changed page bitmap file \'%s\'",
 +			bitmap_file->name);
 +		return FALSE;
 +	}
 +
 +	bitmap_file->offset += MODIFIED_PAGE_BLOCK_SIZE;
 +	ut_ad(bitmap_file->offset <= bitmap_file->size);
 +
 +	checksum = mach_read_from_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM);
 +	actual_checksum = log_online_calc_checksum(page);
 +	*checksum_ok = (checksum == actual_checksum);
 +
 +	return TRUE;
 +}
 +
 +/****************************************************************//**
 +Get the last fully tracked LSN from the bitmap file by reading
 +backwards until a correct end page is found.  Detects incomplete
 +writes and corrupted data.  Sets the start output position for the
 +written bitmap data.
 +
 +Multiple bitmap files are handled using the following assumptions:
 +1) Only the last file might be corrupted.  In case where no good data was found
 +in the last file, assume that the next to last file is OK.  This assumption
 +does not limit crash recovery capability in any way.
 +2) If the whole of the last file was corrupted, assume that the start LSN in
 +its name is correct and use it for (re-)tracking start.
 +
 +@return the last fully tracked LSN */
 +static
 +lsn_t
 +log_online_read_last_tracked_lsn(void)
 +/*==================================*/
 +{
 +	byte		page[MODIFIED_PAGE_BLOCK_SIZE];
 +	ibool		is_last_page	= FALSE;
 +	ibool		checksum_ok	= FALSE;
 +	lsn_t		result;
 +	os_offset_t	read_offset	= log_bmp_sys->out.offset;
 +
 +	while ((!checksum_ok || !is_last_page) && read_offset > 0)
 +	{
 +		read_offset -= MODIFIED_PAGE_BLOCK_SIZE;
 +		log_bmp_sys->out.offset = read_offset;
 +
 +		if (!log_online_read_bitmap_page(&log_bmp_sys->out, page,
 +						 &checksum_ok)) {
 +			checksum_ok = FALSE;
 +			result = 0;
 +			break;
 +		}
 +
 +		if (checksum_ok) {
 +			is_last_page
 +				= mach_read_from_4
 +				(page + MODIFIED_PAGE_IS_LAST_BLOCK);
 +		} else {
 +
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"corruption detected in \'%s\' at offset "
 +				UINT64PF,
 +				log_bmp_sys->out.name, read_offset);
 +		}
 +	};
 +
 +	result = (checksum_ok && is_last_page)
 +		? mach_read_from_8(page + MODIFIED_PAGE_END_LSN) : 0;
 +
 +	/* Truncate the output file to discard the corrupted bitmap data, if
 +	any */
 +	if (!os_file_set_eof_at(log_bmp_sys->out.file,
 +				log_bmp_sys->out.offset)) {
 +		ib_logf(IB_LOG_LEVEL_WARN,
 +			"failed truncating changed page bitmap file \'%s\' to "
 +			UINT64PF " bytes",
 +			log_bmp_sys->out.name, log_bmp_sys->out.offset);
 +		result = 0;
 +	}
 +	return result;
 +}
 +
 +/****************************************************************//**
 +Write the log_sys->tracked_lsn value and follow the store with a write
 +memory barrier (os_wmb).  No mutex is taken here, and callers such as
 +log_online_follow_redo_log() do not hold the log system mutex.  The reader
 +counterpart function is log_get_tracked_lsn() in log0log.c. */
 +UNIV_INLINE
 +void
 +log_set_tracked_lsn(
 +/*================*/
 +	lsn_t	tracked_lsn)	/*!<in: new value */
 +{
 +	log_sys->tracked_lsn = tracked_lsn;
 +	os_wmb;
 +}
 +
 +/*********************************************************************//**
 +Check if the missing LSN interval, if any, can be read and tracked using the
 +current LSN value, the LSN value where the tracking stopped, and the log group
 +capacity.
 +
 +@return TRUE if the missing interval can be tracked or if there's no missing
 +data.  */
 +static
 +ibool
 +log_online_can_track_missing(
 +/*=========================*/
 +	lsn_t	last_tracked_lsn,	/*!<in: last tracked LSN */
 +	lsn_t	tracking_start_lsn)	/*!<in:	current LSN */
 +{
 +	/* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty
 +	bitmap file, handle this too. */
 +	last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN);
 +
 +	if (last_tracked_lsn > tracking_start_lsn) {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"last tracked LSN " LSN_PF " is ahead of tracking "
 +			"start LSN " LSN_PF ".  This can be caused by "
 +			"mismatched bitmap files.",
 +			last_tracked_lsn, tracking_start_lsn);
 +		exit(1);
 +	}
 +
 +	return (last_tracked_lsn == tracking_start_lsn)
 +		|| (log_sys->lsn - last_tracked_lsn
 +		    <= log_sys->log_group_capacity);
 +}
 +
 +
 +/****************************************************************//**
 +Diagnose a gap in tracked LSN range on server startup due to crash or
 +very fast shutdown and try to close it by tracking the data
 +immediately, if possible. */
 +static
 +void
 +log_online_track_missing_on_startup(
 +/*================================*/
 +	lsn_t	last_tracked_lsn,	/*!<in: last tracked LSN read from the
 +					bitmap file */
 +	lsn_t	tracking_start_lsn)	/*!<in: last checkpoint LSN of the
 +					current server startup */
 +{
 +	ut_ad(last_tracked_lsn != tracking_start_lsn);
 +	ut_ad(srv_track_changed_pages);
 +
 +	ib_logf(IB_LOG_LEVEL_WARN, "last tracked LSN in \'%s\' is " LSN_PF
 +		", but the last checkpoint LSN is " LSN_PF ".  This might be "
 +		"due to a server crash or a very fast shutdown.",
 +		log_bmp_sys->out.name, last_tracked_lsn, tracking_start_lsn);
 +
 +	/* See if we can fully recover the missing interval */
 +	if (log_online_can_track_missing(last_tracked_lsn,
 +					 tracking_start_lsn)) {
 +
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"reading the log to advance the last tracked LSN.");
 +
 +		log_bmp_sys->start_lsn = ut_max(last_tracked_lsn,
 +						MIN_TRACKED_LSN);
 +		log_set_tracked_lsn(log_bmp_sys->start_lsn);
 +		if (!log_online_follow_redo_log()) {
 +			exit(1);
 +		}
 +		ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn);
 +
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"continuing tracking changed pages from LSN " LSN_PF,
 +			log_bmp_sys->end_lsn);
 +	}
 +	else {
 +		ib_logf(IB_LOG_LEVEL_WARN,
 +			"the age of last tracked LSN exceeds log capacity, "
 +			"tracking-based incremental backups will work only "
 +			"from the higher LSN!");
 +
 +		log_bmp_sys->end_lsn = log_bmp_sys->start_lsn
 +			= tracking_start_lsn;
 +		log_set_tracked_lsn(log_bmp_sys->start_lsn);
 +
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"starting tracking changed pages from LSN " LSN_PF,
 +			log_bmp_sys->end_lsn);
 +	}
 +}
 +
 +/*********************************************************************//**
 +Format a bitmap output file name to log_bmp_sys->out.name.  */
 +static
 +void
 +log_online_make_bitmap_name(
 +/*=========================*/
 +	lsn_t	start_lsn)	/*!< in: the start LSN name part */
 +{
 +	ut_snprintf(log_bmp_sys->out.name, sizeof(log_bmp_sys->out.name), 
 +            bmp_file_name_template, log_bmp_sys->bmp_file_home,
 +            bmp_file_name_stem, log_bmp_sys->out_seq_num, start_lsn);
 +}
 +
- /*********************************************************************//**
- }
- 
 +/*********************************************************************//**
 +Check if an old file that has the name of a new bitmap file we are about to
 +create should be overwritten.  */
 +static
 +ibool
 +log_online_should_overwrite(
 +/*========================*/
 +	const char	*path)	/*!< in: path to file */
 +{
 +	dberr_t		err;
 +	os_file_stat_t	file_info;
 +
 +	/* Currently, it's OK to overwrite 0-sized files only */
 +	err = os_file_get_status(path, &file_info, false);
 +	return err == DB_SUCCESS && file_info.type == OS_FILE_TYPE_FILE
 +		&& file_info.size == 0LL;
 +}
 +
 +/*********************************************************************//**
 +Create a new empty bitmap output file.
 +
 +@return TRUE if operation succeeded, FALSE if I/O error */
 +static
 +ibool
 +log_online_start_bitmap_file(void)
 +/*==============================*/
 +{
 +	ibool	success	= TRUE;
 +
 +	/* Check for an old file that should be deleted first */
 +	if (log_online_should_overwrite(log_bmp_sys->out.name)) {
 +
 +		success = static_cast<ibool>(
 +			os_file_delete_if_exists(innodb_file_bmp_key,
 +						 log_bmp_sys->out.name));
 +	}
 +
 +	if (UNIV_LIKELY(success)) {
 +		log_bmp_sys->out.file
 +			= os_file_create_simple_no_error_handling(
 +							innodb_file_bmp_key,
 +							log_bmp_sys->out.name,
 +							OS_FILE_CREATE,
 +							OS_FILE_READ_WRITE_CACHED,
 +							&success);
 +	}
 +	if (UNIV_UNLIKELY(!success)) {
 +
 +		/* The following call prints an error message */
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"cannot create \'%s\'", log_bmp_sys->out.name);
 +		return FALSE;
 +	}
 +
 +	log_bmp_sys->out.offset = 0;
 +	return TRUE;
 +}
 +
 +/*********************************************************************//**
 +Close the current bitmap output file and create the next one.
 +
 +@return TRUE if operation succeeded, FALSE if I/O error */
 +static
 +ibool
 +log_online_rotate_bitmap_file(
 +/*===========================*/
 +	lsn_t	next_file_start_lsn)	/*!<in: the start LSN name
 +					part */
 +{
 +	if (!os_file_is_invalid(log_bmp_sys->out.file)) {
 +		os_file_close(log_bmp_sys->out.file);
 +		os_file_mark_invalid(&log_bmp_sys->out.file);
 +	}
 +	log_bmp_sys->out_seq_num++;
 +	log_online_make_bitmap_name(next_file_start_lsn);
 +	return log_online_start_bitmap_file();
 +}
 +
 +/*********************************************************************//**
 +Check the name of a given file if it's a changed page bitmap file and
 +return file sequence and start LSN name components if it is.  If it is not,
 +the values of output parameters are undefined.
 +
 +@return TRUE if a given file is a changed page bitmap file.  */
 +static
 +ibool
 +log_online_is_bitmap_file(
 +/*======================*/
 +	const os_file_stat_t*	file_info,		/*!<in: file to
 +							check */
 +	ulong*			bitmap_file_seq_num,	/*!<out: bitmap file
 +							sequence number */
 +	lsn_t*			bitmap_file_start_lsn)	/*!<out: bitmap file
 +							start LSN */
 +{
 +	char	stem[FN_REFLEN];
 +
 +	ut_ad (strlen(file_info->name) < OS_FILE_MAX_PATH);
 +
 +	return ((file_info->type == OS_FILE_TYPE_FILE
 +		 || file_info->type == OS_FILE_TYPE_LINK)
 +		&& (sscanf(file_info->name, "%[a-z_]%lu_%llu.xdb", stem,
 +			   bitmap_file_seq_num,
 +			   (unsigned long long *)bitmap_file_start_lsn) == 3)
 +		&& (!strcmp(stem, bmp_file_name_stem)));
 +}
 +
 +/** Initialize the constant part of the log tracking subsystem */
 +UNIV_INTERN
 +void
 +log_online_init(void)
 +{
 +	mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys_mutex,
 +		     SYNC_LOG_ONLINE);
 +}
 +
 +/** Initialize the dynamic part of the log tracking subsystem */
 +UNIV_INTERN
 +void
 +log_online_read_init(void)
 +{
 +	ibool	success;
 +	lsn_t	tracking_start_lsn
 +		= ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN);
 +	os_file_dir_t	bitmap_dir;
 +	os_file_stat_t	bitmap_dir_file_info;
 +	lsn_t	last_file_start_lsn	= MIN_TRACKED_LSN;
 +	size_t	srv_data_home_len;
 +
 +	/* Bitmap data start and end in a bitmap block must be 8-byte
 +	aligned. */
 +	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0);
 +	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0);
 +
 +	ut_ad(srv_track_changed_pages);
 +
 +	log_bmp_sys = static_cast<log_bitmap_struct *>
 +		(ut_malloc(sizeof(*log_bmp_sys)));
 +	log_bmp_sys->read_buf_ptr = static_cast<byte *>
 +		(ut_malloc(FOLLOW_SCAN_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 +	log_bmp_sys->read_buf = static_cast<byte *>
 +		(ut_align(log_bmp_sys->read_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
 +
 +	/* Initialize bitmap file directory from srv_data_home and add a path
 +	separator if needed.  */
 +	srv_data_home_len = strlen(srv_data_home);
 +	ut_a (srv_data_home_len < FN_REFLEN);
 +	strcpy(log_bmp_sys->bmp_file_home, srv_data_home);
 +	if (srv_data_home_len
 +	    && log_bmp_sys->bmp_file_home[srv_data_home_len - 1]
 +	    != SRV_PATH_SEPARATOR) {
 +
 +		ut_a (srv_data_home_len < FN_REFLEN - 1);
 +		log_bmp_sys->bmp_file_home[srv_data_home_len]
 +			= SRV_PATH_SEPARATOR;
 +		log_bmp_sys->bmp_file_home[srv_data_home_len + 1] = '\0';
 +	}
 +
 +	/* Enumerate existing bitmap files either to open the last one and get
 +	the last tracked LSN, or to find that there are none and start
 +	tracking from scratch.  */
 +	log_bmp_sys->out.name[0] = '\0';
 +	log_bmp_sys->out_seq_num = 0;
 +
 +	bitmap_dir = os_file_opendir(log_bmp_sys->bmp_file_home, TRUE);
 +	ut_a(bitmap_dir);
 +	while (!os_file_readdir_next_file(log_bmp_sys->bmp_file_home,
 +					  bitmap_dir, &bitmap_dir_file_info)) {
 +
 +		ulong	file_seq_num;
 +		lsn_t	file_start_lsn;
 +
 +		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
 +					      &file_seq_num,
 +					      &file_start_lsn)) {
 +			continue;
 +		}
 +
 +		if (file_seq_num > log_bmp_sys->out_seq_num
 +		    && bitmap_dir_file_info.size > 0) {
 +			log_bmp_sys->out_seq_num = file_seq_num;
 +			last_file_start_lsn = file_start_lsn;
 +			/* No dir component (log_bmp_sys->bmp_file_home) here,
 +			because	that's the cwd */
 +			strncpy(log_bmp_sys->out.name,
 +				bitmap_dir_file_info.name, FN_REFLEN - 1);
 +			log_bmp_sys->out.name[FN_REFLEN - 1] = '\0';
 +		}
 +	}
 +
 +	if (os_file_closedir(bitmap_dir)) {
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
 +			log_bmp_sys->bmp_file_home);
 +		exit(1);
 +	}
 +
 +	if (!log_bmp_sys->out_seq_num) {
 +		log_bmp_sys->out_seq_num = 1;
 +		log_online_make_bitmap_name(0);
 +	}
 +
 +	log_bmp_sys->modified_pages = rbt_create(MODIFIED_PAGE_BLOCK_SIZE,
 +						 log_online_compare_bmp_keys);
 +	log_bmp_sys->page_free_list = NULL;
 +
 +	log_bmp_sys->out.file
 +		= os_file_create_simple_no_error_handling
 +		(innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN,
 +		 OS_FILE_READ_WRITE_CACHED, &success);
 +
 +	if (!success) {
 +
 +		/* New file, tracking from scratch */
 +		if (!log_online_start_bitmap_file()) {
 +			exit(1);
 +		}
 +	}
 +	else {
 +
 +		/* Read the last tracked LSN from the last file */
 +		lsn_t	last_tracked_lsn;
 +		lsn_t	file_start_lsn;
 +
 +		log_bmp_sys->out.size
 +			= os_file_get_size(log_bmp_sys->out.file);
 +		log_bmp_sys->out.offset	= log_bmp_sys->out.size;
 +
 +		if (log_bmp_sys->out.offset % MODIFIED_PAGE_BLOCK_SIZE != 0) {
 +
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"truncated block detected in \'%s\' at offset "
 +				UINT64PF,
 +				log_bmp_sys->out.name,
 +				log_bmp_sys->out.offset);
 +			log_bmp_sys->out.offset -=
 +				log_bmp_sys->out.offset
 +				% MODIFIED_PAGE_BLOCK_SIZE;
 +		}
 +
 +		last_tracked_lsn = log_online_read_last_tracked_lsn();
 +		/* Do not rotate if we truncated the file to zero length - we
 +		can just start writing there */
 +		const bool need_rotate = (last_tracked_lsn != 0);
 +		if (!last_tracked_lsn) {
 +
 +			last_tracked_lsn = last_file_start_lsn;
 +		}
 +
 +		/* Start a new file.  Choose the LSN value in its name based on
 +		if we can retrack any missing data. */
 +		if (log_online_can_track_missing(last_tracked_lsn,
 +						 tracking_start_lsn)) {
 +			file_start_lsn = last_tracked_lsn;
 +		} else {
 +			file_start_lsn = tracking_start_lsn;
 +		}
 +
 +		if (need_rotate
 +		    && !log_online_rotate_bitmap_file(file_start_lsn)) {
 +
 +			exit(1);
 +		}
 +
 +		if (last_tracked_lsn < tracking_start_lsn) {
 +
 +			log_online_track_missing_on_startup
 +				(last_tracked_lsn, tracking_start_lsn);
 +			return;
 +		}
 +
 +		if (last_tracked_lsn > tracking_start_lsn) {
 +
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"last tracked LSN is " LSN_PF ", but the last "
 +				"checkpoint LSN is " LSN_PF ". The "
 +				"tracking-based incremental backups will work "
 +				"only from the latter LSN!",
 +				last_tracked_lsn, tracking_start_lsn);
 +		}
 +
 +	}
 +
 +	ib_logf(IB_LOG_LEVEL_INFO, "starting tracking changed pages from LSN "
 +		LSN_PF, tracking_start_lsn);
 +	log_bmp_sys->start_lsn = tracking_start_lsn;
 +	log_set_tracked_lsn(tracking_start_lsn);
 +}
 +
 +/** Shut down the dynamic part of the log tracking subsystem */
 +UNIV_INTERN
 +void
 +log_online_read_shutdown(void)
 +{
 +	mutex_enter(&log_bmp_sys_mutex);
 +
 +	srv_track_changed_pages = FALSE;
 +
 +	ib_rbt_node_t *free_list_node = log_bmp_sys->page_free_list;
 +
 +	if (!os_file_is_invalid(log_bmp_sys->out.file)) {
 +		os_file_close(log_bmp_sys->out.file);
 +		os_file_mark_invalid(&log_bmp_sys->out.file);
 +	}
 +
 +	rbt_free(log_bmp_sys->modified_pages);
 +
 +	while (free_list_node) {
 +		ib_rbt_node_t *next = free_list_node->left;
 +		ut_free(free_list_node);
 +		free_list_node = next;
 +	}
 +
 +	ut_free(log_bmp_sys->read_buf_ptr);
 +	ut_free(log_bmp_sys);
 +	log_bmp_sys = NULL;
 +
 +	srv_redo_log_thread_started = false;
 +
 +	mutex_exit(&log_bmp_sys_mutex);
 +}
 +
 +/** Shut down the constant part of the log tracking subsystem */
 +UNIV_INTERN
 +void
 +log_online_shutdown(void)
 +{
 +	mutex_free(&log_bmp_sys_mutex);
 +}
 +
 +/*********************************************************************//**
 +For the given minilog record type determine if the record has (space; page)
 +associated with it.
 +@return TRUE if the record has (space; page) in it */
 +static
 +ibool
 +log_online_rec_has_page(
 +/*====================*/
 +	byte	type)	/*!<in: the minilog record type */
 +{
 +	return type != MLOG_MULTI_REC_END && type != MLOG_DUMMY_RECORD;
 +}
 +
 +/*********************************************************************//**
 +Check if a page field for a given log record type actually contains a page
 +id. It does not for file operations and MLOG_LSN.
 +@return TRUE if page field contains actual page id, FALSE otherwise */
 +static
 +ibool
 +log_online_rec_page_means_page(
 +/*===========================*/
 +	byte	type)	/*!<in: log record type */
 +{
 +	return log_online_rec_has_page(type)
 +#ifdef UNIV_LOG_LSN_DEBUG
 +		&& type != MLOG_LSN
 +#endif
 +		&& type != MLOG_FILE_CREATE
 +		&& type != MLOG_FILE_RENAME
 +		&& type != MLOG_FILE_DELETE
 +		&& type != MLOG_FILE_CREATE2;
 +}
 +
 +/*********************************************************************//**
 +Parse the log data in the parse buffer for the (space, page) pairs and add
 +them to the modified page set as necessary.  Removes the fully-parsed records
 +from the buffer.  If an incomplete record is found, moves it to the end of the
 +buffer. */
 +static
 +void
 +log_online_parse_redo_log(void)
 +/*===========================*/
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	byte *ptr = log_bmp_sys->parse_buf;
 +	byte *end = log_bmp_sys->parse_buf_end;
 +	ulint len = 0;
 +
 +	while (ptr != end
 +	       && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
 +
 +		byte	type;
 +		ulint	space;
 +		ulint	page_no;
 +		byte*	body;
 +
 +		/* recv_sys is not initialized, so on corrupt log we will
 +		SIGSEGV.  But the log of a live database should not be
 +		corrupt. */
 +		len = recv_parse_log_rec(ptr, end, &type, &space, &page_no,
 +					 &body);
 +		if (len > 0) {
 +
 +			if (log_online_rec_page_means_page(type)) {
 +
 +				ut_a(len >= 3);
 +				log_online_set_page_bit(space, page_no);
 +			}
 +
 +			ptr += len;
 +			ut_ad(ptr <= end);
 +			log_bmp_sys->next_parse_lsn
 +			    = recv_calc_lsn_on_data_add
 +				(log_bmp_sys->next_parse_lsn, len);
 +		}
 +		else {
 +
 +			/* Incomplete log record.  Shift it to the
 +			beginning of the parse buffer and leave it to be
 +			completed on the next read.  */
 +			ut_memmove(log_bmp_sys->parse_buf, ptr, end - ptr);
 +			log_bmp_sys->parse_buf_end
 +				= log_bmp_sys->parse_buf + (end - ptr);
 +			ptr = end;
 +		}
 +	}
 +
 +	if (len > 0) {
 +
 +		log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
 +	}
 +}
 +
 +/*********************************************************************//**
 +Check the log block checksum.
 +@return TRUE if the log block checksum is OK, FALSE otherwise.  */
 +static
 +ibool
 +log_online_is_valid_log_seg(
 +/*========================*/
 +	const byte* log_block)	/*!< in: read log data */
 +{
 +	ibool checksum_is_ok
 +		= log_block_checksum_is_ok_or_old_format(log_block);
 +
 +	if (!checksum_is_ok) {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"log block checksum mismatch: expected " ULINTPF ", "
 +			"calculated checksum " ULINTPF,
 +			log_block_get_checksum(log_block),
 +			log_block_calc_checksum(log_block));
 +	}
 +
 +	return checksum_is_ok;
 +}
 +
 +/*********************************************************************//**
 +Copy new log data to the parse buffer while skipping log block header,
 +trailer and already parsed data.  */
 +static
 +void
 +log_online_add_to_parse_buf(
 +/*========================*/
 +	const byte*	log_block,	/*!< in: read log data */
 +	ulint		data_len,	/*!< in: length of read log data */
 +	ulint		skip_len)	/*!< in: how much of log data to
 +					skip */
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	ulint start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE;
 +	ulint end_offset
 +		= (data_len == OS_FILE_LOG_BLOCK_SIZE)
 +		? data_len - LOG_BLOCK_TRL_SIZE
 +		: data_len;
 +	ulint actual_data_len = (end_offset >= start_offset)
 +		? end_offset - start_offset : 0;
 +
 +	ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset,
 +		  actual_data_len);
 +
 +	log_bmp_sys->parse_buf_end += actual_data_len;
 +
 +	ut_a(log_bmp_sys->parse_buf_end - log_bmp_sys->parse_buf
 +	     <= RECV_PARSING_BUF_SIZE);
 +}
 +
 +/*********************************************************************//**
 +Parse the log block: first copies the read log data to the parse buffer while
 +skipping log block header, trailer and already parsed data.  Then it actually
 +parses the log to add to the modified page bitmap. */
 +static
 +void
 +log_online_parse_redo_log_block(
 +/*============================*/
 +	const byte*	log_block,		  /*!< in: read log data */
 +	ulint		skip_already_parsed_len)  /*!< in: how many bytes of
 +						  log data should be skipped as
 +						  they were parsed before */
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	ulint block_data_len = log_block_get_data_len(log_block);
 +
 +	ut_ad(block_data_len % OS_FILE_LOG_BLOCK_SIZE == 0
 +	      || block_data_len < OS_FILE_LOG_BLOCK_SIZE);
 +
 +	log_online_add_to_parse_buf(log_block, block_data_len,
 +				    skip_already_parsed_len);
 +	log_online_parse_redo_log();
 +}
 +
 +/*********************************************************************//**
 +Read and parse one redo log chunk and updates the modified page bitmap. */
 +static
 +void
 +log_online_follow_log_seg(
 +/*======================*/
 +	log_group_t*	group,		       /*!< in: the log group to use */
 +	lsn_t		block_start_lsn,       /*!< in: the LSN to read from */
 +	lsn_t		block_end_lsn)	       /*!< in: the LSN to read to */
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	/* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log
 +	data to parse */
 +	byte* log_block = log_bmp_sys->read_buf;
 +	byte* log_block_end = log_bmp_sys->read_buf
 +		+ (block_end_lsn - block_start_lsn);
 +
 +	mutex_enter(&log_sys->mutex);
 +	log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf,
 +			       group, block_start_lsn, block_end_lsn, TRUE);
 +	/* log_group_read_log_seg will release the log_sys->mutex for us */
 +
 +	while (log_block < log_block_end
 +	       && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
 +
 +		/* How many bytes of log data should we skip in the current log
 +		block.  Skipping is necessary because we round down the next
 +		parse LSN thus it is possible to read the already-processed log
 +		data many times */
 +		ulint skip_already_parsed_len = 0;
 +
 +		if (!log_online_is_valid_log_seg(log_block)) {
 +			break;
 +		}
 +
 +		if ((block_start_lsn <= log_bmp_sys->next_parse_lsn)
 +		    && (block_start_lsn + OS_FILE_LOG_BLOCK_SIZE
 +			> log_bmp_sys->next_parse_lsn)) {
 +
 +			/* The next parse LSN is inside the current block, skip
 +			data preceding it. */
 +			skip_already_parsed_len
 +				= (ulint)(log_bmp_sys->next_parse_lsn
 +					  - block_start_lsn);
 +		}
 +		else {
 +
 +			/* If the next parse LSN is not inside the current
 +			block, then the only option is that we have processed
 +			ahead already. */
 +			ut_a(block_start_lsn > log_bmp_sys->next_parse_lsn);
 +		}
 +
 +		/* TODO: merge the copying to the parse buf code with
 +		skip_already_len calculations */
 +		log_online_parse_redo_log_block(log_block,
 +						skip_already_parsed_len);
 +
 +		log_block += OS_FILE_LOG_BLOCK_SIZE;
 +		block_start_lsn += OS_FILE_LOG_BLOCK_SIZE;
 +	}
 +
 +	return;
 +}
 +
 +/*********************************************************************//**
 +Read and parse the redo log in a given group in FOLLOW_SCAN_SIZE-sized
 +chunks and updates the modified page bitmap. */
 +static
 +void
 +log_online_follow_log_group(
 +/*========================*/
 +	log_group_t*	group,		/*!< in: the log group to use */
 +	lsn_t		contiguous_lsn)	/*!< in: the LSN of log block start
 +					containing the log_parse_start_lsn */
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	lsn_t	block_start_lsn = contiguous_lsn;
 +	lsn_t	block_end_lsn;
 +
 +	log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn;
 +	log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
 +
 +	do {
 +		block_end_lsn = block_start_lsn + FOLLOW_SCAN_SIZE;
 +
 +		log_online_follow_log_seg(group, block_start_lsn,
 +					  block_end_lsn);
 +
 +		/* Next parse LSN can become higher than the last read LSN
 +		only in the case when the read LSN falls right on the block
 +		boundary, in which case next parse lsn is bumped to the actual
 +		data LSN on the next (not yet read) block.  This assert is
 +		slightly conservative.  */
 +		ut_a(log_bmp_sys->next_parse_lsn
 +		     <= block_end_lsn + LOG_BLOCK_HDR_SIZE
 +		     + LOG_BLOCK_TRL_SIZE);
 +
 +		block_start_lsn = block_end_lsn;
 +	} while (block_end_lsn < log_bmp_sys->end_lsn);
 +
 +	/* Assert that the last read log record is a full one */
 +	ut_a(log_bmp_sys->parse_buf_end == log_bmp_sys->parse_buf);
 +}
 +
 +/*********************************************************************//**
 +Write one bitmap block at log_bmp_sys->out.offset, flush the file and, if
 +both succeeded, advance the output position by MODIFIED_PAGE_BLOCK_SIZE.
 +The caller must hold log_bmp_sys_mutex.
 +@return TRUE if page written OK, FALSE if I/O error */
 +static
 +ibool
 +log_online_write_bitmap_page(
 +/*=========================*/
 +	const byte *block)	/*!< in: block to write */
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	/* Simulate a write error */
 +	DBUG_EXECUTE_IF("bitmap_page_write_error",
 +			{
 +				ulint space_id
 +					= mach_read_from_4(block
 +					+ MODIFIED_PAGE_SPACE_ID);
 +				if (space_id > 0) {
 +					ib_logf(IB_LOG_LEVEL_ERROR,
 +						"simulating bitmap write "
 +						"error in "
 +						"log_online_write_bitmap_page "
 +						"for space ID " ULINTPF,
 +						space_id);
 +					return FALSE;
 +				}
 +			});
 +
 +	/* A crash injection site that ensures last checkpoint LSN > last
 +	tracked LSN, so that LSN tracking for this interval is tested. */
 +	DBUG_EXECUTE_IF("crash_before_bitmap_write",
 +			{
 +				ulint space_id
 +					= mach_read_from_4(block
 +						+ MODIFIED_PAGE_SPACE_ID);
 +				if (space_id > 0)
 +					DBUG_SUICIDE();
 +			});
 +
 +	/* Write the block at the current output position */
 +	ibool success = os_file_write(log_bmp_sys->out.name,
 +				log_bmp_sys->out.file, block,
 +				log_bmp_sys->out.offset,
 +				MODIFIED_PAGE_BLOCK_SIZE);
 +	if (UNIV_UNLIKELY(!success)) {
 +
 +		/* The following call prints an error message */
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_ERROR, "failed writing changed page "
 +			"bitmap file \'%s\'", log_bmp_sys->out.name);
 +		return FALSE;
 +	}
 +
 +	success = os_file_flush(log_bmp_sys->out.file);
 +	if (UNIV_UNLIKELY(!success)) {
 +
 +		/* The following call prints an error message */
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_ERROR, "failed flushing changed page "
 +			"bitmap file \'%s\'",	log_bmp_sys->out.name);
 +		return FALSE;
 +	}
 +
 +	os_file_advise(log_bmp_sys->out.file, log_bmp_sys->out.offset,
 +		       MODIFIED_PAGE_BLOCK_SIZE, OS_FILE_ADVISE_DONTNEED);
 +
 +	log_bmp_sys->out.offset += MODIFIED_PAGE_BLOCK_SIZE;
 +	return TRUE;
 +}
 +
 +/*********************************************************************//**
 +Append the current changed page bitmap to the bitmap file.  Clears the
 +bitmap tree and recycles its nodes to the free list.
 +
 +@return TRUE if bitmap written OK, FALSE if I/O error*/
 +static
 +ibool
 +log_online_write_bitmap(void)
 +/*=========================*/
 +{
 +	ut_ad(mutex_own(&log_bmp_sys_mutex));
 +
 +	if (log_bmp_sys->out.offset >= srv_max_bitmap_file_size) {
 +		if (!log_online_rotate_bitmap_file(log_bmp_sys->start_lsn)) {
 +			return FALSE;
 +		}
 +	}
 +
 +	ib_rbt_node_t *bmp_tree_node
 +		= (ib_rbt_node_t *)rbt_first(log_bmp_sys->modified_pages);
 +	const ib_rbt_node_t * const last_bmp_tree_node
 +		= rbt_last(log_bmp_sys->modified_pages);
 +
 +	ibool success = TRUE;
 +
 +	while (bmp_tree_node) {
 +
 +		byte *page = rbt_value(byte, bmp_tree_node);
 +
 +		/* In case of a bitmap page write error keep on looping over
 +		the tree to reclaim its memory through the free list instead of
 +		returning immediately. */
 +		if (UNIV_LIKELY(success)) {
 +			if (bmp_tree_node == last_bmp_tree_node) {
 +				mach_write_to_4(page
 +						+ MODIFIED_PAGE_IS_LAST_BLOCK,
 +						1);
 +			}
 +
 +			mach_write_to_8(page + MODIFIED_PAGE_START_LSN,
 +				       log_bmp_sys->start_lsn);
 +			mach_write_to_8(page + MODIFIED_PAGE_END_LSN,
 +				       log_bmp_sys->end_lsn);
 +			mach_write_to_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM,
 +					log_online_calc_checksum(page));
 +
 +			success = log_online_write_bitmap_page(page);
 +		}
 +
 +		bmp_tree_node->left = log_bmp_sys->page_free_list;
 +		log_bmp_sys->page_free_list = bmp_tree_node;
 +
 +		bmp_tree_node = (ib_rbt_node_t*)
 +			rbt_next(log_bmp_sys->modified_pages, bmp_tree_node);
 +
 +		DBUG_EXECUTE_IF("bitmap_page_2_write_error",
 +				if (bmp_tree_node)
 +				{
 +					DBUG_SET("+d,bitmap_page_write_error");
 +					DBUG_SET("-d,bitmap_page_2_write_error");
 +				});
 +	}
 +
 +	rbt_reset(log_bmp_sys->modified_pages);
 +	return success;
 +}
 +
 +/*********************************************************************//**
 +Read and parse the redo log up to last checkpoint LSN to build the changed
 +page bitmap which is then written to disk.
 +
 +@return TRUE if log tracking succeeded, FALSE if bitmap write I/O error */
 +UNIV_INTERN
 +ibool
 +log_online_follow_redo_log(void)
 +/*============================*/
 +{
 +	lsn_t		contiguous_start_lsn;
 +	log_group_t*	group;
 +	ibool		result;
 +
 +	ut_ad(!srv_read_only_mode);
 +
 +	if (!srv_track_changed_pages)
 +		return TRUE;
 +
 +	DEBUG_SYNC_C("log_online_follow_redo_log");
 +
 +	mutex_enter(&log_bmp_sys_mutex);
 +
 +	if (!srv_track_changed_pages) {
 +		mutex_exit(&log_bmp_sys_mutex);
 +		return TRUE;
 +	}
 +
 +	/* Grab the LSN of the last checkpoint, we will parse up to it */
 +	mutex_enter(&(log_sys->mutex));
 +	log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn;
 +	mutex_exit(&(log_sys->mutex));
 +
 +	if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) {
 +		mutex_exit(&log_bmp_sys_mutex);
 +		return TRUE;
 +	}
 +
 +	group = UT_LIST_GET_FIRST(log_sys->log_groups);
 +	ut_a(group);
 +
 +	contiguous_start_lsn = ut_uint64_align_down(log_bmp_sys->start_lsn,
 +						    OS_FILE_LOG_BLOCK_SIZE);
 +
 +	while (group) {
 +		log_online_follow_log_group(group, contiguous_start_lsn);
 +		group = UT_LIST_GET_NEXT(log_groups, group);
 +	}
 +
 +	result = log_online_write_bitmap();
 +	log_bmp_sys->start_lsn = log_bmp_sys->end_lsn;
 +	log_set_tracked_lsn(log_bmp_sys->start_lsn);
 +
 +	mutex_exit(&log_bmp_sys_mutex);
 +	return result;
 +}
 +
 +/*********************************************************************//**
 +Diagnose a bitmap file range setup failure: log a warning, free the
 +partially-initialized bitmap_files->files array and reset it to NULL.  */
 +UNIV_COLD
 +static
 +void
 +log_online_diagnose_inconsistent_dir(
 +/*=================================*/
 +	log_online_bitmap_file_range_t	*bitmap_files)	/*!<in/out: bitmap file
 +							range */
 +{
 +	ib_logf(IB_LOG_LEVEL_WARN,
 +		"inconsistent bitmap file directory for a "
 +		"INFORMATION_SCHEMA.INNODB_CHANGED_PAGES query");
 +	free(bitmap_files->files);
 +	bitmap_files->files = NULL;
 +}
 +
 +/*********************************************************************//**
 +List the bitmap files in srv_data_home and setup their range that contains the
 +specified LSN interval.  This range, if non-empty, will start with a file that
 +has the greatest LSN equal to or less than the start LSN and will include all
 +the files up to the one with the greatest LSN less than the end LSN.  Caller
 +must free bitmap_files->files when done if bitmap_files set to non-NULL and
 +this function returned TRUE.  Field bitmap_files->count might be set to a
 +larger value than the actual count of the files, and space for the unused array
 +slots will be allocated but cleared to zeroes.
 +
 +On failure any array allocated here has already been released and
 +bitmap_files->files is left NULL.
 +
 +@return TRUE if succeeded
 +*/
 +static
 +ibool
 +log_online_setup_bitmap_file_range(
 +/*===============================*/
 +	log_online_bitmap_file_range_t	*bitmap_files,	/*!<in/out: bitmap file
 +							range */
 +	lsn_t				range_start,	/*!<in: start LSN */
 +	lsn_t				range_end)	/*!<in: end LSN */
 +{
 +	os_file_dir_t	bitmap_dir;
 +	os_file_stat_t	bitmap_dir_file_info;
 +	ulong		first_file_seq_num	= ULONG_MAX;
 +	ulong		last_file_seq_num	= 0;
 +	lsn_t		first_file_start_lsn	= LSN_MAX;
 +
 +	ut_ad(range_end >= range_start);
 +
 +	bitmap_files->count = 0;
 +	bitmap_files->files = NULL;
 +
 +	/* 1st pass: size the info array */
 +
 +	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
 +	if (UNIV_UNLIKELY(!bitmap_dir)) {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"failed to open bitmap directory \'%s\'",
 +			srv_data_home);
 +		return FALSE;
 +	}
 +
 +	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
 +					  &bitmap_dir_file_info)) {
 +
 +		ulong	file_seq_num;
 +		lsn_t	file_start_lsn;
 +
 +		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
 +					       &file_seq_num,
 +					       &file_start_lsn)
 +		    || file_start_lsn >= range_end) {
 +
 +			continue;
 +		}
 +
 +		if (file_seq_num > last_file_seq_num) {
 +
 +			last_file_seq_num = file_seq_num;
 +		}
 +
 +		if (file_start_lsn >= range_start
 +		    || file_start_lsn == first_file_start_lsn
 +		    || first_file_start_lsn > range_start) {
 +
 +			/* A file that falls into the range */
 +
 +			if (file_start_lsn < first_file_start_lsn) {
 +
 +				first_file_start_lsn = file_start_lsn;
 +			}
 +			if (file_seq_num < first_file_seq_num) {
 +
 +				first_file_seq_num = file_seq_num;
 +			}
 +		} else if (file_start_lsn > first_file_start_lsn) {
 +
 +			/* A file that has LSN closer to the range start
 +			but smaller than it, replacing another such file */
 +			first_file_start_lsn = file_start_lsn;
 +			first_file_seq_num = file_seq_num;
 +		}
 +	}
 +
 +	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
 +
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
 +			srv_data_home);
 +		return FALSE;
 +	}
 +
 +	if (first_file_seq_num == ULONG_MAX && last_file_seq_num == 0) {
 +
 +		bitmap_files->count = 0;
 +		return TRUE;
 +	}
 +
 +	if (UNIV_UNLIKELY(last_file_seq_num < first_file_seq_num)) {
 +
 +		/* Sequence numbers do not grow together with the start LSNs.
 +		Computing the count below would wrap around the unsigned type
 +		and request an absurdly large allocation, so treat this as an
 +		inconsistent directory instead. */
 +		log_online_diagnose_inconsistent_dir(bitmap_files);
 +		bitmap_files->files = NULL;
 +		return FALSE;
 +	}
 +
 +	bitmap_files->count = last_file_seq_num - first_file_seq_num + 1;
 +
 +	DEBUG_SYNC_C("setup_bitmap_range_middle");
 +
 +	/* 2nd pass: get the file names in the file_seq_num order */
 +
 +	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
 +	if (UNIV_UNLIKELY(!bitmap_dir)) {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"failed to open bitmap directory \'%s\'",
 +			srv_data_home);
 +		return FALSE;
 +	}
 +
 +	bitmap_files->files
 +		= static_cast<log_online_bitmap_file_range_struct::files_t *>
 +		(ut_malloc(bitmap_files->count
 +			   * sizeof(bitmap_files->files[0])));
 +	memset(bitmap_files->files, 0,
 +	       bitmap_files->count * sizeof(bitmap_files->files[0]));
 +
 +	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
 +					  &bitmap_dir_file_info)) {
 +
 +		ulong	file_seq_num;
 +		lsn_t	file_start_lsn;
 +		size_t	array_pos;
 +
 +		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
 +					       &file_seq_num,
 +					       &file_start_lsn)
 +		    || file_start_lsn >= range_end
 +		    || file_start_lsn < first_file_start_lsn) {
 +
 +			continue;
 +		}
 +
 +		/* The directory may have changed between the two passes, so
 +		the slot index has to be validated against the array size */
 +		array_pos = file_seq_num - first_file_seq_num;
 +		if (UNIV_UNLIKELY(array_pos >= bitmap_files->count)) {
 +
 +			/* The diagnose call releases the array */
 +			log_online_diagnose_inconsistent_dir(bitmap_files);
 +			bitmap_files->files = NULL;
 +			os_file_closedir(bitmap_dir);
 +			return FALSE;
 +		}
 +
 +
 +		if (file_seq_num > bitmap_files->files[array_pos].seq_num) {
 +
 +			bitmap_files->files[array_pos].seq_num = file_seq_num;
 +			strncpy(bitmap_files->files[array_pos].name,
 +				bitmap_dir_file_info.name, FN_REFLEN);
 +			bitmap_files->files[array_pos].name[FN_REFLEN - 1]
 +				= '\0';
 +			bitmap_files->files[array_pos].start_lsn
 +				= file_start_lsn;
 +		}
 +	}
 +
 +	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
 +
 +		os_file_get_last_error(TRUE);
 +		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
 +			srv_data_home);
 +		/* Allocated with ut_malloc(), so release with ut_free() */
 +		ut_free(bitmap_files->files);
 +		bitmap_files->files = NULL;
 +		return FALSE;
 +	}
 +
 +	if (!bitmap_files->files[0].seq_num
 +	    || bitmap_files->files[0].seq_num != first_file_seq_num) {
 +
 +		log_online_diagnose_inconsistent_dir(bitmap_files);
 +		bitmap_files->files = NULL;
 +		return FALSE;
 +	}
 +
 +	{
 +		size_t i;
 +		for (i = 1; i < bitmap_files->count; i++) {
 +			if (!bitmap_files->files[i].seq_num) {
 +				/* First unused (zeroed) slot: stop checking */
 +				break;
 +			}
 +			if ((bitmap_files->files[i].seq_num
 +			      <= bitmap_files->files[i - 1].seq_num)
 +			    || (bitmap_files->files[i].start_lsn
 +				< bitmap_files->files[i - 1].start_lsn)) {
 +
 +				log_online_diagnose_inconsistent_dir(
 +								bitmap_files);
 +				bitmap_files->files = NULL;
 +				return FALSE;
 +			}
 +		}
 +	}
 +
 +	return TRUE;
 +}
 +
 +/****************************************************************//**
 +Open a bitmap file for reading.  The full path is built from srv_data_home
 +and the given name, the file size is recorded and the read offset is reset
 +to the start of the file.
 +
 +@return TRUE if opened successfully */
 +static
 +ibool
 +log_online_open_bitmap_file_read_only(
 +/*==================================*/
 +	const char*			name,		/*!<in: bitmap file
 +							name without directory,
 +							which is assumed to be
 +							srv_data_home */
 +	log_online_bitmap_file_t*	bitmap_file)	/*!<out: opened bitmap
 +							file */
 +{
 +	ibool	success	= FALSE;
 +
 +	ut_ad(name[0] != '\0');
 +
 +	const size_t	home_len = strlen(srv_data_home);
 +
 +	/* A separator is inserted only if the data home is non-empty and does
 +	not already end with one */
 +	const bool	add_separator
 +		= home_len > 0
 +		&& srv_data_home[home_len - 1] != SRV_PATH_SEPARATOR;
 +
 +	if (add_separator) {
 +		ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%c%s",
 +			    srv_data_home, SRV_PATH_SEPARATOR, name);
 +	} else {
 +		ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%s",
 +			    srv_data_home, name);
 +	}
 +
 +	bitmap_file->file = os_file_create_simple_no_error_handling(
 +		innodb_file_bmp_key, bitmap_file->name,
 +		OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
 +
 +	if (UNIV_LIKELY(success)) {
 +
 +		bitmap_file->size = os_file_get_size(bitmap_file->file);
 +		bitmap_file->offset = 0;
 +
 +		os_file_advise(bitmap_file->file, 0, 0,
 +			       OS_FILE_ADVISE_SEQUENTIAL);
 +		os_file_advise(bitmap_file->file, 0, 0,
 +			       OS_FILE_ADVISE_NOREUSE);
 +
 +		return(TRUE);
 +	}
 +
 +	/* Here and below assume that bitmap file names do not
 +	contain apostrophes, thus no need for ut_print_filename(). */
 +	ib_logf(IB_LOG_LEVEL_WARN,
 +		"error opening the changed page bitmap \'%s\'",
 +		bitmap_file->name);
 +
 +	return(FALSE);
 +}
 +
 +/****************************************************************//**
 +Check the state of a bitmap file whose read offset is close to its end and
 +report either or both of these conditions:
 +1) Warning: the remainder of the file is shorter than one page (junk).
 +2) Error: no further full page can be read, yet the last page read did not
 +have the last-in-run flag set.
 +
 +@return FALSE for the error */
 +static
 +ibool
 +log_online_diagnose_bitmap_eof(
 +/*===========================*/
 +	const log_online_bitmap_file_t*	bitmap_file,	/*!< in: bitmap file */
 +	ibool				last_page_in_run)/*!< in: "last page in
 +							run" flag value in the
 +							last read page */
 +{
 +	/* The size test comes first so that the subtraction in the second
 +	operand is only evaluated when it cannot wrap around */
 +	const bool	full_page_left
 +		= (bitmap_file->size >= MODIFIED_PAGE_BLOCK_SIZE)
 +		&& (bitmap_file->offset
 +		    <= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
 +
 +	if (full_page_left) {
 +
 +		/* Not close enough to EOF for anything to be diagnosed */
 +		return(TRUE);
 +	}
 +
 +	if (UNIV_UNLIKELY(bitmap_file->offset != bitmap_file->size)) {
 +
 +		/* Not at EOF, but less than one page remains: the rest is
 +		junk.  This alone is not a fatal condition. */
 +
 +		ib_logf(IB_LOG_LEVEL_WARN,
 +			"junk at the end of changed page bitmap file "
 +			"\'%s\'.", bitmap_file->name);
 +	}
 +
 +	if (UNIV_LIKELY(last_page_in_run)) {
 +
 +		return(TRUE);
 +	}
 +
 +	/* No more pages can be read, but the last page read did not complete
 +	its run.  Logged as a warning because it is not a fatal error for the
 +	whole server. */
 +	ib_logf(IB_LOG_LEVEL_WARN,
 +		"changed page bitmap file \'%s\', size "
 +		UINT64PF " bytes, does not "
 +		"contain a complete run at the next read "
 +		"offset " UINT64PF,
 +		bitmap_file->name, bitmap_file->size,
 +		bitmap_file->offset);
 +
 +	return(FALSE);
 +}
 +
 +/*********************************************************************//**
 +Initialize the log bitmap iterator for a given range.  The records are
 +processed at a bitmap block granularity, i.e. all the records in the same block
 +share the same start and end LSN values, the exact LSN of each record is
 +unavailable (nor is it defined for blocks that are touched more than once in
 +the LSN interval contained in the block).  Thus min_lsn and max_lsn should be
 +set at block boundaries or bigger, otherwise the records at the 1st and the
 +last blocks will not be returned.  Also note that there might be returned
 +records with LSN < min_lsn, as min_lsn is used to select the correct starting
 +file but not block.
 +
 +On failure the iterator owns no resources: the file list and page pointers are
 +NULL, the file list count is zero and the file handle is marked invalid, so
 +passing it to log_online_bitmap_iterator_release() afterwards is harmless.
 +
 +@return TRUE if the iterator is initialized OK, FALSE otherwise. */
 +UNIV_INTERN
 +ibool
 +log_online_bitmap_iterator_init(
 +/*============================*/
 +	log_bitmap_iterator_t	*i,	/*!<in/out:  iterator */
 +	lsn_t			min_lsn,/*!< in: start LSN */
 +	lsn_t			max_lsn)/*!< in: end LSN */
 +{
 +	ut_a(i);
 +
 +	i->max_lsn = max_lsn;
 +
 +	if (UNIV_UNLIKELY(min_lsn > max_lsn)) {
 +
 +		/* Empty range */
 +		i->in_files.count = 0;
 +		i->in_files.files = NULL;
 +		os_file_mark_invalid(&i->in.file);
 +		i->page = NULL;
 +		i->failed = FALSE;
 +		return TRUE;
 +	}
 +
 +	if (!log_online_setup_bitmap_file_range(&i->in_files, min_lsn,
 +		max_lsn)) {
 +
 +		/* The range setup releases whatever it allocated before
 +		reporting failure, so only the stale state is cleared here */
 +		i->in_files.count = 0;
 +		i->in_files.files = NULL;
 +		i->in_i = 0;
 +		os_file_mark_invalid(&i->in.file);
 +		i->page = NULL;
 +		i->failed = TRUE;
 +		return FALSE;
 +	}
 +
 +	i->in_i = 0;
 +
 +	if (i->in_files.count == 0) {
 +
 +		/* Empty range */
 +		os_file_mark_invalid(&i->in.file);
 +		i->page = NULL;
 +		i->failed = FALSE;
 +		return TRUE;
 +	}
 +
 +	/* Open the 1st bitmap file */
 +	if (UNIV_UNLIKELY(!log_online_open_bitmap_file_read_only(
 +				i->in_files.files[i->in_i].name,
 +				&i->in))) {
 +
 +		/* The file list comes from ut_malloc(), so it is released
 +		with ut_free() */
 +		ut_free(i->in_files.files);
 +		i->in_files.files = NULL;
 +		i->in_files.count = 0;
 +		i->in_i = i->in_files.count;
 +		os_file_mark_invalid(&i->in.file);
 +		i->page = NULL;
 +		i->failed = TRUE;
 +		return FALSE;
 +	}
 +
 +	i->page = static_cast<byte *>(ut_malloc(MODIFIED_PAGE_BLOCK_SIZE));
 +	/* A bit offset at the end of the bitmap makes the first call to
 +	log_online_bitmap_iterator_next() read a page from the file */
 +	i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN;
 +	i->start_lsn = i->end_lsn = 0;
 +	i->space_id = 0;
 +	i->first_page_id = 0;
 +	i->last_page_in_run = TRUE;
 +	i->changed = FALSE;
 +	i->failed = FALSE;
 +
 +	return TRUE;
 +}
 +
 +/*********************************************************************//**
 +Releases log bitmap iterator: closes the current input file if one is open,
 +frees the file list and the page buffer, and marks the iterator as failed.
 +Freed pointers are reset to NULL, so releasing the same iterator again does
 +not free anything twice. */
 +UNIV_INTERN
 +void
 +log_online_bitmap_iterator_release(
 +/*===============================*/
 +	log_bitmap_iterator_t *i) /*!<in/out:  iterator */
 +{
 +	ut_a(i);
 +
 +	if (!os_file_is_invalid(i->in.file)) {
 +
 +		os_file_close(i->in.file);
 +		os_file_mark_invalid(&i->in.file);
 +	}
 +	if (i->in_files.files) {
 +
 +		ut_free(i->in_files.files);
 +		i->in_files.files = NULL;
 +	}
 +	if (i->page) {
 +
 +		ut_free(i->page);
 +		i->page = NULL;
 +	}
 +	i->failed = TRUE;
 +}
 +
 +/*********************************************************************//**
 +Iterates through bits of saved bitmap blocks.
 +Sequentially reads blocks from bitmap file(s) and iterates through
 +their bits. Ignores blocks with wrong checksum.
 +On an I/O or consistency error FALSE is returned with i->failed set to TRUE.
 +@return TRUE if iteration is successful, FALSE if all bits are iterated. */
 +UNIV_INTERN
 +ibool
 +log_online_bitmap_iterator_next(
 +/*============================*/
 +	log_bitmap_iterator_t *i) /*!<in/out: iterator */
 +{
 +	ibool	checksum_ok = FALSE;
 +	ibool	success;
 +
 +	ut_a(i);
 +
 +	/* An iterator over an empty file range has nothing to return */
 +	if (UNIV_UNLIKELY(i->in_files.count == 0)) {
 +
 +		return FALSE;
 +	}
 +
 +	/* Fast path: step to the next bit of the page already in memory */
 +	if (UNIV_LIKELY(i->bit_offset < MODIFIED_PAGE_BLOCK_BITMAP_LEN))
 +	{
 +		++i->bit_offset;
 +		i->changed =
 +			IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP,
 +				   i->bit_offset);
 +		return TRUE;
 +	}
 +
 +	/* The current page is exhausted.  Stop if it reached the requested
 +	end LSN and was the last page of its run. */
 +	if (i->end_lsn >= i->max_lsn && i->last_page_in_run)
 +		return FALSE;
 +
 +	/* Read pages until one passes its checksum */
 +	while (!checksum_ok)
 +	{
 +		/* While the current file has no full page left at the read
 +		offset, move on to the next file in the range */
 +		while (i->in.size < MODIFIED_PAGE_BLOCK_SIZE
 +		       || (i->in.offset
 +			   > i->in.size - MODIFIED_PAGE_BLOCK_SIZE)) {
 +
 +			/* Advance file */
 +			i->in_i++;
 +			success = os_file_close_no_error_handling(
 +				i->in.file);
 +			os_file_mark_invalid(&i->in.file);
 +			if (UNIV_UNLIKELY(!success)) {
 +
 +				os_file_get_last_error(TRUE);
 +				i->failed = TRUE;
 +				return FALSE;
 +			}
 +
 +			/* Check how the file that was just closed ended; this
 +			uses the size and offset still stored in i->in */
 +			success = log_online_diagnose_bitmap_eof(
 +					&i->in, i->last_page_in_run);
 +			if (UNIV_UNLIKELY(!success)) {
 +
 +				i->failed = TRUE;
 +				return FALSE;
 +
 +			}
 +
 +			/* No more files: normal end of iteration */
 +			if (i->in_i == i->in_files.count) {
 +
 +				return FALSE;
 +			}
 +
 +			/* A zero sequence number marks an array slot for
 +			which no file was found */
 +			if (UNIV_UNLIKELY(i->in_files.files[i->in_i].seq_num
 +					  == 0)) {
 +
 +				i->failed = TRUE;
 +				return FALSE;
 +			}
 +
 +			success = log_online_open_bitmap_file_read_only(
 +					i->in_files.files[i->in_i].name,
 +					&i->in);
 +			if (UNIV_UNLIKELY(!success)) {
 +
 +				i->failed = TRUE;
 +				return FALSE;
 +			}
 +		}
 +
 +		success = log_online_read_bitmap_page(&i->in, i->page,
 +						      &checksum_ok);
 +		if (UNIV_UNLIKELY(!success)) {
 +
 +			os_file_get_last_error(TRUE);
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"failed reading changed page bitmap file "
 +				"\'%s\'", i->in_files.files[i->in_i].name);
 +			i->failed = TRUE;
 +			return FALSE;
 +		}
 +	}
 +
 +	/* Decode the header fields of the newly read page and position the
 +	iterator on its first bit */
 +	i->start_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_START_LSN);
 +	i->end_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_END_LSN);
 +	i->space_id = mach_read_from_4(i->page + MODIFIED_PAGE_SPACE_ID);
 +	i->first_page_id = mach_read_from_4(i->page
 +					    + MODIFIED_PAGE_1ST_PAGE_ID);
 +	i->last_page_in_run = mach_read_from_4(i->page
 +					       + MODIFIED_PAGE_IS_LAST_BLOCK);
 +	i->bit_offset = 0;
 +	i->changed = IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP,
 +				i->bit_offset);
 +
 +	return TRUE;
 +}
 +
 +/************************************************************//**
 +Delete all the bitmap files for data less than the specified LSN.
 +If called with lsn == 0 (i.e. set by RESET request) or LSN_MAX,
 +restart the bitmap file sequence, otherwise continue it.
 +
 +The tracking state in log_bmp_sys is only touched when log_bmp_sys_mutex has
 +been acquired by this function, i.e. when the redo log tracking thread was
 +seen as started both before and after taking the mutex.
 +
 +@return FALSE to indicate success, TRUE for failure. */
 +UNIV_INTERN
 +ibool
 +log_online_purge_changed_page_bitmaps(
 +/*==================================*/
 +	lsn_t	lsn)	/*!< in: LSN to purge files up to */
 +{
 +	log_online_bitmap_file_range_t	bitmap_files;
 +	size_t				i;
 +	ibool				result = FALSE;
 +
 +	if (lsn == 0) {
 +		lsn = LSN_MAX;
 +	}
 +
 +	bool log_bmp_sys_inited = false;
 +	if (srv_redo_log_thread_started) {
 +		/* User requests might happen with both enabled and disabled
 +		tracking */
 +		log_bmp_sys_inited = true;
 +		mutex_enter(&log_bmp_sys_mutex);
 +		if (!srv_redo_log_thread_started) {
 +			log_bmp_sys_inited = false;
 +			mutex_exit(&log_bmp_sys_mutex);
 +		}
 +	}
 +
 +	if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) {
 +		if (log_bmp_sys_inited) {
 +			mutex_exit(&log_bmp_sys_mutex);
 +		}
 +		return TRUE;
 +	}
 +
 +	/* Decide on log_bmp_sys_inited, not on a fresh unlocked read of
 +	srv_redo_log_thread_started: the output file may only be closed while
 +	the mutex is held, and only if it is going to be re-created by the
 +	rotation below, which is guarded by the same flag. */
 +	if (log_bmp_sys_inited && lsn > log_bmp_sys->end_lsn) {
 +		/* If we have to delete the current output file, close it
 +		first. */
 +		os_file_close(log_bmp_sys->out.file);
 +		os_file_mark_invalid(&log_bmp_sys->out.file);
 +	}
 +
 +	for (i = 0; i < bitmap_files.count; i++) {
 +
 +		/* We consider the end LSN of the current bitmap, derived from
 +		the start LSN of the subsequent bitmap file, to determine
 +		whether to remove the current bitmap.  Note that bitmap_files
 +		does not contain an entry for the bitmap past the given LSN so
 +		we must check the boundary conditions as well.  For example,
 +		consider 1_0.xdb and 2_10.xdb and querying LSN 5.  bitmap_files
 +		will only contain 1_0.xdb and we must not delete it since it
 +		represents LSNs 0-9. */
 +		if ((i + 1 == bitmap_files.count
 +		     || bitmap_files.files[i + 1].seq_num == 0
 +		     || bitmap_files.files[i + 1].start_lsn > lsn)
 +		    && (lsn != LSN_MAX)) {
 +
 +			break;
 +		}
 +		if (!os_file_delete_if_exists(innodb_file_bmp_key,
 +					      bitmap_files.files[i].name)) {
 +
 +			os_file_get_last_error(TRUE);
 +			result = TRUE;
 +			break;
 +		}
 +	}
 +
 +	if (log_bmp_sys_inited) {
 +		if (lsn > log_bmp_sys->end_lsn) {
 +			lsn_t	new_file_lsn;
 +			if (lsn == LSN_MAX) {
 +				/* RESET restarts the sequence */
 +				log_bmp_sys->out_seq_num = 0;
 +				new_file_lsn = 0;
 +			} else {
 +				new_file_lsn = log_bmp_sys->end_lsn;
 +			}
 +			if (!log_online_rotate_bitmap_file(new_file_lsn)) {
 +				/* If file create failed, stop log tracking */
 +				srv_track_changed_pages = FALSE;
 +			}
 +		}
 +
 +		mutex_exit(&log_bmp_sys_mutex);
 +	}
 +
 +	/* The file array comes from ut_malloc(), so it is released with
 +	ut_free(); it is NULL when no bitmap files were found */
 +	if (bitmap_files.files) {
 +
 +		ut_free(bitmap_files.files);
 +	}
 +	return result;
 +}
diff --cc storage/xtradb/os/os0file.cc
index 89013d9068f,00000000000..b4fafb127ec
mode 100644,000000..100644
--- a/storage/xtradb/os/os0file.cc
+++ b/storage/xtradb/os/os0file.cc
@@@ -1,6177 -1,0 +1,6177 @@@
 +/***********************************************************************
 +
 +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
 +Copyright (c) 2009, Percona Inc.
- Copyright (c) 2013, 2017, MariaDB Corporation.
++Copyright (c) 2013, 2019, MariaDB Corporation.
 +
 +Portions of this file contain modifications contributed and copyrighted
 +by Percona Inc.. Those modifications are
 +gratefully acknowledged and are described briefly in the InnoDB
 +documentation. The contributions by Percona Inc. are incorporated with
 +their permission, and subject to the conditions contained in the file
 +COPYING.Percona.
 +
 +This program is free software; you can redistribute it and/or modify it
 +under the terms of the GNU General Public License as published by the
 +Free Software Foundation; version 2 of the License.
 +
 +This program is distributed in the hope that it will be useful, but
 +WITHOUT ANY WARRANTY; without even the implied warranty of
 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
 +Public License for more details.
 +
 +You should have received a copy of the GNU General Public License along with
 +this program; if not, write to the Free Software Foundation, Inc.,
 +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
 +
 +***********************************************************************/
 +
 +/**************************************************//**
 +@file os/os0file.cc
 +The interface to the operating system file i/o primitives
 +
 +Created 10/21/1995 Heikki Tuuri
 +*******************************************************/
 +
 +#include "os0file.h"
 +
 +#ifdef UNIV_NONINL
 +#include "os0file.ic"
 +#endif
 +#include "ha_prototypes.h"
 +#include "ut0mem.h"
 +#include "srv0srv.h"
 +#include "srv0start.h"
 +#include "fil0fil.h"
 +#include "buf0buf.h"
 +#include "btr0types.h"
 +#include "trx0trx.h"
 +#include "srv0mon.h"
 +#ifndef UNIV_HOTBACKUP
 +# include "os0sync.h"
 +# include "os0thread.h"
 +#else /* !UNIV_HOTBACKUP */
 +# ifdef __WIN__
 +/* Add includes for the _stat() call to compile on Windows */
 +#  include <sys/types.h>
 +#  include <sys/stat.h>
 +#  include <errno.h>
 +# endif /* __WIN__ */
 +#endif /* !UNIV_HOTBACKUP */
 +
 +#if defined(LINUX_NATIVE_AIO)
 +#include <libaio.h>
 +#endif
 +
 +#ifdef _WIN32
 +#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1
 +#endif
 +
 +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H)
 +# include <sys/ioctl.h>
 +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET
 +#  define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint)
 +# endif
 +#endif
 +
 +/** Insert buffer segment id */
 +static const ulint IO_IBUF_SEGMENT = 0;
 +
 +/** Log segment id */
 +static const ulint IO_LOG_SEGMENT = 1;
 +
 +/* This specifies the file permissions InnoDB uses when it creates files in
 +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
 +my_umask */
 +
 +#ifndef __WIN__
 +/** Umask for creating files */
 +UNIV_INTERN ulint	os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 +# define os_file_invalid	(-1)
 +#else
 +/** Umask for creating files */
 +UNIV_INTERN ulint	os_innodb_umask	= 0;
 +# define os_file_invalid	INVALID_HANDLE_VALUE
 +#endif /* __WIN__ */
 +
 +#ifndef UNIV_HOTBACKUP
 +/* We use these mutexes to protect lseek + file i/o operation, if the
 +OS does not provide an atomic pread or pwrite, or similar */
 +#define OS_FILE_N_SEEK_MUTEXES	16
 +UNIV_INTERN os_ib_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
 +
 +/* In simulated aio, merge at most this many consecutive i/os */
 +#define OS_AIO_MERGE_N_CONSECUTIVE	64
 +
 +#ifdef WITH_INNODB_DISALLOW_WRITES
 +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event)
 +#else
 +#define WAIT_ALLOW_WRITES() do { } while (0)
 +#endif /* WITH_INNODB_DISALLOW_WRITES */
 +
 +/**********************************************************************
 +
 +InnoDB AIO Implementation:
 +=========================
 +
 +We support native AIO for windows and linux. For rest of the platforms
 +we simulate AIO by special io-threads servicing the IO-requests.
 +
 +Simulated AIO:
 +==============
 +
 +In platforms where we 'simulate' AIO following is a rough explanation
 +of the high level design.
 +There are four io-threads (for ibuf, log, read, write).
 +All synchronous IO requests are serviced by the calling thread using
 +os_file_write/os_file_read. The Asynchronous requests are queued up
 +in an array (there are four such arrays) by the calling thread.
 +Later these requests are picked up by the io-thread and are serviced
 +synchronously.
 +
 +Windows native AIO:
 +==================
 +
 +If srv_use_native_aio is not set then windows follow the same
 +code as simulated AIO. If the flag is set then native AIO interface
 +is used. On windows, one of the limitation is that if a file is opened
 +for AIO no synchronous IO can be done on it. Therefore we have an
 +extra fifth array to queue up synchronous IO requests.
 +There are innodb_file_io_threads helper threads. These threads work
 +on the four arrays mentioned above in Simulated AIO. No thread is
 +required for the sync array.
 +If a synchronous IO request is made, it is first queued in the sync
 +array. Then the calling thread itself waits on the request, thus
 +making the call synchronous.
 +If an AIO request is made the calling thread not only queues it in the
 +array but also submits the requests. The helper thread then collects
 +the completed IO request and calls completion routine on it.
 +
 +Linux native AIO:
 +=================
 +
 +If we have libaio installed on the system and innodb_use_native_aio
 +is set to TRUE we follow the code path of native AIO, otherwise we
 +do simulated AIO.
 +There are innodb_file_io_threads helper threads. These threads work
 +on the four arrays mentioned above in Simulated AIO.
 +If a synchronous IO request is made, it is handled by calling
 +os_file_write/os_file_read.
 +If an AIO request is made the calling thread not only queues it in the
 +array but also submits the requests. The helper thread then collects
 +the completed IO request and calls completion routine on it.
 +
 +**********************************************************************/
 +
 +/** Flag: enable debug printout for asynchronous i/o */
 +UNIV_INTERN ibool	os_aio_print_debug	= FALSE;
 +
 +#ifdef UNIV_PFS_IO
 +/* Keys to register InnoDB I/O with performance schema */
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_data_key;
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_log_key;
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_temp_key;
 +UNIV_INTERN mysql_pfs_key_t  innodb_file_bmp_key;
 +#endif /* UNIV_PFS_IO */
 +
 +/** The asynchronous i/o array slot structure: describes one i/o request
 +placed in an os_aio_array_t */
 +struct os_aio_slot_t{
 +#ifdef WIN_ASYNC_IO
 +	OVERLAPPED	control;	/*!< Windows control block for the
 +					aio request, MUST be first element in the structure*/
 +	void *arr;				/*!< Array this slot belongs to*/
 +#endif
 +
 +	ibool		is_read;	/*!< TRUE if a read operation */
 +	ulint		pos;		/*!< index of the slot in the aio
 +					array */
 +	ibool		reserved;	/*!< TRUE if this slot is reserved */
 +	time_t		reservation_time;/*!< time when reserved */
 +	ulint		len;		/*!< length of the block to read or
 +					write */
 +	byte*		buf;		/*!< buffer used in i/o */
 +	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
 +	os_offset_t	offset;		/*!< file offset in bytes */
 +	pfs_os_file_t	file;		/*!< file where to read or write */
 +	const char*	name;		/*!< file name or path */
 +	ibool		io_already_done;/*!< used only in simulated aio:
 +					TRUE if the physical i/o already
 +					made and only the slot message
 +					needs to be passed to the caller
 +					of os_aio_simulated_handle */
 +	ulint		space_id;	/*!< NOTE(review): presumably the id of
 +					the tablespace the request belongs
 +					to; confirm against the code that
 +					reserves slots */
 +	fil_node_t*	message1;	/*!< first message given by the
 +					requester of an aio operation; see
 +					message2 */
 +	void*		message2;	/*!< second message given by the
 +					requester of an aio operation;
 +					the messages can be used to identify
 +					which pending aio operation was
 +					completed */
 +#ifdef LINUX_NATIVE_AIO
 +	struct iocb	control;	/* Linux control block for aio */
 +	int		n_bytes;	/* bytes written/read. */
 +	int		ret;		/* AIO return code */
 +#endif /* LINUX_NATIVE_AIO */
 +};
 +
 +/** The asynchronous i/o array structure: a mutex-protected array of
 +os_aio_slot_t slots that is divided into segments */
 +struct os_aio_array_t{
 +	os_ib_mutex_t	mutex;	/*!< the mutex protecting the aio array */
 +	os_event_t	not_full;
 +				/*!< The event which is set to the
 +				signaled state when there is space in
 +				the aio outside the ibuf segment;
 +				os_event_set() and os_event_reset()
 +				are protected by os_aio_array_t::mutex */
 +	os_event_t	is_empty;
 +				/*!< The event which is set to the
 +				signaled state when there are no
 +				pending i/os in this array;
 +				os_event_set() and os_event_reset()
 +				are protected by os_aio_array_t::mutex */
 +	ulint		n_slots;/*!< Total number of slots in the aio
 +				array.  This must be divisible by
 +				n_threads. */
 +	ulint		n_segments;
 +				/*!< Number of segments in the aio
 +				array of pending aio requests. A
 +				thread can wait separately for any one
 +				of the segments. */
 +	ulint		cur_seg;/*!< We reserve IO requests in round
 +				robin fashion to different segments.
 +				This points to the segment that is to
 +				be used to service next IO request. */
 +	ulint		n_reserved;
 +				/*!< Number of reserved slots in the
 +				aio array outside the ibuf segment */
 +	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	io_context_t*		aio_ctx;
 +				/* completion queue for IO. There is
 +				one such queue per segment. Each thread
 +				will work on one ctx exclusively. */
 +	struct io_event*	aio_events;
 +				/* The array to collect completed IOs.
 +				There is one such event for each
 +				possible pending IO. The size of the
 +				array is equal to n_slots. */
 +	struct iocb**		pending;
 +				/* Array to buffer the not-submitted aio
 +				requests. The array length is n_slots.
 +				It is divided into n_segments segments.
 +				pending requests on each segment are buffered
 +				separately.*/
 +	ulint*			count;
 +				/* Array of length n_segments. Each element
 +				counts the number of not-submitted aio
 +				request on that segment.*/
 +#endif /* LINUX_NATIVE_AIO */
 +};
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/** timeout for each io_getevents() call = 500ms. */
 +#define OS_AIO_REAP_TIMEOUT	(500000000UL)
 +
 +/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
 +#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)
 +
 +/** number of attempts before giving up on io_setup(). */
 +#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
 +#endif
 +
 +/** Array of events used in simulated aio. */
 +static os_event_t*	os_aio_segment_wait_events;
 +
 +/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
 +are NULL when the module has not yet been initialized. @{ */
 +static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
 +static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
 +static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
 +static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
 +static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
 +/* @} */
 +
 +/** Number of asynchronous I/O segments.  Set by os_aio_init(). */
 +static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
 +
 +/** If the following is TRUE, read i/o handler threads try to
 +wait until a batch of new read requests have been posted */
 +static ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +UNIV_INTERN ulint	os_n_file_reads		= 0;
 +UNIV_INTERN ulint	os_bytes_read_since_printout = 0;
 +UNIV_INTERN ulint	os_n_file_writes	= 0;
 +UNIV_INTERN ulint	os_n_fsyncs		= 0;
 +UNIV_INTERN ulint	os_n_file_reads_old	= 0;
 +UNIV_INTERN ulint	os_n_file_writes_old	= 0;
 +UNIV_INTERN ulint	os_n_fsyncs_old		= 0;
 +UNIV_INTERN time_t	os_last_printout;
 +
 +UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;
 +
 +#ifdef UNIV_DEBUG
 +# ifndef UNIV_HOTBACKUP
 +/**********************************************************************//**
 +Rate-limited wrapper around os_aio_validate(): the real consistency check is
 +run only on every OS_AIO_VALIDATE_SKIP-th call, all other calls return
 +immediately.
 +@return	TRUE if ok or the check was skipped */
 +UNIV_INTERN
 +ibool
 +os_aio_validate_skip(void)
 +/*======================*/
 +{
 +/** Try os_aio_validate() every this many times */
 +# define OS_AIO_VALIDATE_SKIP	13
 +
 +	/** Calls remaining until the next os_aio_validate().
 +	Signed on purpose, because of the race condition below. */
 +	static int calls_until_check = OS_AIO_VALIDATE_SKIP;
 +
 +	/* The counter is updated without any synchronization.  The race is
 +	harmless: the counter only serves to reduce how often the costly
 +	os_aio_validate() runs in debug builds. */
 +	const int	remaining = --calls_until_check;
 +
 +	if (remaining <= 0) {
 +		calls_until_check = OS_AIO_VALIDATE_SKIP;
 +		return(os_aio_validate());
 +	}
 +
 +	return(TRUE);
 +}
 +# endif /* !UNIV_HOTBACKUP */
 +#endif /* UNIV_DEBUG */
 +
 +#ifdef _WIN32
 +/** IO completion port used by background io threads */
 +static HANDLE completion_port;
 +/** IO completion port used by background io READ threads */
 +static HANDLE read_completion_port;
 +/** Thread local storage index for the per-thread event used for synchronous IO */
 +static DWORD tls_sync_io = TLS_OUT_OF_INDEXES;
 +#endif
 +
 +#ifdef __WIN__
 +/***********************************************************************//**
 +Gets the operating system version. Currently works only on Windows.
 +The platform id reported by GetVersionEx() is examined first; for the NT
 +platform the major and minor version numbers select the result.
 +@return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
 +OS_WIN7. */
 +UNIV_INTERN
 +ulint
 +os_get_os_version(void)
 +/*===================*/
 +{
 +	OSVERSIONINFO	version_info;
 +
 +	version_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
 +
 +	ut_a(GetVersionEx(&version_info));
 +
 +	switch (version_info.dwPlatformId) {
 +	case VER_PLATFORM_WIN32s:
 +		return(OS_WIN31);
 +	case VER_PLATFORM_WIN32_WINDOWS:
 +		return(OS_WIN95);
 +	case VER_PLATFORM_WIN32_NT:
 +		break;
 +	default:
 +		/* Unknown platform id */
 +		ut_error;
 +		return(0);
 +	}
 +
 +	/* NT family: tell the releases apart by their version numbers */
 +	const bool	minor_is_zero = (version_info.dwMinorVersion == 0);
 +
 +	if (version_info.dwMajorVersion == 3
 +	    || version_info.dwMajorVersion == 4) {
 +
 +		return(OS_WINNT);
 +	}
 +
 +	if (version_info.dwMajorVersion == 5) {
 +
 +		return(minor_is_zero ? OS_WIN2000 : OS_WINXP);
 +	}
 +
 +	if (version_info.dwMajorVersion == 6) {
 +
 +		return(minor_is_zero ? OS_WINVISTA : OS_WIN7);
 +	}
 +
 +	/* Any other major version is reported as OS_WIN7 */
 +	return(OS_WIN7);
 +}
 +#endif /* __WIN__ */
 +
 +
 +#ifdef _WIN32
 +/*
 +Windows : Handling synchronous IO on files opened asynchronously.
 +
 +If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to 
 +a completion port, then every IO on this file would normally be enqueued to the
 +completion port. Sometimes however we would like to do a synchronous IO. This is
 +possible if we initialize overlapped.hEvent with a valid event and set its
 +lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info)
 +
 +We'll create this special event once for each thread and store in thread local 
 +storage.
 +*/
 +
 +
 +/***********************************************************************//**
 +Initialize the TLS index for the event handle used for synchronous IO on files that
 +might be opened with FILE_FLAG_OVERLAPPED.
 +*/
 +static void win_init_syncio_event()
 +{
 +	tls_sync_io = TlsAlloc();
 +	ut_a(tls_sync_io != TLS_OUT_OF_INDEXES);
 +}
 +
 +/***********************************************************************//**
 +Retrieve per-thread event for doing synchronous IO on asynchronously opened files
 +*/
 +static HANDLE win_get_syncio_event()
 +{
 +	HANDLE h;
 +	if(tls_sync_io == TLS_OUT_OF_INDEXES){
 +		win_init_syncio_event();
 +	}
 +
 +	h = (HANDLE)TlsGetValue(tls_sync_io);
 +	if (h)
 +		return h;
 +	h = CreateEventA(NULL, FALSE, FALSE, NULL);
 +	ut_a(h);
 +	h = (HANDLE)((uintptr_t)h | 1);
 +	TlsSetValue(tls_sync_io, h);
 +	return h;
 +}
 +
 +/*
 +  TLS destructor, inspired by Chromium code
 +  http://src.chromium.org/svn/trunk/src/base/threading/thread_local_storage_win.cc
 +*/
 +
 +static void win_free_syncio_event()
 +{
 +	HANDLE h = win_get_syncio_event();
 +	if (h) {
 +		CloseHandle(h);
 +	}
 +}
 +
 +static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved) {
 +	if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason)
 +		win_free_syncio_event();
 +}
 +
 +extern "C" {
 +#ifdef _WIN64
 +#pragma comment(linker, "/INCLUDE:_tls_used")
 +#pragma comment(linker, "/INCLUDE:p_thread_callback_base")
 +#pragma const_seg(".CRT$XLB")
 +extern const PIMAGE_TLS_CALLBACK p_thread_callback_base;
 +const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
 +#pragma data_seg()
 +#else
 +#pragma comment(linker, "/INCLUDE:__tls_used")
 +#pragma comment(linker, "/INCLUDE:_p_thread_callback_base")
 +#pragma data_seg(".CRT$XLB")
 +PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
 +#pragma data_seg()
 +#endif 
 +}
 +#endif /*_WIN32 */
 +
 +/***********************************************************************//**
 +For an EINVAL I/O error, prints a diagnostic message if innodb_flush_method
 +== ALL_O_DIRECT.
 +@return true if the diagnostic message was printed
 +@return false if the diagnostic message does not apply */
 +static
 +bool
 +os_diagnose_all_o_direct_einval(
 +/*============================*/
 +	ulint err)	/*!< in: C error code */
 +{
 +	if ((err == EINVAL)
 +	    && (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)) {
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"The error might be caused by redo log I/O not "
 +			"satisfying innodb_flush_method=ALL_O_DIRECT "
 +			"requirements by the underlying file system.");
 +		if (srv_log_block_size != 512)
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"This might be caused by an incompatible "
 +				"non-default innodb_log_block_size value %lu.",
 +				srv_log_block_size);
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"Please file a bug at https://bugs.percona.com and "
 +			"include this error message, my.cnf settings, and "
 +			"information about the file system where the redo log "
 +			"resides.");
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"A possible workaround is to change "
 +			"innodb_flush_method value to something else "
 +			"than ALL_O_DIRECT.");
 +		return(true);
 +	}
 +	return(false);
 +}
 +
 +/***********************************************************************//**
 +Retrieves the last error number if an error occurs in a file io function.
 +The number should be retrieved before any other OS calls (because they may
 +overwrite the error number). If the number is not known to this program,
 +the OS error number + 100 is returned.
 +@return	error number, or OS error number + 100 */
 +static
 +ulint
 +os_file_get_last_error_low(
 +/*=======================*/
 +	bool	report_all_errors,	/*!< in: TRUE if we want an error
 +					message printed of all errors */
 +	bool	on_error_silent)	/*!< in: TRUE then don't print any
 +					diagnostic to the log */
 +{
 +#ifdef __WIN__
 +
 +	ulint	err = (ulint) GetLastError();
 +	if (err == ERROR_SUCCESS) {
 +		return(0);
 +	}
 +
 +	if (report_all_errors
 +	    || (!on_error_silent
 +		&& err != ERROR_DISK_FULL
 +		&& err != ERROR_FILE_EXISTS)) {
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Operating system error number %lu"
 +			" in a file operation.\n", (ulong) err);
 +
 +		if (err == ERROR_PATH_NOT_FOUND) {
 +			fprintf(stderr,
 +				"InnoDB: The error means the system"
 +				" cannot find the path specified.\n");
 +
 +			if (srv_is_being_started) {
 +				fprintf(stderr,
 +					"InnoDB: If you are installing InnoDB,"
 +					" remember that you must create\n"
 +					"InnoDB: directories yourself, InnoDB"
 +					" does not create them.\n");
 +			}
 +		} else if (err == ERROR_ACCESS_DENIED) {
 +			fprintf(stderr,
 +				"InnoDB: The error means mysqld does not have"
 +				" the access rights to\n"
 +				"InnoDB: the directory. It may also be"
 +				" you have created a subdirectory\n"
 +				"InnoDB: of the same name as a data file.\n");
 +		} else if (err == ERROR_SHARING_VIOLATION
 +			   || err == ERROR_LOCK_VIOLATION) {
 +			fprintf(stderr,
 +				"InnoDB: The error means that another program"
 +				" is using InnoDB's files.\n"
 +				"InnoDB: This might be a backup or antivirus"
 +				" software or another instance\n"
 +				"InnoDB: of MySQL."
 +				" Please close it to get rid of this error.\n");
 +		} else if (err == ERROR_WORKING_SET_QUOTA
 +			   || err == ERROR_NO_SYSTEM_RESOURCES) {
 +			fprintf(stderr,
 +				"InnoDB: The error means that there are no"
 +				" sufficient system resources or quota to"
 +				" complete the operation.\n");
 +		} else if (err == ERROR_OPERATION_ABORTED) {
 +			fprintf(stderr,
 +				"InnoDB: The error means that the I/O"
 +				" operation has been aborted\n"
 +				"InnoDB: because of either a thread exit"
 +				" or an application request.\n"
 +				"InnoDB: Retry attempt is made.\n");
 +		} else {
 +			fprintf(stderr,
 +				"InnoDB: Some operating system error numbers"
 +				" are described at\n"
 +				"InnoDB: "
 +				REFMAN
 +				"operating-system-error-codes.html\n");
 +		}
 +	}
 +
 +	fflush(stderr);
 +
 +	if (err == ERROR_FILE_NOT_FOUND) {
 +		return(OS_FILE_NOT_FOUND);
 +	} else if (err == ERROR_DISK_FULL) {
 +		return(OS_FILE_DISK_FULL);
 +	} else if (err == ERROR_FILE_EXISTS) {
 +		return(OS_FILE_ALREADY_EXISTS);
 +	} else if (err == ERROR_SHARING_VIOLATION
 +		   || err == ERROR_LOCK_VIOLATION) {
 +		return(OS_FILE_SHARING_VIOLATION);
 +	} else if (err == ERROR_WORKING_SET_QUOTA
 +		   || err == ERROR_NO_SYSTEM_RESOURCES) {
 +		return(OS_FILE_INSUFFICIENT_RESOURCE);
 +	} else if (err == ERROR_OPERATION_ABORTED) {
 +		return(OS_FILE_OPERATION_ABORTED);
 +	} else if (err == ERROR_ACCESS_DENIED) {
 +		return(OS_FILE_ACCESS_VIOLATION);
 +	} else if (err == ERROR_BUFFER_OVERFLOW) {
 +		return(OS_FILE_NAME_TOO_LONG);
 +	} else {
 +		return(OS_FILE_ERROR_MAX + err);
 +	}
 +#else
 +	int err = errno;
 +	if (err == 0) {
 +		return(0);
 +	}
 +
 +	if (report_all_errors
 +	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Operating system error number %d"
 +			" in a file operation.\n", err);
 +
 +		if (err == ENOENT) {
 +			fprintf(stderr,
 +				"InnoDB: The error means the system"
 +				" cannot find the path specified.\n");
 +
 +			if (srv_is_being_started) {
 +				fprintf(stderr,
 +					"InnoDB: If you are installing InnoDB,"
 +					" remember that you must create\n"
 +					"InnoDB: directories yourself, InnoDB"
 +					" does not create them.\n");
 +			}
 +		} else if (err == EACCES) {
 +			fprintf(stderr,
 +				"InnoDB: The error means mysqld does not have"
 +				" the access rights to\n"
 +				"InnoDB: the directory.\n");
 +		} else if (!os_diagnose_all_o_direct_einval(err)) {
 +			if (strerror(err) != NULL) {
 +				fprintf(stderr,
 +					"InnoDB: Error number %d"
 +					" means '%s'.\n",
 +					err, strerror(err));
 +			}
 +
 +
 +			fprintf(stderr,
 +				"InnoDB: Some operating system"
 +				" error numbers are described at\n"
 +				"InnoDB: "
 +				REFMAN
 +				"operating-system-error-codes.html\n");
 +		}
 +	}
 +
 +	fflush(stderr);
 +
 +	switch (err) {
 +	case ENOSPC:
 +		return(OS_FILE_DISK_FULL);
 +	case ENOENT:
 +		return(OS_FILE_NOT_FOUND);
 +	case EEXIST:
 +		return(OS_FILE_ALREADY_EXISTS);
 +	case ENAMETOOLONG:
 +		return(OS_FILE_NAME_TOO_LONG);
 +	case EXDEV:
 +	case ENOTDIR:
 +	case EISDIR:
 +		return(OS_FILE_PATH_ERROR);
 +	case EAGAIN:
 +		if (srv_use_native_aio) {
 +			return(OS_FILE_AIO_RESOURCES_RESERVED);
 +		}
 +		break;
 +	case EINTR:
 +		if (srv_use_native_aio) {
 +			return(OS_FILE_AIO_INTERRUPTED);
 +		}
 +		break;
 +	case EACCES:
 +		return(OS_FILE_ACCESS_VIOLATION);
 +	}
 +	return(OS_FILE_ERROR_MAX + err);
 +#endif
 +}
 +
 +/***********************************************************************//**
 +Retrieves the last error number if an error occurs in a file io function.
 +The number should be retrieved before any other OS calls (because they may
 +overwrite the error number). If the number is not known to this program,
 +the OS error number + 100 is returned.
 +@return	error number, or OS error number + 100 */
 +UNIV_INTERN
 +ulint
 +os_file_get_last_error(
 +/*===================*/
 +	bool	report_all_errors)	/*!< in: TRUE if we want an error
 +					message printed of all errors */
 +{
 +	return(os_file_get_last_error_low(report_all_errors, false));
 +}
 +
 +/****************************************************************//**
 +Does error handling when a file operation fails.
 +Conditionally exits (calling exit(3)) based on should_exit value and the
 +error type, if should_exit is TRUE then on_error_silent is ignored.
 +@return	TRUE if we should retry the operation */
 +static
 +ibool
 +os_file_handle_error_cond_exit(
 +/*===========================*/
 +	const char*	name,		/*!< in: name of a file or NULL */
 +	const char*	operation,	/*!< in: operation */
 +	ibool		should_exit,	/*!< in: call exit(3) if unknown error
 +					and this parameter is TRUE */
 +	ibool		on_error_silent)/*!< in: if TRUE then don't print
 +					any message to the log iff it is
 +					an unknown non-fatal error */
 +{
 +	ulint	err;
 +
 +	err = os_file_get_last_error_low(false, on_error_silent);
 +
 +	switch (err) {
 +	case OS_FILE_DISK_FULL:
 +		/* We only print a warning about disk full once */
 +
 +		if (os_has_said_disk_full) {
 +
 +			return(FALSE);
 +		}
 +
 +		/* Disk full error is reported irrespective of the
 +		on_error_silent setting. */
 +
 +		if (name) {
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				"  InnoDB: Encountered a problem with"
 +				" file %s\n", name);
 +		}
 +
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Disk is full. Try to clean the disk"
 +			" to free space.\n");
 +
 +		os_has_said_disk_full = TRUE;
 +
 +		fflush(stderr);
 +
 +		ut_error;
 +		return(FALSE);
 +
 +	case OS_FILE_AIO_RESOURCES_RESERVED:
 +	case OS_FILE_AIO_INTERRUPTED:
 +
 +		return(TRUE);
 +
 +	case OS_FILE_PATH_ERROR:
 +	case OS_FILE_ALREADY_EXISTS:
 +	case OS_FILE_ACCESS_VIOLATION:
 +
 +		return(FALSE);
 +
 +	case OS_FILE_SHARING_VIOLATION:
 +
 +		os_thread_sleep(10000000);  /* 10 sec */
 +		return(TRUE);
 +
 +	case OS_FILE_OPERATION_ABORTED:
 +	case OS_FILE_INSUFFICIENT_RESOURCE:
 +
 +		os_thread_sleep(100000);	/* 100 ms */
 +		return(TRUE);
 +
 +	default:
 +
 +		/* If it is an operation that can crash on error then it
 +		is better to ignore on_error_silent and print an error message
 +		to the log. */
 +
 +		if (should_exit || !on_error_silent) {
 +			ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
 +				"error " ULINTPF ".%s", name ? name : "(unknown)",
 +				operation, err, should_exit
 +				? " Cannot continue operation" : "");
 +		}
 +
 +		if (should_exit) {
 +			exit(1);
 +		}
 +	}
 +
 +	return(FALSE);
 +}
 +
 +/****************************************************************//**
 +Does error handling when a file operation fails.
 +@return	TRUE if we should retry the operation */
 +static
 +ibool
 +os_file_handle_error(
 +/*=================*/
 +	const char*	name,		/*!< in: name of a file or NULL */
 +	const char*	operation)	/*!< in: operation */
 +{
 +	/* exit in case of unknown error */
 +	return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
 +}
 +
 +/****************************************************************//**
 +Does error handling when a file operation fails.
 +@return	TRUE if we should retry the operation */
 +ibool
 +os_file_handle_error_no_exit(
 +/*=========================*/
 +	const char*	name,		/*!< in: name of a file or NULL */
 +	const char*	operation,	/*!< in: operation */
 +	ibool		on_error_silent)/*!< in: if TRUE then don't print
 +					any message to the log. */
 +{
 +	/* don't exit in case of unknown error */
 +	return(os_file_handle_error_cond_exit(
 +			name, operation, FALSE, on_error_silent));
 +}
 +
 +#undef USE_FILE_LOCK
 +#define USE_FILE_LOCK
 +#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
 +/* InnoDB Hot Backup does not lock the data files.
 + * On Windows, mandatory locking is used.
 + */
 +# undef USE_FILE_LOCK
 +#endif
 +#ifdef USE_FILE_LOCK
 +/****************************************************************//**
 +Obtain an exclusive lock on a file.
 +@return	0 on success */
 +static
 +int
 +os_file_lock(
 +/*=========*/
 +	int		fd,	/*!< in: file descriptor */
 +	const char*	name)	/*!< in: file name */
 +{
 +	struct flock lk;
 +
 +	ut_ad(!srv_read_only_mode);
 +
 +	lk.l_type = F_WRLCK;
 +	lk.l_whence = SEEK_SET;
 +	lk.l_start = lk.l_len = 0;
 +
 +	if (fcntl(fd, F_SETLK, &lk) == -1) {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unable to lock %s, error: %d", name, errno);
 +
 +		if (errno == EAGAIN || errno == EACCES) {
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"Check that you do not already have "
 +				"another mysqld process using the "
 +				"same InnoDB data or log files.");
 +		}
 +
 +		return(-1);
 +	}
 +
 +	return(0);
 +}
 +#endif /* USE_FILE_LOCK */
 +
 +#ifndef UNIV_HOTBACKUP
 +/****************************************************************//**
 +Creates the seek mutexes used in positioned reads and writes. */
 +static
 +void
 +os_io_init_simple(void)
 +/*===================*/
 +{
 +	for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
 +		os_file_seek_mutexes[i] = os_mutex_create();
 +	}
 +#ifdef _WIN32
 +	win_init_syncio_event();
 +#endif
 +}
 +
 +/** Create a temporary file. This function is like tmpfile(3), but
 +the temporary file is created in the given parameter path. If the path
 +is null then it will create the file in the mysql server configuration
 +parameter (--tmpdir).
 +@param[in]	path	location for creating temporary file
 +@return temporary file handle, or NULL on error */
 +UNIV_INTERN
 +FILE*
 +os_file_create_tmpfile(
 +	const char*	path)
 +{
 +	FILE*	file	= NULL;
 +	int	fd;
 +	WAIT_ALLOW_WRITES();
 +	fd	= innobase_mysql_tmpfile(path);
 +
 +	ut_ad(!srv_read_only_mode);
 +
 +	if (fd >= 0) {
 +		file = fdopen(fd, "w+b");
 +	}
 +
 +	if (!file) {
 +		ut_print_timestamp(stderr);
 +		fprintf(stderr,
 +			"  InnoDB: Error: unable to create temporary file;"
 +			" errno: %d\n", errno);
 +		if (fd >= 0) {
 +			close(fd);
 +		}
 +	}
 +
 +	return(file);
 +}
 +#endif /* !UNIV_HOTBACKUP */
 +
 +/***********************************************************************//**
 +The os_file_opendir() function opens a directory stream corresponding to the
 +directory named by the dirname argument. The directory stream is positioned
 +at the first entry. In both Unix and Windows we automatically skip the '.'
 +and '..' items at the start of the directory listing.
 +@return	directory stream, NULL if error */
 +UNIV_INTERN
 +os_file_dir_t
 +os_file_opendir(
 +/*============*/
 +	const char*	dirname,	/*!< in: directory name; it must not
 +					contain a trailing '\' or '/' */
 +	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
 +					error as a fatal error; if we try to
 +					open symlinks then we do not wish a
 +					fatal error if it happens not to be
 +					a directory */
 +{
 +	os_file_dir_t		dir;
 +#ifdef __WIN__
 +	LPWIN32_FIND_DATA	lpFindFileData;
 +	char			path[OS_FILE_MAX_PATH + 3];
 +
 +	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
 +
 +	strcpy(path, dirname);
 +	strcpy(path + strlen(path), "\\*");
 +
 +	/* Note that in Windows opening the 'directory stream' also retrieves
 +	the first entry in the directory. Since it is '.', that is no problem,
 +	as we will skip over the '.' and '..' entries anyway. */
 +
 +	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
 +		ut_malloc(sizeof(WIN32_FIND_DATA)));
 +
 +	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
 +
 +	ut_free(lpFindFileData);
 +
 +	if (dir == INVALID_HANDLE_VALUE) {
 +
 +		if (error_is_fatal) {
 +			os_file_handle_error(dirname, "opendir");
 +		}
 +
 +		return(NULL);
 +	}
 +
 +	return(dir);
 +#else
 +	dir = opendir(dirname);
 +
 +	if (dir == NULL && error_is_fatal) {
 +		os_file_handle_error(dirname, "opendir");
 +	}
 +
 +	return(dir);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Closes a directory stream.
 +@return	0 if success, -1 if failure */
 +UNIV_INTERN
 +int
 +os_file_closedir(
 +/*=============*/
 +	os_file_dir_t	dir)	/*!< in: directory stream */
 +{
 +#ifdef __WIN__
 +	BOOL		ret;
 +
 +	ret = FindClose(dir);
 +
 +	if (!ret) {
 +		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 +
 +		return(-1);
 +	}
 +
 +	return(0);
 +#else
 +	int	ret;
 +
 +	ret = closedir(dir);
 +
 +	if (ret) {
 +		os_file_handle_error_no_exit(NULL, "closedir", FALSE);
 +	}
 +
 +	return(ret);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +This function returns information of the next file in the directory. We jump
 +over the '.' and '..' entries in the directory.
 +@return	0 if ok, -1 if error, 1 if at the end of the directory */
 +UNIV_INTERN
 +int
 +os_file_readdir_next_file(
 +/*======================*/
 +	const char*	dirname,/*!< in: directory name or path */
 +	os_file_dir_t	dir,	/*!< in: directory stream */
 +	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
 +{
 +#ifdef __WIN__
 +	LPWIN32_FIND_DATA	lpFindFileData;
 +	BOOL			ret;
 +
 +	lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
 +		ut_malloc(sizeof(WIN32_FIND_DATA)));
 +next_file:
 +	ret = FindNextFile(dir, lpFindFileData);
 +
 +	if (ret) {
 +		ut_a(strlen((char*) lpFindFileData->cFileName)
 +		     < OS_FILE_MAX_PATH);
 +
 +		if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
 +		    || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
 +
 +			goto next_file;
 +		}
 +
 +		strcpy(info->name, (char*) lpFindFileData->cFileName);
 +
 +		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
 +			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
 +			   << 32);
 +
 +		if (lpFindFileData->dwFileAttributes
 +		    & FILE_ATTRIBUTE_REPARSE_POINT) {
 +			/* TODO: test Windows symlinks */
 +			/* TODO: MySQL has apparently its own symlink
 +			implementation in Windows, dbname.sym can
 +			redirect a database directory:
 +			REFMAN "windows-symbolic-links.html" */
 +			info->type = OS_FILE_TYPE_LINK;
 +		} else if (lpFindFileData->dwFileAttributes
 +			   & FILE_ATTRIBUTE_DIRECTORY) {
 +			info->type = OS_FILE_TYPE_DIR;
 +		} else {
 +			/* It is probably safest to assume that all other
 +			file types are normal. Better to check them rather
 +			than blindly skip them. */
 +
 +			info->type = OS_FILE_TYPE_FILE;
 +		}
 +	}
 +
 +	ut_free(lpFindFileData);
 +
 +	if (ret) {
 +		return(0);
 +	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
 +
 +		return(1);
 +	} else {
 +		os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
 +		return(-1);
 +	}
 +#else
 +	struct dirent*	ent;
 +	char*		full_path;
 +	int		ret;
 +	struct stat	statinfo;
 +
 +next_file:
 +
 +	ent = readdir(dir);
 +
 +	if (ent == NULL) {
 +
 +		return(1);
 +	}
 +	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
 +
 +	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
 +
 +		goto next_file;
 +	}
 +
 +	strcpy(info->name, ent->d_name);
 +
 +	full_path = static_cast<char*>(
 +		ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
 +
 +	sprintf(full_path, "%s/%s", dirname, ent->d_name);
 +
 +	ret = stat(full_path, &statinfo);
 +
 +	if (ret) {
 +
 +		if (errno == ENOENT) {
 +			/* readdir() returned a file that does not exist,
 +			it must have been deleted in the meantime. Do what
 +			would have happened if the file was deleted before
 +			readdir() - ignore and go to the next entry.
 +			If this is the last entry then info->name will still
 +			contain the name of the deleted file when this
 +			function returns, but this is not an issue since the
 +			caller shouldn't be looking at info when end of
 +			directory is returned. */
 +
 +			ut_free(full_path);
 +
 +			goto next_file;
 +		}
 +
 +		os_file_handle_error_no_exit(full_path, "stat", FALSE);
 +
 +		ut_free(full_path);
 +
 +		return(-1);
 +	}
 +
 +	info->size = (ib_int64_t) statinfo.st_size;
 +
 +	if (S_ISDIR(statinfo.st_mode)) {
 +		info->type = OS_FILE_TYPE_DIR;
 +	} else if (S_ISLNK(statinfo.st_mode)) {
 +		info->type = OS_FILE_TYPE_LINK;
 +	} else if (S_ISREG(statinfo.st_mode)) {
 +		info->type = OS_FILE_TYPE_FILE;
 +	} else {
 +		info->type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +
 +	ut_free(full_path);
 +
 +	return(0);
 +#endif
 +}
 +
 +/*****************************************************************//**
 +This function attempts to create a directory named pathname. The new
 +directory gets default permissions. On Unix the permissions are
 +(0770 & ~umask). If the directory exists already, nothing is done and
 +the call succeeds, unless the fail_if_exists argument is true.
 +If another error occurs, such as a permission error, this does not crash,
 +but reports the error and returns FALSE.
 +@return	TRUE if call succeeds, FALSE on error */
 +UNIV_INTERN
 +ibool
 +os_file_create_directory(
 +/*=====================*/
 +	const char*	pathname,	/*!< in: directory name as
 +					null-terminated string */
 +	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
 +					is treated as an error. */
 +{
 +#ifdef __WIN__
 +	BOOL	rcode;
 +
 +	rcode = CreateDirectory((LPCTSTR) pathname, NULL);
 +	if (!(rcode != 0
 +	      || (GetLastError() == ERROR_ALREADY_EXISTS
 +		  && !fail_if_exists))) {
 +
 +		os_file_handle_error_no_exit(
 +			pathname, "CreateDirectory", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	return(TRUE);
 +#else
 +	int	rcode;
 +	WAIT_ALLOW_WRITES();
 +
 +	rcode = mkdir(pathname, 0770);
 +
 +	if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
 +		/* failure */
 +		os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	return (TRUE);
 +#endif /* __WIN__ */
 +}
 +
 +/****************************************************************//**
 +NOTE! Use the corresponding macro os_file_create_simple(), not directly
 +this function!
 +A simple function to open or create a file.
 +@return own: handle to the file, not defined if error, error number
 +can be retrieved with os_file_get_last_error */
 +UNIV_INTERN
 +os_file_t
 +os_file_create_simple_func(
 +/*=======================*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	ulint		create_mode,/*!< in: create mode */
 +	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
 +				OS_FILE_READ_WRITE */
 +	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 +{
 +	os_file_t	file;
 +	ibool		retry;
 +
 +	*success = FALSE;
 +#ifdef __WIN__
 +	DWORD		access;
 +	DWORD		create_flag;
 +	DWORD		attributes	= 0;
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	if (create_mode == OS_FILE_OPEN) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (srv_read_only_mode) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		create_flag = CREATE_NEW;
 +
 +	} else if (create_mode == OS_FILE_CREATE_PATH) {
 +
 +		ut_a(!srv_read_only_mode);
 +
 +		/* Create subdirs along the path if needed  */
 +		*success = os_file_create_subdirs_if_needed(name);
 +
 +		if (!*success) {
 +
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"Unable to create subdirectories '%s'",
 +				name);
 +
 +			return((os_file_t) -1);
 +		}
 +
 +		create_flag = CREATE_NEW;
 +		create_mode = OS_FILE_CREATE;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		return((os_file_t) -1);
 +	}
 +
 +	if (access_type == OS_FILE_READ_ONLY) {
 +		access = GENERIC_READ;
 +	} else if (srv_read_only_mode) {
 +
 +		ib_logf(IB_LOG_LEVEL_INFO,
 +			"read only mode set. Unable to "
 +			"open file '%s' in RW mode, trying RO mode", name);
 +
 +		access = GENERIC_READ;
 +
 +	} else if (access_type == OS_FILE_READ_WRITE) {
 +		access = GENERIC_READ | GENERIC_WRITE;
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file access type (%lu) for file '%s'",
 +			access_type, name);
 +
 +		return((os_file_t) -1);
 +	}
 +
 +	do {
 +		/* Use default security attributes and no template file. */
 +
 +		file = CreateFile(
 +			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
 +			create_flag, attributes, NULL);
 +
 +		if (file == INVALID_HANDLE_VALUE) {
 +
 +			*success = FALSE;
 +
 +			retry = os_file_handle_error(
 +				name, create_mode == OS_FILE_OPEN ?
 +				"open" : "create");
 +
 +		} else {
 +			*success = TRUE;
 +			retry = false;
 +		}
 +
 +	} while (retry);
 +
 +#else /* __WIN__ */
 +	int		create_flag;
 +	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
 +		WAIT_ALLOW_WRITES();
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	if (create_mode == OS_FILE_OPEN) {
 +
 +		if (access_type == OS_FILE_READ_ONLY) {
 +			create_flag = O_RDONLY;
 +		} else if (srv_read_only_mode) {
 +			create_flag = O_RDONLY;
 +		} else {
 +			create_flag = O_RDWR;
 +		}
 +
 +	} else if (srv_read_only_mode) {
 +
 +		create_flag = O_RDONLY;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +
 +	} else if (create_mode == OS_FILE_CREATE_PATH) {
 +
 +		/* Create subdirs along the path if needed  */
 +
 +		*success = os_file_create_subdirs_if_needed(name);
 +
 +		if (!*success) {
 +
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"Unable to create subdirectories '%s'",
 +				name);
 +
 +			return((os_file_t) -1);
 +		}
 +
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +		create_mode = OS_FILE_CREATE;
 +	} else {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		return((os_file_t) -1);
 +	}
 +
 +	do {
- 		file = ::open(name, create_flag, os_innodb_umask);
++		file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
 +
 +		if (file == -1) {
 +			*success = FALSE;
 +
 +			retry = os_file_handle_error(
 +				name,
 +				create_mode == OS_FILE_OPEN
 +				?  "open" : "create");
 +		} else {
 +			*success = TRUE;
 +			retry = false;
 +		}
 +
 +	} while (retry);
 +
 +#ifdef USE_FILE_LOCK
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && access_type == OS_FILE_READ_WRITE
 +	    && os_file_lock(file, name)) {
 +
 +		*success = FALSE;
 +		close(file);
 +		file = -1;
 +	}
 +#endif /* USE_FILE_LOCK */
 +
 +#endif /* __WIN__ */
 +
 +	return(file);
 +}
 +
 +/** Disable OS I/O caching on the file if the file type and server
 +configuration requires it.
 +@param file handle to the file
 +@param name name of the file, for diagnostics
 +@param mode_str operation on the file, for diagnostics
 +@param type OS_LOG_FILE or OS_DATA_FILE
 +@param access_type if OS_FILE_READ_WRITE_CACHED, then caching will be disabled
 +unconditionally, ignored otherwise */
 +static
 +void
 +os_file_set_nocache_if_needed(os_file_t file, const char* name,
 +			      const char *mode_str, ulint type,
 +			      ulint access_type)
 +{
 +	if (srv_read_only_mode || access_type == OS_FILE_READ_WRITE_CACHED)
 +		return;
 +
 +	if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT
 +	    || (type != OS_LOG_FILE
 +		&& (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
 +		    || (srv_unix_file_flush_method
 +			== SRV_UNIX_O_DIRECT_NO_FSYNC))))
 +		/* Do fsync() on log files when setting O_DIRECT fails.
 +		See log_io_complete() */
 +		if (!os_file_set_nocache(file, name, mode_str)
 +		    && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)
 +			srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
 +}
 +
 +/****************************************************************//**
 +NOTE! Use the corresponding macro
 +os_file_create_simple_no_error_handling(), not directly this function!
 +A simple function to open or create a file.
 +@return own: handle to the file, not defined if error, error number
 +can be retrieved with os_file_get_last_error */
 +UNIV_INTERN
 +pfs_os_file_t
 +os_file_create_simple_no_error_handling_func(
 +/*=========================================*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	ulint		create_mode,/*!< in: create mode */
 +	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
 +				OS_FILE_READ_WRITE,
 +				OS_FILE_READ_ALLOW_DELETE (used by a backup
 +				program reading the file), or
 +				OS_FILE_READ_WRITE_CACHED (disable O_DIRECT
 +				if it would be enabled otherwise) */
 +	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 +{
 +	pfs_os_file_t	file;
 +
 +	*success = FALSE;
 +#ifdef __WIN__
 +	DWORD		access;
 +	DWORD		create_flag;
 +	DWORD		attributes	= 0;
 +	DWORD		share_mode	= FILE_SHARE_READ;
 +	ut_a(name);
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	if (create_mode == OS_FILE_OPEN) {
 +		create_flag = OPEN_EXISTING;
 +	} else if (srv_read_only_mode) {
 +		create_flag = OPEN_EXISTING;
 +	} else if (create_mode == OS_FILE_CREATE) {
 +		create_flag = CREATE_NEW;
 +	} else {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +	if (access_type == OS_FILE_READ_ONLY) {
 +		access = GENERIC_READ;
 +	} else if (srv_read_only_mode) {
 +		access = GENERIC_READ;
 +	} else if (access_type == OS_FILE_READ_WRITE
 +		   || access_type == OS_FILE_READ_WRITE_CACHED) {
 +		access = GENERIC_READ | GENERIC_WRITE;
 +	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
 +
 +		ut_a(!srv_read_only_mode);
 +
 +		access = GENERIC_READ;
 +
 +		/*!< A backup program has to give mysqld the maximum
 +		freedom to do what it likes with the file */
 +
 +		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file access type (%lu) for file '%s'",
 +			access_type, name);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +	file.m_file = CreateFile((LPCTSTR) name,
 +			  access,
 +			  share_mode,
 +			  NULL,			// Security attributes
 +			  create_flag,
 +			  attributes,
 +			  NULL);		// No template file
 +
 +	*success = (file.m_file != INVALID_HANDLE_VALUE);
 +#else /* __WIN__ */
 +	int		create_flag;
 +	const char*	mode_str	= NULL;
 +	ut_a(name);
 +	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
 +		WAIT_ALLOW_WRITES();
 +
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
 +	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
 +
 +	if (create_mode == OS_FILE_OPEN) {
 +
 +		mode_str = "OPEN";
 +
 +		if (access_type == OS_FILE_READ_ONLY) {
 +
 +			create_flag = O_RDONLY;
 +
 +		} else if (srv_read_only_mode) {
 +
 +			create_flag = O_RDONLY;
 +
 +		} else {
 +
 +			ut_a(access_type == OS_FILE_READ_WRITE
 +			     || access_type == OS_FILE_READ_ALLOW_DELETE
 +			     || access_type == OS_FILE_READ_WRITE_CACHED);
 +
 +			create_flag = O_RDWR;
 +		}
 +
 +	} else if (srv_read_only_mode) {
 +
 +		mode_str = "OPEN";
 +
 +		create_flag = O_RDONLY;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		mode_str = "CREATE";
 +
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +		file.m_file = -1;
 +		return(file);
 +	}
 +
- 	file.m_file = ::open(name, create_flag, os_innodb_umask);
++	file.m_file = ::open(name, create_flag | O_CLOEXEC , os_innodb_umask);
 +
 +	*success = file.m_file == -1 ? FALSE : TRUE;
 +
 +	/* This function is always called for data files, we should disable
 +	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
 +	we open the same file in the same mode, see man page of open(2). */
 +	if (*success) {
 +		os_file_set_nocache_if_needed(file.m_file, name, mode_str,
 +					      OS_DATA_FILE, access_type);
 +	}
 +
 +#ifdef USE_FILE_LOCK
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && (access_type == OS_FILE_READ_WRITE
 +		|| access_type == OS_FILE_READ_WRITE_CACHED)
 +	    && os_file_lock(file.m_file, name)) {
 +
 +		*success = FALSE;
 +		close(file.m_file);
 +		file.m_file = -1;
 +
 +	}
 +#endif /* USE_FILE_LOCK */
 +
 +#endif /* __WIN__ */
 +
 +	return(file);
 +}
 +
 +/****************************************************************//**
 +Tries to disable OS caching on an opened file descriptor.
 +With UNIV_SOLARIS and DIRECTIO_ON defined this calls directio(); otherwise,
 +when O_DIRECT is defined, it calls fcntl(F_SETFL, O_DIRECT). When neither is
 +available nothing is done and true is returned. A failure is only logged
 +("continuing anyway") and is never treated as fatal here.
 +In the O_DIRECT case an EINVAL failure is reported at most once (guarded by
 +a function-local static flag); other errno values are reported on every
 +failure.
 +NOTE(review): F_SETFL is given O_DIRECT alone rather than the current status
 +flags OR-ed with O_DIRECT; confirm that this is intended.
 +@return TRUE if operation is success and FALSE otherwise */
 +UNIV_INTERN
 +bool
 +os_file_set_nocache(
 +/*================*/
 +	os_file_t fd		/*!< in: file descriptor to alter */
 +					MY_ATTRIBUTE((unused)),
 +	const char*	file_name	/*!< in: used in the diagnostic
 +					message */
 +					MY_ATTRIBUTE((unused)),
 +	const char*	operation_name MY_ATTRIBUTE((unused)))
 +					/*!< in: "open" or "create"; used
 +					in the diagnostic message */
 +{
 +	/* some versions of Solaris may not have DIRECTIO_ON */
 +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
 +	if (directio(fd, DIRECTIO_ON) == -1) {
 +		int	errno_save = errno;
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
 +			"continuing anyway.",
 +			file_name, operation_name, strerror(errno_save));
 +		return false;
 +	}
 +#elif defined(O_DIRECT)
 +	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
 +		int		errno_save = errno;
 +		static bool	warning_message_printed = false;
 +		if (errno_save == EINVAL) {
 +			if (!warning_message_printed) {
 +				warning_message_printed = true;
 +# ifdef UNIV_LINUX
 +				ib_logf(IB_LOG_LEVEL_WARN,
 +					"Failed to set O_DIRECT on file "
 +					"%s: %s: %s, continuing anyway. "
 +					"O_DIRECT is known to result "
 +					"in 'Invalid argument' on Linux on "
 +					"tmpfs, see MySQL Bug#26662.",
 +					file_name, operation_name,
 +					strerror(errno_save));
 +# else /* UNIV_LINUX */
 +				goto short_warning;
 +# endif /* UNIV_LINUX */
 +			}
 +		} else {
 +# ifndef UNIV_LINUX
 +short_warning:
 +# endif
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Failed to set O_DIRECT on file %s: %s: %s, "
 +				"continuing anyway.",
 +				file_name, operation_name, strerror(errno_save));
 +		}
 +		return false;
 +	}
 +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
 +	return true;
 +}
 +
 +
 +/****************************************************************//**
 +Attempts to switch on atomic writes for an already opened file. When
 +DFS_IOCTL_ATOMIC_WRITE_SET is not defined at compile time the request can
 +never be honoured: an error is logged and FALSE is returned.
 +@return TRUE if the ioctl succeeded, FALSE otherwise */
 +static MY_ATTRIBUTE((warn_unused_result))
 +ibool
 +os_file_set_atomic_writes(
 +/*======================*/
 +	const char*	name	/*!< in: name of the file */
 +	MY_ATTRIBUTE((unused)),
 +	os_file_t	file	/*!< in: handle to the file */
 +	MY_ATTRIBUTE((unused)))
 +
 +{
 +#ifdef DFS_IOCTL_ATOMIC_WRITE_SET
 +	int		enable = 1;
 +	const int	rc = ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &enable);
 +
 +	if (rc == 0) {
 +		return(TRUE);
 +	}
 +
 +	/* Any non-zero result from the ioctl is reported as a failure */
 +	os_file_handle_error_no_exit(name, "ioctl", FALSE);
 +
 +	return(FALSE);
 +#else
 +	ib_logf(IB_LOG_LEVEL_ERROR,
 +		"trying to enable atomic writes on non-supported platform! "
 +		"Please restart with innodb_use_atomic_writes disabled.");
 +
 +	return(FALSE);
 +#endif
 +}
 +
 +/****************************************************************//**
 +NOTE! Use the corresponding macro os_file_create(), not directly
 +this function!
 +Opens an existing file or creates a new.
 +The OS_FILE_ON_ERROR_NO_EXIT and OS_FILE_ON_ERROR_SILENT bits are stripped
 +from create_mode before it is interpreted; they select whether a failed
 +open is reported through os_file_handle_error_no_exit() or
 +os_file_handle_error(). The open is repeated for as long as that handler
 +asks for a retry.
 +When srv_read_only_mode is set, a create_mode that is not one of the open
 +modes is turned into opening the existing file.
 +On non-Windows builds the descriptor is opened with O_CLOEXEC and, with
 +USE_FILE_LOCK, locked through os_file_lock() unless create_mode is
 +OS_FILE_OPEN_RAW; for OS_FILE_OPEN_RETRY a failed lock is retried up to 100
 +times with os_thread_sleep(1000000) between attempts.
 +If srv_use_atomic_writes is set and type is OS_DATA_FILE, the handle is
 +closed and the call fails when os_file_set_atomic_writes() fails.
 +NOTE(review): the "Unknown file create mode" and "Unknown purpose flag"
 +early returns do not write *success; confirm that callers initialise it or
 +only inspect the returned handle on those paths.
 +@return own: handle to the file, not defined if error, error number
 +can be retrieved with os_file_get_last_error */
 +UNIV_INTERN
 +pfs_os_file_t
 +os_file_create_func(
 +/*================*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	ulint		create_mode,/*!< in: create mode */
 +	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
 +				non-buffered i/o is desired,
 +				OS_FILE_NORMAL, if any normal file;
 +				NOTE that it also depends on type, os_aio_..
 +				and srv_.. variables whether we really use
 +				async i/o or unbuffered i/o: look in the
 +				function source code for the exact rules */
 +	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
 +	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
 +{
 +	pfs_os_file_t	file;
 +	ibool		retry;
 +	ibool		on_error_no_exit;
 +	ibool		on_error_silent;
 +#ifdef __WIN__
 +	DBUG_EXECUTE_IF(
 +		"ib_create_table_fail_disk_full",
 +		*success = FALSE;
 +		SetLastError(ERROR_DISK_FULL);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	);
 +#else /* __WIN__ */
 +	DBUG_EXECUTE_IF(
 +		"ib_create_table_fail_disk_full",
 +		*success = FALSE;
 +		errno = ENOSPC;
 +		file.m_file = -1;
 +		return(file);
 +	);
 +#endif /* __WIN__ */
 +
 +#ifdef __WIN__
 +	DWORD		create_flag;
 +	DWORD		share_mode	= FILE_SHARE_READ;
 +
 +	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
 +		? TRUE : FALSE;
 +
 +	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
 +		? TRUE : FALSE;
 +
 +	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
 +	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
 +
 +	if (create_mode == OS_FILE_OPEN_RAW) {
 +
 +		ut_a(!srv_read_only_mode);
 +
 +		create_flag = OPEN_EXISTING;
 +
 +		/* On Windows Physical devices require admin privileges and
 +		have to have the write-share mode set. See the remarks
 +		section for the CreateFile() function documentation in MSDN. */
 +
 +		share_mode |= FILE_SHARE_WRITE;
 +
 +	} else if (create_mode == OS_FILE_OPEN
 +		   || create_mode == OS_FILE_OPEN_RETRY) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (srv_read_only_mode) {
 +
 +		create_flag = OPEN_EXISTING;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		create_flag = CREATE_NEW;
 +
 +	} else if (create_mode == OS_FILE_OVERWRITE) {
 +
 +		create_flag = CREATE_ALWAYS;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +	DWORD		attributes = 0;
 +
 +#ifdef UNIV_HOTBACKUP
 +	attributes |= FILE_FLAG_NO_BUFFERING;
 +#else
 +	if (purpose == OS_FILE_AIO) {
 +
 +#ifdef WIN_ASYNC_IO
 +		/* If specified, use asynchronous (overlapped) io and no
 +		buffering of writes in the OS */
 +
 +		if (srv_use_native_aio) {
 +			attributes |= FILE_FLAG_OVERLAPPED;
 +		}
 +#endif /* WIN_ASYNC_IO */
 +
 +	} else if (purpose == OS_FILE_NORMAL) {
 +		/* Use default setting. */
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown purpose flag (%lu) while opening file '%s'",
 +			purpose, name);
 +		file.m_file = (os_file_t)-1;
 +		return(file);
 +	}
 +
 +#ifdef UNIV_NON_BUFFERED_IO
 +	// TODO: Create a bug, this looks wrong. The flush log
 +	// parameter is dynamic.
 +	if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) {
 +
 +		/* Do not use unbuffered i/o for the log files because
 +		value 2 denotes that we do not flush the log at every
 +		commit, but only once per second */
 +
 +	} else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
 +
 +		attributes |= FILE_FLAG_NO_BUFFERING;
 +	}
 +#endif /* UNIV_NON_BUFFERED_IO */
 +
 +#endif /* UNIV_HOTBACKUP */
 +	DWORD	access = GENERIC_READ;
 +
 +	if (!srv_read_only_mode) {
 +		access |= GENERIC_WRITE;
 +	}
 +
 +	if (type == OS_LOG_FILE) {
 +		if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
 +			/* Map O_DSYNC to WRITE_THROUGH */
 +			attributes |= FILE_FLAG_WRITE_THROUGH;
 +		} else if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
 +			/* Open log file without buffering */
 +			attributes |= FILE_FLAG_NO_BUFFERING;
 +		}
 +	}
 +
 +	do {
 +		/* Use default security attributes and no template file. */
 +		file.m_file = CreateFile(
 +			(LPCTSTR) name, access, share_mode, NULL,
 +			create_flag, attributes, NULL);
 +
 +		if (file.m_file == INVALID_HANDLE_VALUE) {
 +			const char*	operation;
 +
 +			operation = (create_mode == OS_FILE_CREATE
 +				     && !srv_read_only_mode)
 +				? "create" : "open";
 +
 +			*success = FALSE;
 +
 +			if (on_error_no_exit) {
 +				retry = os_file_handle_error_no_exit(
 +					name, operation, on_error_silent);
 +			} else {
 +				retry = os_file_handle_error(name, operation);
 +			}
 +		} else {
 +			*success = TRUE;
 +			retry = FALSE;
 +			if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) {
 +				ut_a(CreateIoCompletionPort(file.m_file, completion_port, 0, 0));
 +			}
 +		}
 +
 +	} while (retry);
 +
 +	if (srv_use_atomic_writes && type == OS_DATA_FILE &&
 +		!os_file_set_atomic_writes(name, file.m_file)) {
 +			 CloseHandle(file.m_file);
 +			*success = FALSE;
 +			file.m_file = INVALID_HANDLE_VALUE;
 +	}
 +
 +#else /* __WIN__ */
 +	int		create_flag;
 +	const char*	mode_str	= NULL;
 +	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
 +		WAIT_ALLOW_WRITES();
 +
 +	on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
 +		? TRUE : FALSE;
 +	on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
 +		? TRUE : FALSE;
 +
 +	create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
 +	create_mode &= ~OS_FILE_ON_ERROR_SILENT;
 +
 +	if (create_mode == OS_FILE_OPEN
 +	    || create_mode == OS_FILE_OPEN_RAW
 +	    || create_mode == OS_FILE_OPEN_RETRY) {
 +
 +		mode_str = "OPEN";
 +
 +		create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
 +
 +	} else if (srv_read_only_mode) {
 +
 +		mode_str = "OPEN";
 +
 +		create_flag = O_RDONLY;
 +
 +	} else if (create_mode == OS_FILE_CREATE) {
 +
 +		mode_str = "CREATE";
 +		create_flag = O_RDWR | O_CREAT | O_EXCL;
 +
 +	} else if (create_mode == OS_FILE_OVERWRITE) {
 +
 +		mode_str = "OVERWRITE";
 +		create_flag = O_RDWR | O_CREAT | O_TRUNC;
 +
 +	} else {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Unknown file create mode (%lu) for file '%s'",
 +			create_mode, name);
 +
 +		file.m_file = -1;
 +		return(file);
 +	}
 +
 +	ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
 +	ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
 +
 +#ifdef O_SYNC
 +	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
 +	O_SYNC because the datasync options seemed to corrupt files in 2001
 +	in both Linux and Solaris */
 +
 +	if (!srv_read_only_mode
 +	    && type == OS_LOG_FILE
 +	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
 +
 +		create_flag |= O_SYNC;
 +	}
 +#endif /* O_SYNC */
 +
 +	do {
- 		file.m_file = ::open(name, create_flag, os_innodb_umask);
++		file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);
 +
 +		if (file.m_file == -1) {
 +			const char*	operation;
 +
 +			operation = (create_mode == OS_FILE_CREATE
 +				     && !srv_read_only_mode)
 +				? "create" : "open";
 +
 +			*success = FALSE;
 +
 +			if (on_error_no_exit) {
 +				retry = os_file_handle_error_no_exit(
 +					name, operation, on_error_silent);
 +			} else {
 +				retry = os_file_handle_error(name, operation);
 +			}
 +		} else {
 +			*success = TRUE;
 +			retry = false;
 +		}
 +
 +	} while (retry);
 +
 +	if (*success) {
 +
 +		os_file_set_nocache_if_needed(file.m_file, name, mode_str,
 +			type, 0);
 +	}
 +
 +#ifdef USE_FILE_LOCK
 +	if (!srv_read_only_mode
 +	    && *success
 +	    && create_mode != OS_FILE_OPEN_RAW
 +	    && os_file_lock(file.m_file, name)) {
 +
 +		if (create_mode == OS_FILE_OPEN_RETRY) {
 +
 +			ut_a(!srv_read_only_mode);
 +
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"Retrying to lock the first data file");
 +
 +			for (int i = 0; i < 100; i++) {
 +				os_thread_sleep(1000000);
 +
 +				if (!os_file_lock(file.m_file, name)) {
 +					*success = TRUE;
 +					return(file);
 +				}
 +			}
 +
 +			ib_logf(IB_LOG_LEVEL_INFO,
 +				"Unable to open the first data file");
 +		}
 +
 +		*success = FALSE;
 +		close(file.m_file);
 +		file.m_file = -1;
 +	}
 +#endif /* USE_FILE_LOCK */
 +
 +	if (srv_use_atomic_writes && type == OS_DATA_FILE
 +	    && file.m_file != -1
 +	    && !os_file_set_atomic_writes(name, file.m_file)) {
 +
 +		*success = FALSE;
 +		close(file.m_file);
 +		file.m_file = -1;
 +	}
 +
 +#endif /* __WIN__ */
 +
 +	return(file);
 +}
 +
 +/***********************************************************************//**
 +Removes a file, treating "no such file" as success. The file must already
 +be closed. On Windows the deletion is retried every 0.5 seconds, for more
 +than 2000 attempts, before giving up.
 +@return	TRUE if the file is gone, FALSE if it could not be deleted */
 +UNIV_INTERN
 +bool
 +os_file_delete_if_exists_func(
 +/*==========================*/
 +	const char*	name)	/*!< in: file path as a null-terminated
 +				string */
 +{
 +#ifdef __WIN__
 +	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
 +	it, hence the retry loop */
 +	for (ulint attempts = 1; ; attempts++) {
 +
 +		if (DeleteFile((LPCTSTR) name)) {
 +			return(true);
 +		}
 +
 +		const DWORD	last_error = GetLastError();
 +
 +		if (last_error == ERROR_FILE_NOT_FOUND
 +		    || last_error == ERROR_PATH_NOT_FOUND) {
 +			/* the file does not exist, this not an error */
 +			return(true);
 +		}
 +
 +		if (attempts > 100 && attempts % 10 == 0) {
 +			os_file_get_last_error(true); /* print error information */
 +
 +			ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
 +		}
 +
 +		os_thread_sleep(500000);	/* sleep for 0.5 second */
 +
 +		if (attempts > 2000) {
 +			return(false);
 +		}
 +	}
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	if (unlink(name) == 0 || errno == ENOENT) {
 +		/* Either deleted, or there was nothing to delete */
 +		return(true);
 +	}
 +
 +	os_file_handle_error_no_exit(name, "delete", FALSE);
 +
 +	return(false);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Removes a file; unlike os_file_delete_if_exists_func() a missing file is
 +reported as failure. The file must already be closed. On Windows the
 +deletion is retried once per second, for more than 2000 attempts, unless
 +the file is found to be missing.
 +@return	TRUE if success */
 +UNIV_INTERN
 +bool
 +os_file_delete_func(
 +/*================*/
 +	const char*	name)	/*!< in: file path as a null-terminated
 +				string */
 +{
 +#ifdef __WIN__
 +	/* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
 +	it, hence the retry loop */
 +	for (ulint attempts = 1; ; attempts++) {
 +
 +		if (DeleteFile((LPCTSTR) name)) {
 +			return(true);
 +		}
 +
 +		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
 +			/* If the file does not exist, we classify this as
 +			a 'mild' error and return */
 +			return(false);
 +		}
 +
 +		if (attempts > 100 && attempts % 10 == 0) {
 +			os_file_get_last_error(true); /* print error information */
 +
 +			fprintf(stderr,
 +				"InnoDB: Warning: cannot delete file %s\n"
 +				"InnoDB: Are you running mysqlbackup"
 +				" to back up the file?\n", name);
 +		}
 +
 +		os_thread_sleep(1000000);	/* sleep for a second */
 +
 +		if (attempts > 2000) {
 +			return(false);
 +		}
 +	}
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	if (unlink(name) == 0) {
 +		return(true);
 +	}
 +
 +	os_file_handle_error_no_exit(name, "delete", FALSE);
 +
 +	return(false);
 +#endif
 +}
 +
 +/***********************************************************************//**
 +NOTE! Use the corresponding macro os_file_rename(), not directly this function!
 +Gives a file a new path, which may be in a different directory. Closing the
 +file before the call is the safest choice. Debug builds additionally assert
 +that the new path is free and the old path exists.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_rename_func(
 +/*================*/
 +	const char*	oldpath,/*!< in: old file path as a null-terminated
 +				string */
 +	const char*	newpath)/*!< in: new file path */
 +{
 +#ifdef UNIV_DEBUG
 +	os_file_type_t	path_type;
 +	ibool		found;
 +
 +	/* The destination must be free ... */
 +	ut_ad(os_file_status(newpath, &found, &path_type));
 +	ut_ad(!found);
 +
 +	/* ... and the source must be there. */
 +	ut_ad(os_file_status(oldpath, &found, &path_type));
 +	ut_ad(found);
 +#endif /* UNIV_DEBUG */
 +
 +#ifdef __WIN__
 +	const BOOL	moved = MoveFileEx((LPCTSTR)oldpath, (LPCTSTR)newpath,
 +					   MOVEFILE_REPLACE_EXISTING);
 +
 +	if (!moved) {
 +		os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	return(TRUE);
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	if (rename(oldpath, newpath) == 0) {
 +		return(TRUE);
 +	}
 +
 +	os_file_handle_error_no_exit(oldpath, "rename", FALSE);
 +
 +	return(FALSE);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +NOTE! Use the corresponding macro os_file_close(), not directly this function!
 +Releases a file handle. A failure is passed to os_file_handle_error(); the
 +error number stays available through os_file_get_last_error.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_close_func(
 +/*===============*/
 +	os_file_t	file)	/*!< in, own: handle to a file */
 +{
 +#ifdef __WIN__
 +	if (!CloseHandle(file)) {
 +		os_file_handle_error(NULL, "close");
 +
 +		return(FALSE);
 +	}
 +
 +	return(TRUE);
 +#else
 +	const int	rc = close(file);
 +
 +	if (rc != -1) {
 +		return(TRUE);
 +	}
 +
 +	os_file_handle_error(NULL, "close");
 +
 +	return(FALSE);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Releases a file handle without reporting a failure to the error handler;
 +the outcome is only visible in the return value.
 +@return	TRUE if success */
 +UNIV_INTERN
 +bool
 +os_file_close_no_error_handling_func(
 +/*============================*/
 +	os_file_t	file)	/*!< in, own: handle to a file */
 +{
 +#ifdef __WIN__
 +	return(CloseHandle(file) ? true : false);
 +#else
 +	const int	rc = close(file);
 +
 +	return(rc != -1);
 +#endif /* __WIN__ */
 +}
 +
 +#ifdef HAVE_POSIX_FALLOCATE
 +/***********************************************************************//**
 +Reserves disk space for a region of the file through posix_fallocate().
 +@return TRUE if posix_fallocate() reported no error */
 +UNIV_INTERN
 +bool
 +os_file_allocate_func(
 +	os_file_t	file,	/*!< in, own: handle to a file */
 +	os_offset_t	offset,	/*!< in: file region offset  */
 +	os_offset_t	len)	/*!< in: file region length  */
 +{
 +	const int	err = posix_fallocate(file, offset, len);
 +
 +	return(err == 0);
 +}
 +#endif
 +
 +/***********************************************************************//**
 +Tells whether the handle carries the os_file_invalid marker value.
 +@return TRUE if invalid */
 +UNIV_INTERN
 +bool
 +os_file_is_invalid(
 +	pfs_os_file_t	file)	/*!< in, own: handle to a file */
 +{
 +	const bool	invalid = (file.m_file == os_file_invalid);
 +
 +	return(invalid);
 +}
 +
 +/***********************************************************************//**
 +Stores the os_file_invalid marker value in the given handle. */
 +UNIV_INTERN
 +void
 +os_file_mark_invalid(
 +	pfs_os_file_t*	file)	/*!< out: pointer to a handle to a file */
 +{
 +	pfs_os_file_t&	handle = *file;
 +
 +	handle.m_file = os_file_invalid;
 +}
 +
 +/***********************************************************************//**
 +Tells the OS how a region of the file is going to be accessed. Only
 +non-Windows UNIV_LINUX builds pass the hint on, through posix_fadvise();
 +every other build does nothing and reports success.
 +NOTE(review): the POSIX_FADV_* constants selected by the OS_FILE_ADVISE_*
 +bits are combined with bitwise OR; confirm that callers only ever pass a
 +single OS_FILE_ADVISE_* bit.
 +@return	TRUE if success */
 +UNIV_INTERN
 +bool
 +os_file_advise(
 +	pfs_os_file_t	file,	/*!< in, own: handle to a file */
 +	os_offset_t	offset,	/*!< in: file region offset  */
 +	os_offset_t	len,	/*!< in: file region length  */
 +	ulint		advice)/*!< in: advice for access pattern */
 +{
 +#if !defined(__WIN__) && defined(UNIV_LINUX)
 +	int	fadvise_arg = 0;
 +
 +	fadvise_arg |= (advice & OS_FILE_ADVISE_NORMAL) != 0
 +		? POSIX_FADV_NORMAL : 0;
 +	fadvise_arg |= (advice & OS_FILE_ADVISE_RANDOM) != 0
 +		? POSIX_FADV_RANDOM : 0;
 +	fadvise_arg |= (advice & OS_FILE_ADVISE_SEQUENTIAL) != 0
 +		? POSIX_FADV_SEQUENTIAL : 0;
 +	fadvise_arg |= (advice & OS_FILE_ADVISE_WILLNEED) != 0
 +		? POSIX_FADV_WILLNEED : 0;
 +	fadvise_arg |= (advice & OS_FILE_ADVISE_DONTNEED) != 0
 +		? POSIX_FADV_DONTNEED : 0;
 +	fadvise_arg |= (advice & OS_FILE_ADVISE_NOREUSE) != 0
 +		? POSIX_FADV_NOREUSE : 0;
 +
 +	const int	err = posix_fadvise(file.m_file, offset, len,
 +					    fadvise_arg);
 +
 +	return(err == 0);
 +#else
 +	/* No advice mechanism is used on this platform */
 +	return(true);
 +#endif /* !__WIN__ && UNIV_LINUX */
 +}
 +
 +/***********************************************************************//**
 +Reports the size of an open file. On non-Windows builds this is done by
 +seeking to the end of the file with lseek().
 +@return	file size, or (os_offset_t) -1 on failure */
 +UNIV_INTERN
 +os_offset_t
 +os_file_get_size(
 +/*=============*/
 +	pfs_os_file_t	file)	/*!< in: handle to a file */
 +{
 +#ifdef __WIN__
 +	DWORD		size_high;
 +	const DWORD	size_low = GetFileSize(file.m_file, &size_high);
 +
 +	/* 0xFFFFFFFF is only an error when GetLastError() says so */
 +	if (size_low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
 +		return((os_offset_t) -1);
 +	}
 +
 +	return(((os_offset_t) size_high << 32) | (os_offset_t) size_low);
 +#else
 +	const off_t	end_pos = lseek(file.m_file, 0, SEEK_END);
 +
 +	return((os_offset_t) end_pos);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Write the specified number of zeros to a newly created file.
 +With HAVE_POSIX_FALLOCATE and srv_use_posix_fallocate set, the space is
 +reserved through posix_fallocate() instead and nothing is written.
 +Otherwise zero-filled buffers are written through os_file_write() and the
 +file is flushed with os_file_flush(): on _WIN32 a single page at the desired
 +end, elsewhere up to 64 pages at a time starting from offset 0.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_set_size(
 +/*=============*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	pfs_os_file_t	file,	/*!< in: handle to a file */
 +	os_offset_t	size)	/*!< in: file size */
 +{
 +	ibool		ret;
 +	byte*		buf;
 +	byte*		buf2;
 +	ulint		buf_size;
 +
 +#ifdef HAVE_POSIX_FALLOCATE
 +	if (srv_use_posix_fallocate) {
 +		int err;
 +		do {
 +			err = posix_fallocate(file.m_file, 0, size);
 +		} while (err == EINTR
 +			 && srv_shutdown_state == SRV_SHUTDOWN_NONE);
 +
 +		if (err) {
 +			/* The two literals are concatenated, so the first
 +			one must end with a space */
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"preallocating " INT64PF " bytes for "
 +				"file %s failed with error %d",
 +				size, name, err);
 +		}
 +		return(!err);
 +	}
 +#endif
 +
 +#ifdef _WIN32
 +	/* Write 1 page of zeroes at the desired end. */
 +	buf_size = UNIV_PAGE_SIZE;
 +	/* Do not let the start offset wrap around when the requested size
 +	is smaller than one page */
 +	os_offset_t	current_size = size > (os_offset_t) buf_size
 +		? size - buf_size : 0;
 +#else
 +	/* Write up to 1 megabyte at a time. */
 +	buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
 +		* UNIV_PAGE_SIZE;
 +
 +	if (buf_size == 0) {
 +		/* size is smaller than one page: without this the loop
 +		below would issue zero-length writes and never advance */
 +		buf_size = UNIV_PAGE_SIZE;
 +	}
 +
 +	os_offset_t	current_size = 0;
 +#endif
 +	/* One extra page so that the buffer can be aligned below */
 +	buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE));
 +
 +	if (!buf2) {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Cannot allocate " ULINTPF " bytes to extend file\n",
 +			buf_size + UNIV_PAGE_SIZE);
 +		return(FALSE);
 +	}
 +
 +	/* Align the buffer for possible raw i/o */
 +	buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 +
 +	do {
 +		ulint	n_bytes;
 +
 +		/* The last chunk may be shorter than the buffer */
 +		if (size - current_size < (os_offset_t) buf_size) {
 +			n_bytes = (ulint) (size - current_size);
 +		} else {
 +			n_bytes = buf_size;
 +		}
 +
 +		ret = os_file_write(name, file, buf, current_size, n_bytes);
 +		if (!ret) {
 +			break;
 +		}
 +
 +		current_size += n_bytes;
 +	} while (current_size < size);
 +
 +	free(buf2);
 +
 +	return(ret && os_file_flush(file));
 +}
 +
 +/***********************************************************************//**
 +Cuts a stdio stream's file off at the stream's current position.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_set_eof(
 +/*============*/
 +	FILE*		file)	/*!< in: file to be truncated */
 +{
 +#ifdef __WIN__
 +	return(SetEndOfFile((HANDLE) _get_osfhandle(fileno(file))));
 +#else /* __WIN__ */
 +	WAIT_ALLOW_WRITES();
 +
 +	const int	rc = ftruncate(fileno(file), ftell(file));
 +
 +	return(rc == 0);
 +#endif /* __WIN__ */
 +}
 +
 +/***********************************************************************//**
 +Sets the length of an open file to new_len.
 +@return TRUE if success */
 +UNIV_INTERN
 +bool
 +os_file_set_eof_at_func(
 +	os_file_t	file, /*!< in: handle to a file */
 +	ib_uint64_t	new_len)/*!< in: new file length */
 +{
 +#ifdef __WIN__
 +	LARGE_INTEGER	target;
 +	LARGE_INTEGER	reached;
 +
 +	target.QuadPart = new_len;
 +
 +	/* Move the file pointer first; the end of file is set there */
 +	if (!SetFilePointerEx(file, target, &reached, FILE_BEGIN)) {
 +		return(false);
 +	}
 +
 +	return(SetEndOfFile(file) != 0);
 +#else
 +	WAIT_ALLOW_WRITES();
 +
 +	/* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */
 +	const int	rc = ftruncate(file, new_len);
 +
 +	return(rc == 0);
 +#endif
 +}
 +
 +
 +#ifndef __WIN__
 +/***********************************************************************//**
 +Calls fsync(2) and repeats the call when it fails with ENOLCK (after a 0.2
 +second sleep, logging every 100th such failure) or with EINTR (at once).
 +Every attempt increments os_n_fsyncs. On any other failure errno is left
 +as set by fsync().
 +@return	0 if success, -1 otherwise */
 +
 +static
 +int
 +os_file_fsync(
 +/*==========*/
 +	os_file_t	file)	/*!< in: handle to a file */
 +{
 +	int	ret;
 +	int	enolck_count = 0;
 +
 +	for (;;) {
 +		ret = fsync(file);
 +
 +		os_n_fsyncs++;
 +
 +		if (ret != -1) {
 +			break;
 +		}
 +
 +		if (errno == ENOLCK) {
 +
 +			if (enolck_count % 100 == 0) {
 +
 +				ut_print_timestamp(stderr);
 +				fprintf(stderr,
 +					" InnoDB: fsync(): "
 +					"No locks available; retrying\n");
 +			}
 +
 +			os_thread_sleep(200000 /* 0.2 sec */);
 +
 +			enolck_count++;
 +
 +			continue;
 +		}
 +
 +		if (errno == EINTR) {
 +			/* Interrupted by a signal: simply try again */
 +			continue;
 +		}
 +
 +		break;
 +	}
 +
 +	return(ret);
 +}
 +#endif /* !__WIN__ */
 +
 +/***********************************************************************//**
 +NOTE! Use the corresponding macro os_file_flush(), not directly this function!
 +Flushes the write buffers of a given file to the disk.
 +Windows uses FlushFileBuffers(); other platforms use os_file_fsync(), except
 +that with HAVE_DARWIN_THREADS and srv_have_fullfsync set,
 +fcntl(F_FULLFSYNC) is tried first and os_file_fsync() is the fallback.
 +When srv_start_raw_disk_in_use is set, ERROR_INVALID_FUNCTION (Windows) or
 +EINVAL (elsewhere) from the flush is treated as success.
 +Any other failure is reported through os_file_handle_error() and then hits
 +ut_error, so the trailing return(FALSE) is presumably never reached.
 +@return	TRUE if success */
 +UNIV_INTERN
 +ibool
 +os_file_flush_func(
 +/*===============*/
 +	os_file_t	file)	/*!< in, own: handle to a file */
 +{
 +#ifdef __WIN__
 +	BOOL	ret;
 +
 +	os_n_fsyncs++;
 +
 +	ret = FlushFileBuffers(file);
 +
 +	if (ret) {
 +		return(TRUE);
 +	}
 +
 +	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
 +	actually a raw device, we choose to ignore that error if we are using
 +	raw disks */
 +
 +	if (srv_start_raw_disk_in_use && GetLastError()
 +	    == ERROR_INVALID_FUNCTION) {
 +		return(TRUE);
 +	}
 +
 +	os_file_handle_error(NULL, "flush");
 +
 +	/* It is a fatal error if a file flush does not succeed, because then
 +	the database can get corrupt on disk */
 +	ut_error;
 +
 +	return(FALSE);
 +#else
 +	int	ret;
 +	WAIT_ALLOW_WRITES();
 +
 +#if defined(HAVE_DARWIN_THREADS)
 +# ifndef F_FULLFSYNC
 +	/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
 +#  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
 +# elif F_FULLFSYNC != 51
 +#  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
 +# endif
 +	/* Apple has disabled fsync() for internal disk drives in OS X. That
 +	caused corruption for a user when he tested a power outage. Let us in
 +	OS X use a nonstandard flush method recommended by an Apple
 +	engineer. */
 +
 +	if (!srv_have_fullfsync) {
 +		/* If we are not on an operating system that supports this,
 +		then fall back to a plain fsync. */
 +
 +		ret = os_file_fsync(file);
 +	} else {
 +		ret = fcntl(file, F_FULLFSYNC, NULL);
 +
 +		if (ret) {
 +			/* If we are not on a file system that supports this,
 +			then fall back to a plain fsync. */
 +			ret = os_file_fsync(file);
 +		}
 +	}
 +#else
 +	ret = os_file_fsync(file);
 +#endif
 +
 +	if (ret == 0) {
 +		return(TRUE);
 +	}
 +
 +	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
 +	we choose to ignore that error if we are using raw disks */
 +
 +	if (srv_start_raw_disk_in_use && errno == EINVAL) {
 +
 +		return(TRUE);
 +	}
 +
 +	ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
 +
 +	os_file_handle_error(NULL, "flush");
 +
 +	/* It is a fatal error if a file flush does not succeed, because then
 +	the database can get corrupt on disk */
 +	ut_error;
 +
 +	return(FALSE);
 +#endif
 +}
 +
 +#ifndef __WIN__
 +/*******************************************************************//**
 +Does a synchronous read operation in Posix.
 +Short reads and EINTR interruptions are retried until n bytes have been
 +read; the loop stops early when a read returns 0 or fails with another
 +error, in which case the number of bytes read so far is returned.
 +When trx is not NULL and trx->take_stats is set, the transaction's
 +io_reads, io_read and io_reads_wait_timer counters are updated.
 +Without HAVE_PREAD the lseek()/read() pair is used instead and, unless
 +UNIV_HOTBACKUP is defined, it is serialised by one of the
 +os_file_seek_mutexes.
 +@return	number of bytes read (possibly fewer than n, see above); -1 if
 +the lseek() of the non-pread variant fails */
 +static MY_ATTRIBUTE((nonnull(2), warn_unused_result))
 +ssize_t
 +os_file_pread(
 +/*==========*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read */
 +	ulint		n,	/*!< in: number of bytes to read */
 +	os_offset_t	offset,	/*!< in: file offset from where to read */
 +	trx_t*		trx)	/*!< in: transaction whose I/O statistics
 +				are updated if trx->take_stats is set;
 +				may be NULL */
 +{
 +	off_t	offs;
 +	ulint		sec;
 +	ulint		ms;
 +	ib_uint64_t	start_time;
 +	ib_uint64_t	finish_time;
 +
 +	ut_ad(n);
 +
 +	/* If off_t is > 4 bytes in size, then we assume we can pass a
 +	64-bit address */
 +	offs = (off_t) offset;
 +
 +	if (sizeof(off_t) <= 4) {
 +		if (offset != (os_offset_t) offs) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"File read at offset > 4 GB");
 +		}
 +	}
 +
 +	os_n_file_reads++;
 +
 +	/* start_time != 0 doubles as the "statistics wanted" flag below */
 +	if (UNIV_UNLIKELY(trx && trx->take_stats))
 +	{
 +		trx->io_reads++;
 +		trx->io_read += n;
 +		ut_usectime(&sec, &ms);
 +		start_time = (ib_uint64_t)sec * 1000000 + ms;
 +	} else {
 +		start_time = 0;
 +	}
 +
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
 +#ifdef HAVE_PREAD
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	ssize_t	n_bytes;
 +
 +	/* Handle partial reads and signal interruptions correctly */
 +	for (n_bytes = 0; n_bytes < (ssize_t) n; ) {
 +		ssize_t n_read = pread(file, buf, (ssize_t)n - n_bytes, offs);
 +		if (n_read > 0) {
 +			n_bytes += n_read;
 +			offs += n_read;
 +			buf = (char *)buf + n_read;
 +		} else if (n_read == -1 && errno == EINTR) {
 +			continue;
 +		} else {
 +			break;
 +		}
 +	}
 +
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	if (UNIV_UNLIKELY(start_time != 0))
 +	{
 +		ut_usectime(&sec, &ms);
 +		finish_time = (ib_uint64_t)sec * 1000000 + ms;
 +		trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
 +	}
 +
 +	return(n_bytes);
 +#else
 +	{
 +		off_t	ret_offset;
 +		ssize_t	ret;
 +		ssize_t n_read;
 +#ifndef UNIV_HOTBACKUP
 +		ulint	i;
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +#ifndef UNIV_HOTBACKUP
 +		/* Protect the seek / read operation with a mutex */
 +		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +		os_mutex_enter(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		ret_offset = lseek(file, offs, SEEK_SET);
 +
 +		if (ret_offset < 0) {
 +			ret = -1;
 +		} else {
 +			/* Handle partial reads and signal interruptions
 +			correctly: after a short read continue behind the
 +			bytes already received and only ask for the rest,
 +			as the pread() variant does */
 +			for (ret = 0; ret < (ssize_t) n; ) {
 +				n_read = read(file, buf, (ssize_t)n - ret);
 +				if (n_read > 0) {
 +					ret += n_read;
 +					buf = (char *)buf + n_read;
 +				} else if (n_read == -1 && errno == EINTR) {
 +					continue;
 +				} else {
 +					break;
 +				}
 +			}
 +		}
 +
 +#ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +#endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +		if (UNIV_UNLIKELY(start_time != 0))
 +		{
 +			ut_usectime(&sec, &ms);
 +			finish_time = (ib_uint64_t)sec * 1000000 + ms;
 +			trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
 +		}
 +
 +		return(ret);
 +	}
 +#endif
 +}
 +
 +/*******************************************************************//**
 +Does a synchronous write operation in Posix.
 +Writes n bytes from buf to file starting at offset. A call interrupted by
 +a signal (EINTR) is retried, and after a partial write the loop continues
 +with the part of the buffer that has not been written yet.
 +@return	number of bytes written (less than n if a write failed
 +part-way); -1 if the seek fails in the build without pwrite() */
 +static MY_ATTRIBUTE((nonnull, warn_unused_result))
 +ssize_t
 +os_file_pwrite(
 +/*===========*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	const void*	buf,	/*!< in: buffer from where to write */
 +	ulint		n,	/*!< in: number of bytes to write */
 +	os_offset_t	offset)	/*!< in: file offset where to write */
 +{
 +	ssize_t	ret;
 +	ssize_t n_written;
 +	off_t	offs;
 +
 +	ut_ad(n);
 +	ut_ad(!srv_read_only_mode);
 +
 +	/* If off_t is > 4 bytes in size, then we assume we can pass a
 +	64-bit address */
 +	offs = (off_t) offset;
 +
 +	if (sizeof(off_t) <= 4) {
 +		if (offset != (os_offset_t) offs) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"File write at offset > 4 GB.");
 +		}
 +	}
 +
 +	os_n_file_writes++;
 +
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
 +#ifdef HAVE_PWRITE
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +	/* Handle partial writes and signal interruptions correctly */
 +	for (ret = 0; ret < (ssize_t) n; ) {
 +		n_written = pwrite(file, buf, (ssize_t)n - ret, offs);
 +		DBUG_EXECUTE_IF("xb_simulate_all_o_direct_write_failure",
 +				n_written = -1;
 +				errno = EINVAL;);
 +		if (n_written >= 0) {
 +			ret += n_written;
 +			offs += n_written;
 +			buf = (char *)buf + n_written;
 +		} else if (n_written == -1 && errno == EINTR) {
 +			continue;
 +		} else {
 +			break;
 +		}
 +	}
 +
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +	return(ret);
 +#else
 +	{
 +		off_t	ret_offset;
 +# ifndef UNIV_HOTBACKUP
 +		ulint	i;
 +# endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +# ifndef UNIV_HOTBACKUP
 +		/* Protect the seek / write operation with a mutex */
 +		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
 +
 +		os_mutex_enter(os_file_seek_mutexes[i]);
 +# endif /* UNIV_HOTBACKUP */
 +
 +		ret_offset = lseek(file, offs, SEEK_SET);
 +
 +		if (ret_offset < 0) {
 +			ret = -1;
 +
 +			goto func_exit;
 +		}
 +
 +		/* Handle partial writes and signal interruptions correctly.
 +		write() advances the file position itself, so after a short
 +		write only the buffer pointer and the remaining byte count
 +		have to be adjusted; re-sending the whole buffer would
 +		duplicate data in the file. */
 +		for (ret = 0; ret < (ssize_t) n; ) {
 +			n_written = write(file, buf, (ssize_t)n - ret);
 +			if (n_written > 0) {
 +				ret += n_written;
 +				buf = (char *)buf + n_written;
 +			} else if (n_written == -1 && errno == EINTR) {
 +				continue;
 +			} else {
 +				break;
 +			}
 +		}
 +
 +func_exit:
 +# ifndef UNIV_HOTBACKUP
 +		os_mutex_exit(os_file_seek_mutexes[i]);
 +# endif /* !UNIV_HOTBACKUP */
 +
 +		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +		return(ret);
 +	}
 +#endif /* HAVE_PWRITE */
 +}
 +#endif
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_file_read(), not directly this
 +function!
 +Requests a synchronous positioned read operation. When the read does not
 +deliver exactly n bytes, os_file_handle_error() is consulted and the read
 +is repeated if it asks for a retry; otherwise the failure is printed to
 +stderr and ut_error is invoked.
 +@return	TRUE if request was successful, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_file_read_func(
 +/*==============*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read */
 +	os_offset_t	offset,	/*!< in: file offset where to read */
 +	ulint		n,	/*!< in: number of bytes to read */
 +	trx_t*		trx)	/*!< in: passed on to os_file_pread();
 +				not referenced in the Windows branch */
 +{
 +#ifdef __WIN__
 +	BOOL		ret;
 +	DWORD		len;
 +	ibool		retry;
 +	OVERLAPPED overlapped;
 +
 +
 +	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 +	no more than 32 bits. */
 +	ut_a((n & 0xFFFFFFFFUL) == n);
 +
 +	os_n_file_reads++;
 +	os_bytes_read_since_printout += n;
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
 +
 +try_again:
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	/* The read position is handed to ReadFile() through the OVERLAPPED
 +	structure, split into its low and high 32-bit halves. */
 +	memset (&overlapped, 0, sizeof (overlapped));
 +	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
 +	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 +	overlapped.hEvent = win_get_syncio_event();
 +	ret = ReadFile(file, buf, n, NULL, &overlapped);
 +	if (ret) {
 +		/* ReadFile() reported success: fetch the byte count
 +		without waiting. */
 +		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
 +	}
 +	else if(GetLastError() == ERROR_IO_PENDING) {
 +		/* The request is still in flight: wait for it. */
 +		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
 +        }
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	/* len is only examined when ret is nonzero, i.e. after
 +	GetOverlappedResult() has filled it in. */
 +	if (ret && len == n) {
 +		return(TRUE);
 +	}
 +#else /* __WIN__ */
 +	ibool	retry;
 +	ssize_t	ret;
 +
 +	os_bytes_read_since_printout += n;
 +
 +try_again:
 +	ret = os_file_pread(file, buf, n, offset, trx);
 +
 +	DBUG_EXECUTE_IF("xb_simulate_all_o_direct_read_failure",
 +			ret = -1;
 +			errno = EINVAL;);
 +
 +	if ((ulint) ret == n) {
 +		return(TRUE);
 +	} else if (ret == -1) {
 +                ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Error in system call pread(). The operating"
 +			" system error number is %lu.",(ulint) errno);
 +        } else {
 +		/* Partial read occurred */
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Tried to read " ULINTPF " bytes at offset "
 +			UINT64PF ". Was only able to read %ld.",
 +			n, offset, (lint) ret);
 +	}
 +#endif /* __WIN__ */
 +	/* Common failure path for both platforms: retry if the error
 +	handler says so, otherwise report and stop. */
 +	retry = os_file_handle_error(NULL, "read");
 +
 +	if (retry) {
 +		goto try_again;
 +	}
 +
 +	fprintf(stderr,
 +		"InnoDB: Fatal error: cannot read from file."
 +		" OS error number %lu.\n",
 +#ifdef __WIN__
 +		(ulong) GetLastError()
 +#else
 +		(ulong) errno
 +#endif /* __WIN__ */
 +		);
 +	fflush(stderr);
 +
 +	ut_error;
 +
 +	return(FALSE);
 +}
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_file_read_no_error_handling(),
 +not directly this function!
 +Requests a synchronous positioned read operation. This function does not do
 +any error handling. In case of error it returns FALSE, unless
 +os_file_handle_error_no_exit() asks for the read to be retried.
 +@return	TRUE if request was successful, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_file_read_no_error_handling_func(
 +/*================================*/
 +	os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read */
 +	os_offset_t	offset,	/*!< in: file offset where to read */
 +	ulint		n)	/*!< in: number of bytes to read */
 +{
 +#ifdef __WIN__
 +	BOOL		ret;
 +	DWORD		len;
 +	ibool		retry;
 +	OVERLAPPED	overlapped;
 +
 +	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 +	no more than 32 bits. */
 +	ut_a((n & 0xFFFFFFFFUL) == n);
 +
 +	os_n_file_reads++;
 +	os_bytes_read_since_printout += n;
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
 +
 +try_again:
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	/* The OVERLAPPED structure is set up afresh for every attempt;
 +	it carries the read position as two 32-bit halves. */
 +	memset (&overlapped, 0, sizeof (overlapped));
 +	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
 +	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 +	overlapped.hEvent = win_get_syncio_event();
 +	ret = ReadFile(file, buf, n, NULL, &overlapped);
 +	if (ret) {
 +		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
 +	}
 +	else if(GetLastError() == ERROR_IO_PENDING) {
 +		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
 +	}
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
 +
 +	/* len is only examined when ret is nonzero. */
 +	if (ret && len == n) {
 +		return(TRUE);
 +	}
 +#else /* __WIN__ */
 +	ibool	retry;
 +	ssize_t	ret;
 +
 +	os_bytes_read_since_printout += n;
 +
 +try_again:
 +	ret = os_file_pread(file, buf, n, offset, NULL);
 +
 +	if ((ulint) ret == n) {
 +		return(TRUE);
 +	} else if (ret == -1) {
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Error in system call pread(). The operating"
 +			" system error number is %lu.",(ulint) errno);
 +	} else {
 +		/* Partial read occurred */
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Tried to read " ULINTPF " bytes at offset "
 +			UINT64PF ". Was only able to read %ld.",
 +			n, offset, (lint) ret);
 +	}
 +#endif /* __WIN__ */
 +	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
 +
 +	if (retry) {
 +		goto try_again;
 +	}
 +
 +	return(FALSE);
 +}
 +
 +/*******************************************************************//**
 +Repositions file at its beginning, copies up to size - 1 bytes of it into
 +str and appends a terminating NUL. Nothing is done when size is 0. No
 +error is reported to the caller. Intended mainly for temporary files. */
 +UNIV_INTERN
 +void
 +os_file_read_string(
 +/*================*/
 +	FILE*	file,	/*!< in: file to read from */
 +	char*	str,	/*!< in: buffer where to read */
 +	ulint	size)	/*!< in: size of buffer */
 +{
 +	if (size != 0) {
 +		rewind(file);
 +
 +		/* Leave room for the terminator. */
 +		const size_t	n_read = fread(str, 1, size - 1, file);
 +
 +		str[n_read] = '\0';
 +	}
 +}
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_file_write(), not directly
 +this function!
 +Requests a synchronous write operation. On failure a diagnostic is
 +printed to stderr, but only the first time (os_has_said_disk_full is set
 +afterwards to suppress repeats).
 +@return	TRUE if request was successful, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_file_write_func(
 +/*===============*/
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	os_file_t	file,	/*!< in: handle to a file */
 +	const void*	buf,	/*!< in: buffer from which to write */
 +	os_offset_t	offset,	/*!< in: file offset where to write */
 +	ulint		n)	/*!< in: number of bytes to write */
 +{
 +	ut_ad(!srv_read_only_mode);
 +
 +#ifdef __WIN__
 +	BOOL		ret;
 +	/* Initialized because the diagnostic below prints it even when
 +	WriteFile() fails without GetOverlappedResult() being called. */
 +	DWORD		len		= 0;
 +	ulint		n_retries	= 0;
 +	ulint		err;
 +	OVERLAPPED	overlapped;
 +	DWORD		saved_error = 0;
 +
 +	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
 +	no more than 32 bits. */
 +	ut_a((n & 0xFFFFFFFFUL) == n);
 +
 +	os_n_file_writes++;
 +
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
 +retry:
 +
 +	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +	memset (&overlapped, 0, sizeof (overlapped));
 +	overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF);
 +	overlapped.OffsetHigh = (DWORD)(offset >> 32);
 +
 +	overlapped.hEvent = win_get_syncio_event();
 +	ret = WriteFile(file, buf, n, NULL, &overlapped);
 +	if (ret) {
 +		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE);
 +	}
 +	else if ( GetLastError() == ERROR_IO_PENDING) {
 +		ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE);
 +	}
 +
 +	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
 +
 +	if (ret && len == n) {
 +
 +		return(TRUE);
 +	}
 +
 +	/* If some background file system backup tool is running, then, at
 +	least in Windows 2000, we may get here a specific error. Let us
 +	retry the operation 100 times, with 1 second waits. */
 +
 +	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
 +
 +		os_thread_sleep(1000000);
 +
 +		n_retries++;
 +
 +		goto retry;
 +	}
 +
 +	if (!os_has_said_disk_full) {
 +		char *winmsg = NULL;
 +
 +		saved_error = GetLastError();
 +		err = (ulint) saved_error;
 +
 +		ut_print_timestamp(stderr);
 +
 +		fprintf(stderr,
 +			" InnoDB: Error: Write to file %s failed"
 +			" at offset %llu.\n"
 +			"InnoDB: %lu bytes should have been written,"
 +			" only %lu were written.\n"
 +			"InnoDB: Operating system error number %lu.\n"
 +			"InnoDB: Check that your OS and file system"
 +			" support files of this size.\n"
 +			"InnoDB: Check also that the disk is not full"
 +			" or a disk quota exceeded.\n",
 +			name, offset,
 +			(ulong) n, (ulong) len, (ulong) err);
 +
 +		/* Ask Windows to prepare a standard message for a
 +		GetLastError() */
 +
 +		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
 +			FORMAT_MESSAGE_FROM_SYSTEM |
 +			FORMAT_MESSAGE_IGNORE_INSERTS,
 +			NULL, saved_error,
 +			MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
 +			(LPSTR)&winmsg, 0, NULL);
 +
 +		if (winmsg) {
 +			fprintf(stderr,
 +				"InnoDB: FormatMessage: Error number %lu means '%s'.\n",
 +				(ulong) saved_error, winmsg);
 +			LocalFree(winmsg);
 +		}
 +
 +		if (strerror((int) err) != NULL) {
 +			fprintf(stderr,
 +				"InnoDB: Error number %lu means '%s'.\n",
 +				(ulong) err, strerror((int) err));
 +		}
 +
 +		fprintf(stderr,
 +			"InnoDB: Some operating system error numbers"
 +			" are described at\n"
 +			"InnoDB: "
 +			REFMAN "operating-system-error-codes.html\n");
 +
 +		os_has_said_disk_full = TRUE;
 +	}
 +
 +	return(FALSE);
 +#else
 +	ssize_t	ret;
 +	WAIT_ALLOW_WRITES();
 +
 +	ret = os_file_pwrite(file, buf, n, offset);
 +
 +	/* Capture errno before anything else runs: the timestamp and
 +	logging calls below may overwrite it, and the diagnostic must
 +	describe the failed write, not a later library call. */
 +	const int	write_errno = errno;
 +
 +	if ((ulint) ret == n) {
 +
 +		return(TRUE);
 +	}
 +
 +	if (!os_has_said_disk_full) {
 +
 +		ut_print_timestamp(stderr);
 +
 +		if(ret == -1) {
 +			ib_logf(IB_LOG_LEVEL_ERROR,
 +				"Failure of system call pwrite(). Operating"
 +				" system error number is %lu.",
 +				(ulint) write_errno);
 +		} else {
 +			fprintf(stderr,
 +				" InnoDB: Error: Write to file %s failed"
 +				" at offset " UINT64PF ".\n"
 +				"InnoDB: %lu bytes should have been written,"
 +				" only %ld were written.\n"
 +				"InnoDB: Operating system error number %lu.\n"
 +				"InnoDB: Check that your OS and file system"
 +				" support files of this size.\n"
 +				"InnoDB: Check also that the disk is not full"
 +				" or a disk quota exceeded.\n",
 +				name, offset, n, (lint) ret,
 +				(ulint) write_errno);
 +		}
 +
 +		if (strerror(write_errno) != NULL) {
 +			fprintf(stderr,
 +				"InnoDB: Error number %d means '%s'.\n",
 +				write_errno, strerror(write_errno));
 +		}
 +
 +		fprintf(stderr,
 +			"InnoDB: Some operating system error numbers"
 +			" are described at\n"
 +			"InnoDB: "
 +			REFMAN "operating-system-error-codes.html\n");
 +
 +		os_diagnose_all_o_direct_einval(write_errno);
 +
 +		os_has_said_disk_full = TRUE;
 +	}
 +
 +	return(FALSE);
 +#endif
 +}
 +
 +/*******************************************************************//**
 +Reports whether the given path exists and, if it does, what kind of
 +file it is. A stat failure with ENOENT, ENOTDIR or ENAMETOOLONG counts as
 +"does not exist" and is not an error; *type is left untouched then.
 +@return	TRUE if call succeeded */
 +UNIV_INTERN
 +ibool
 +os_file_status(
 +/*===========*/
 +	const char*	path,	/*!< in: pathname of the file */
 +	ibool*		exists,	/*!< out: TRUE if file exists */
 +	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
 +{
 +#ifdef __WIN__
 +	struct _stat64	st;
 +
 +	if (_stat64(path, &st) != 0) {
 +
 +		if (errno == ENOENT
 +		    || errno == ENOTDIR
 +		    || errno == ENAMETOOLONG) {
 +			/* Nothing there. */
 +			*exists = FALSE;
 +
 +			return(TRUE);
 +		}
 +
 +		/* Something is there, but it could not be examined. */
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	os_file_type_t	kind = OS_FILE_TYPE_UNKNOWN;
 +
 +	if (st.st_mode & _S_IFDIR) {
 +		kind = OS_FILE_TYPE_DIR;
 +	} else if (st.st_mode & _S_IFREG) {
 +		kind = OS_FILE_TYPE_FILE;
 +	}
 +
 +	*type = kind;
 +	*exists = TRUE;
 +
 +	return(TRUE);
 +#else
 +	struct stat	st;
 +
 +	if (stat(path, &st) != 0) {
 +
 +		if (errno == ENOENT
 +		    || errno == ENOTDIR
 +		    || errno == ENAMETOOLONG) {
 +			/* Nothing there. */
 +			*exists = FALSE;
 +
 +			return(TRUE);
 +		}
 +
 +		/* Something is there, but it could not be examined. */
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(FALSE);
 +	}
 +
 +	os_file_type_t	kind = OS_FILE_TYPE_UNKNOWN;
 +
 +	if (S_ISDIR(st.st_mode)) {
 +		kind = OS_FILE_TYPE_DIR;
 +	} else if (S_ISLNK(st.st_mode)) {
 +		kind = OS_FILE_TYPE_LINK;
 +	} else if (S_ISREG(st.st_mode)) {
 +		kind = OS_FILE_TYPE_FILE;
 +	}
 +
 +	*type = kind;
 +	*exists = TRUE;
 +
 +	return(TRUE);
 +#endif
 +}
 +
 +/*******************************************************************//**
 +This function returns information about the specified file: its type,
 +its ctime/atime/mtime and its size, all taken from a stat call. When
 +check_rw_perm is set and the path is a regular file, the file is also
 +opened (read-only if srv_read_only_mode is set, read-write otherwise) and
 +stat_info->rw_perm records whether that open succeeded; rw_perm is not
 +written otherwise.
 +@return	DB_SUCCESS if all OK, DB_NOT_FOUND if stat fails with ENOENT or
 +ENOTDIR, DB_FAIL if stat fails for any other reason */
 +UNIV_INTERN
 +dberr_t
 +os_file_get_status(
 +/*===============*/
 +	const char*	path,		/*!< in:	pathname of the file */
 +	os_file_stat_t* stat_info,	/*!< out: information of a file in a
 +					directory */
 +	bool		check_rw_perm)	/*!< in: for testing whether the
 +					file can be opened in RW mode */
 +{
 +	int		ret;
 +
 +#ifdef __WIN__
 +	struct _stat64	statinfo;
 +
 +	ret = _stat64(path, &statinfo);
 +
 +	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 +		/* file does not exist */
 +
 +		return(DB_NOT_FOUND);
 +
 +	} else if (ret) {
 +		/* file exists, but stat call failed */
 +
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(DB_FAIL);
 +
 +	} else if (_S_IFDIR & statinfo.st_mode) {
 +		stat_info->type = OS_FILE_TYPE_DIR;
 +	} else if (_S_IFREG & statinfo.st_mode) {
 +
 +		DWORD	access = GENERIC_READ;
 +
 +		if (!srv_read_only_mode) {
 +			access |= GENERIC_WRITE;
 +		}
 +
 +		stat_info->type = OS_FILE_TYPE_FILE;
 +
 +		/* Check if we can open it with the access computed above
 +		(read-only when srv_read_only_mode is set, read-write
 +		otherwise). */
 +
 +		if (check_rw_perm) {
 +			HANDLE	fh;
 +
 +			fh = CreateFile(
 +				(LPCTSTR) path,		// File to open
 +				access,
 +				0,			// No sharing
 +				NULL,			// Default security
 +				OPEN_EXISTING,		// Existing file only
 +				FILE_ATTRIBUTE_NORMAL,	// Normal file
 +				NULL);			// No attr. template
 +
 +			if (fh == INVALID_HANDLE_VALUE) {
 +				stat_info->rw_perm = false;
 +			} else {
 +				stat_info->rw_perm = true;
 +				CloseHandle(fh);
 +			}
 +		}
 +	} else {
 +		stat_info->type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +#else
 +	struct stat	statinfo;
 +
 +	ret = stat(path, &statinfo);
 +
 +	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
 +		/* file does not exist */
 +
 +		return(DB_NOT_FOUND);
 +
 +	} else if (ret) {
 +		/* file exists, but stat call failed */
 +
 +		os_file_handle_error_no_exit(path, "stat", FALSE);
 +
 +		return(DB_FAIL);
 +
 +	}
 +
 +	/* Map the file-type bits of st_mode to the InnoDB file type. */
 +	switch (statinfo.st_mode & S_IFMT) {
 +	case S_IFDIR:
 +		stat_info->type = OS_FILE_TYPE_DIR;
 +		break;
 +	case S_IFLNK:
 +		stat_info->type = OS_FILE_TYPE_LINK;
 +		break;
 +	case S_IFBLK:
 +		/* Handle block device as regular file. */
 +	case S_IFCHR:
 +		/* Handle character device as regular file. */
 +	case S_IFREG:
 +		stat_info->type = OS_FILE_TYPE_FILE;
 +		break;
 +	default:
 +		stat_info->type = OS_FILE_TYPE_UNKNOWN;
 +	}
 +
 +
 +	/* Probe the access rights by actually opening the file; the
 +	descriptor is closed again immediately. */
 +	if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
 +
 +		int	fh;
 +		int	access;
 +
 +		access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
 +
- 		fh = ::open(path, access, os_innodb_umask);
++		fh = ::open(path, access | O_CLOEXEC, os_innodb_umask);
 +
 +		if (fh == -1) {
 +			stat_info->rw_perm = false;
 +		} else {
 +			stat_info->rw_perm = true;
 +			close(fh);
 +		}
 +	}
 +
 +#endif /* __WIN__ */
 +
 +	/* Common to both platforms: copy timestamps and size from the
 +	stat result. */
 +	stat_info->ctime = statinfo.st_ctime;
 +	stat_info->atime = statinfo.st_atime;
 +	stat_info->mtime = statinfo.st_mtime;
 +	stat_info->size  = statinfo.st_size;
 +
 +	return(DB_SUCCESS);
 +}
 +
 +/* Path name separator character: a backslash when __WIN__ is defined,
 +a forward slash otherwise. */
 +#ifdef __WIN__
 +#  define OS_FILE_PATH_SEPARATOR	'\\'
 +#else
 +#  define OS_FILE_PATH_SEPARATOR	'/'
 +#endif
 +
 +/****************************************************************//**
 +Builds a new path name by keeping the directory part of old_path and
 +substituting a new base name with the ".ibd" extension.  The new base
 +name is whatever follows the last '/' in tablename (or all of tablename
 +if it contains no '/').  The directory part of old_path is everything
 +before its last path separator (or all of old_path if it has none).
 +Both input strings are null terminated.
 +
 +The result is allocated with mem_alloc().  It is the caller's
 +responsibility to free it after it is no longer needed.
 +
 +@return	own: new full pathname */
 +UNIV_INTERN
 +char*
 +os_file_make_new_pathname(
 +/*======================*/
 +	const char*	old_path,	/*!< in: pathname */
 +	const char*	tablename)	/*!< in: contains new base name */
 +{
 +	/* The table name proper follows the database name and a '/'. */
 +	const char*	db_sep = strrchr(tablename, '/');
 +	const char*	new_base = (db_sep != NULL) ? db_sep + 1 : tablename;
 +
 +	/* Everything in front of the last separator of the old path
 +	is retained; the old basename.ibd behind it is dropped. */
 +	const char*	path_sep = strrchr(old_path, OS_FILE_PATH_SEPARATOR);
 +	ulint		prefix_len;
 +
 +	if (path_sep != NULL) {
 +		prefix_len = path_sep - old_path;
 +	} else {
 +		prefix_len = strlen(old_path);
 +	}
 +
 +	/* sizeof "/.ibd" covers the separator, the extension and the
 +	terminating NUL. */
 +	const ulint	total_len = prefix_len + strlen(new_base)
 +		+ sizeof "/.ibd";
 +	char*		result = static_cast<char*>(mem_alloc(total_len));
 +
 +	memcpy(result, old_path, prefix_len);
 +
 +	ut_snprintf(result + prefix_len, total_len - prefix_len,
 +		    "%c%s.ibd", OS_FILE_PATH_SEPARATOR, new_base);
 +
 +	return(result);
 +}
 +
 +/****************************************************************//**
 +Builds a remote path name from a data directory path, as given in a
 +DATA DIRECTORY clause, and a tablename of the form 'database/tablename'.
 +Whatever follows the last path separator of data_dir_path is stripped,
 +then a separator, the tablename, a '.' and the extension are appended,
 +so the database name becomes a directory below the path provided.
 +All input and output strings are null-terminated.
 +
 +The result is allocated with mem_alloc().  It is the caller's
 +responsibility to free it after it is no longer needed.
 +
 +@return	own: A full pathname; data_dir_path/databasename/tablename.ibd */
 +UNIV_INTERN
 +char*
 +os_file_make_remote_pathname(
 +/*=========================*/
 +	const char*	data_dir_path,	/*!< in: pathname */
 +	const char*	tablename,	/*!< in: tablename */
 +	const char*	extention)	/*!< in: file extention; ibd,cfg */
 +{
 +	ut_ad(extention && strlen(extention) == 3);
 +
 +	/* Keep only what precedes the last separator; the trailing
 +	basename or tablename is discarded. */
 +	const char*	sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
 +	ulint		prefix_len;
 +
 +	if (sep != NULL) {
 +		prefix_len = sep - data_dir_path;
 +	} else {
 +		prefix_len = strlen(data_dir_path);
 +	}
 +
 +	/* sizeof "/." covers the separator, the dot and the
 +	terminating NUL. */
 +	const ulint	total_len = prefix_len + strlen(tablename)
 +		+ sizeof "/." + strlen(extention);
 +	char*		result = static_cast<char*>(mem_alloc(total_len));
 +
 +	memcpy(result, data_dir_path, prefix_len);
 +
 +	ut_snprintf(result + prefix_len, total_len - prefix_len,
 +		    "%c%s.%s", OS_FILE_PATH_SEPARATOR, tablename, extention);
 +
 +	srv_normalize_path_for_win(result);
 +
 +	return(result);
 +}
 +
 +/****************************************************************//**
 +Reduces a null-terminated full remote path name, in place, to the path
 +that is sent by MySQL for a DATA DIRECTORY clause: the trailing
 +'databasename/tablename.ibd' is replaced with just 'tablename'.
 +
 +The result is never longer than the input, so no memory is allocated;
 +the caller owns the buffer.
 +
 +If the path format is not as expected, the function just returns. Note
 +that the buffer may already have been cut at the last '.' and at the
 +last separator by then.  The result is used to inform a SHOW CREATE
 +TABLE command. */
 +UNIV_INTERN
 +void
 +os_file_make_data_dir_path(
 +/*========================*/
 +	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
 +{
 +	/* Cut the string at the period in front of the extension. */
 +	char*	dot = strrchr(data_dir_path, '.');
 +
 +	if (dot == NULL) {
 +		return;
 +	}
 +
 +	*dot = '\0';
 +
 +	/* Cut again at the last separator; the table name follows it. */
 +	char*	name_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
 +
 +	if (name_sep == NULL) {
 +		return;
 +	}
 +
 +	*name_sep = '\0';
 +
 +	char*	table = name_sep + 1;
 +
 +	/* What is now the last separator precedes the database name. */
 +	char*	db_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);
 +
 +	if (db_sep == NULL) {
 +		return;
 +	}
 +
 +	/* Slide the table name over the database name. */
 +	const ulint	table_len = ut_strlen(table);
 +	char*		dest = db_sep + 1;
 +
 +	ut_memmove(dest, table, table_len);
 +
 +	dest[table_len] = '\0';
 +}
 +
 +/****************************************************************//**
 +Returns the directory component of a null-terminated pathname string:
 +the part of path up to, but not including, the last path separator
 +(OS_FILE_PATH_SEPARATOR).
 +
 +If path does not contain a separator, the string "." is returned.
 +If the only separator is the first character of path, "/" is returned.
 +
 +The return value is a copy allocated from heap. It is the caller's
 +responsibility to free it after it is no longer needed.
 +
 +Examples for '/' as separator:
 +
 +       path	      dirname
 +       "/usr/lib"     "/usr"
 +       "usr"	      "."
 +       "/"	      "/"
 +       "."	      "."
 +       ".."	      "."
 +
 +NOTE: trailing separators are not stripped before the search, so
 +presumably "/usr/" yields "/usr" here rather than the "/" that SUSv2
 +dirname would give.
 +
 +@return	own: directory component of the pathname */
 +UNIV_INTERN
 +char*
 +os_file_dirname(
 +/*============*/
 +	const char*	path)	/*!< in: pathname */
 +{
 +	const char*	sep = strrchr(path, OS_FILE_PATH_SEPARATOR);
 +	char*		dir;
 +
 +	if (sep == NULL) {
 +		/* No directory part at all. */
 +		dir = mem_strdup(".");
 +	} else if (sep == path) {
 +		/* The only separator is the leading one. */
 +		dir = mem_strdup("/");
 +	} else {
 +		/* Copy everything in front of the last separator. */
 +		dir = mem_strdupl(path, sep - path);
 +	}
 +
 +	return(dir);
 +}
 +
 +/****************************************************************//**
 +Creates every directory that is missing along the directory part of
 +the given path, recursing towards the root first. Refuses to do
 +anything in read-only mode.
 +@return	TRUE if call succeeded FALSE otherwise */
 +UNIV_INTERN
 +ibool
 +os_file_create_subdirs_if_needed(
 +/*=============================*/
 +	const char*	path)	/*!< in: path name */
 +{
 +	if (srv_read_only_mode) {
 +
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"read only mode set. Can't create subdirectories '%s'",
 +			path);
 +
 +		return(FALSE);
 +	}
 +
 +	char*	parent = os_file_dirname(path);
 +	ibool	ok;
 +
 +	if (strlen(parent) == 1
 +	    && (*parent == OS_FILE_PATH_SEPARATOR || *parent == '.')) {
 +		/* Reached the root or the current directory: done. */
 +		ok = TRUE;
 +	} else {
 +		os_file_type_t	parent_type;
 +		ibool		parent_exists;
 +
 +		ok = os_file_status(parent, &parent_exists, &parent_type);
 +
 +		if (ok && !parent_exists) {
 +			/* First make sure the ancestors exist, then
 +			create the missing directory itself. */
 +			ok = os_file_create_subdirs_if_needed(parent);
 +
 +			if (ok) {
 +				ok = os_file_create_directory(parent, FALSE);
 +			}
 +		}
 +	}
 +
 +	mem_free(parent);
 +
 +	return(ok);
 +}
 +
 +#ifndef UNIV_HOTBACKUP
 +/****************************************************************//**
 +Looks up a slot of the aio array by position. The index is asserted
 +to be below array->n_slots.
 +@return	pointer to slot */
 +static
 +os_aio_slot_t*
 +os_aio_array_get_nth_slot(
 +/*======================*/
 +	os_aio_array_t*		array,	/*!< in: aio array */
 +	ulint			index)	/*!< in: index of the slot */
 +{
 +	ut_a(index < array->n_slots);
 +
 +	os_aio_slot_t*	slot = array->slots + index;
 +
 +	return(slot);
 +}
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/******************************************************************//**
 +Creates an io_context for native linux AIO. A failure of io_setup() with
 +EAGAIN is retried up to OS_AIO_IO_SETUP_RETRY_ATTEMPTS times, sleeping
 +OS_AIO_IO_SETUP_RETRY_SLEEP between attempts; any other failure gives up
 +at once. Failures are reported on stderr.
 +@return	TRUE on success. */
 +static
 +ibool
 +os_aio_linux_create_io_ctx(
 +/*=======================*/
 +	ulint		max_events,	/*!< in: number of events. */
 +	io_context_t*	io_ctx)		/*!< out: io_ctx to initialize. */
 +{
 +	ulint	n_failed = 0;
 +	int	rc;
 +
 +	for (;;) {
 +		memset(io_ctx, 0x0, sizeof(*io_ctx));
 +
 +		/* Set up the context, telling the kernel how many
 +		pending IO requests it will have to handle. */
 +		rc = io_setup(max_events, io_ctx);
 +
 +		if (rc == 0) {
 +#if defined(UNIV_AIO_DEBUG)
 +			fprintf(stderr,
 +				"InnoDB: Linux native AIO:"
 +				" initialized io_ctx for segment\n");
 +#endif
 +			return(TRUE);
 +		}
 +
 +		if (rc != -EAGAIN) {
 +			/* Only EAGAIN is worth another attempt. */
 +			break;
 +		}
 +
 +		if (n_failed == 0) {
 +			/* Announce the retry policy once. */
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				" InnoDB: Warning: io_setup() failed"
 +				" with EAGAIN. Will make %d attempts"
 +				" before giving up.\n",
 +				OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
 +		}
 +
 +		if (n_failed >= OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
 +			/* Out of attempts. */
 +			break;
 +		}
 +
 +		++n_failed;
 +		fprintf(stderr,
 +			"InnoDB: Warning: io_setup() attempt"
 +			" %lu failed.\n",
 +			n_failed);
 +		os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
 +	}
 +
 +	/* Every failure report starts with a timestamp. */
 +	ut_print_timestamp(stderr);
 +
 +	if (rc == -EAGAIN) {
 +		fprintf(stderr,
 +			" InnoDB: Error: io_setup() failed"
 +			" with EAGAIN after %d attempts.\n",
 +			OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
 +	} else if (rc == -ENOSYS) {
 +		fprintf(stderr,
 +			" InnoDB: Error: Linux Native AIO interface"
 +			" is not supported on this platform. Please"
 +			" check your OS documentation and install"
 +			" appropriate binary of InnoDB.\n");
 +	} else {
 +		fprintf(stderr,
 +			" InnoDB: Error: Linux Native AIO setup"
 +			" returned following error[%d]\n", -rc);
 +	}
 +
 +	fprintf(stderr,
 +		"InnoDB: You can disable Linux Native AIO by"
 +		" setting innodb_use_native_aio = 0 in my.cnf\n");
 +
 +	return(FALSE);
 +}
 +
 +/******************************************************************//**
 +Checks if the system supports native linux aio. On some kernel
 +versions where native aio is supported it won't work on tmpfs. In such
 +cases we can't use native aio as it is not possible to mix simulated
 +and native aio.
 +The check submits one request through a throw-away io context: a
 +one-page write to a temporary file, or, in read-only mode, a 512-byte
 +read from ib_logfile0 in srv_log_group_home_dir. The io context is
 +released again before returning.
 +@return: TRUE if supported, FALSE otherwise. */
 +static
 +ibool
 +os_aio_native_aio_supported(void)
 +/*=============================*/
 +{
 +	int			fd;
 +	io_context_t		io_ctx;
 +	char			name[1000];
 +
 +	if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
 +		/* The platform does not support native aio. */
 +		return(FALSE);
 +	} else if (!srv_read_only_mode) {
 +		/* Now check if tmpdir supports native aio ops. */
 +		fd = innobase_mysql_tmpfile(NULL);
 +
 +		if (fd < 0) {
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Unable to create temp file to check "
 +				"native AIO support.");
 +
 +			/* Do not leak the probe context. */
 +			io_destroy(io_ctx);
 +
 +			return(FALSE);
 +		}
 +	} else {
 +
 +		srv_normalize_path_for_win(srv_log_group_home_dir);
 +
 +		ulint	dirnamelen = strlen(srv_log_group_home_dir);
 +		ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
 +		memcpy(name, srv_log_group_home_dir, dirnamelen);
 +
 +		/* Add a path separator if needed. */
 +		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
 +			name[dirnamelen++] = SRV_PATH_SEPARATOR;
 +		}
 +
 +		strcpy(name + dirnamelen, "ib_logfile0");
 +
- 		fd = ::open(name, O_RDONLY);
++		fd = ::open(name, O_RDONLY | O_CLOEXEC);
 +
 +		if (fd == -1) {
 +
 +			ib_logf(IB_LOG_LEVEL_WARN,
 +				"Unable to open \"%s\" to check "
 +				"native AIO read support.", name);
 +
 +			/* Do not leak the probe context. */
 +			io_destroy(io_ctx);
 +
 +			return(FALSE);
 +		}
 +	}
 +
 +	struct io_event	io_event;
 +
 +	memset(&io_event, 0x0, sizeof(io_event));
 +
 +	byte*	buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
 +	byte*	ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
 +
 +	struct iocb	iocb;
 +
 +	/* Suppress valgrind warning. */
 +	memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
 +	memset(&iocb, 0x0, sizeof(iocb));
 +
 +	struct iocb*	p_iocb = &iocb;
 +
 +	if (!srv_read_only_mode) {
 +		io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
 +	} else {
 +		ut_a(UNIV_PAGE_SIZE >= 512);
 +		io_prep_pread(p_iocb, fd, ptr, 512, 0);
 +	}
 +
 +	int	err = io_submit(io_ctx, 1, &p_iocb);
 +
 +	if (err >= 1) {
 +		/* Now collect the submitted IO request. */
 +		err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
 +	}
 +
 +	/* The probe context was created only for this check. Release it
 +	before the buffer so that no request can still refer to the
 +	buffer once it is freed. */
 +	io_destroy(io_ctx);
 +
 +	ut_free(buf);
 +	close(fd);
 +
 +	switch (err) {
 +	case 1:
 +		return(TRUE);
 +
 +	case -EINVAL:
 +	case -ENOSYS:
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Linux Native AIO not supported. You can either "
 +			"move %s to a file system that supports native "
 +			"AIO or you can set innodb_use_native_aio to "
 +			"FALSE to avoid this message.",
 +			srv_read_only_mode ? name : "tmpdir");
 +
 +		/* fall through. */
 +	default:
 +		ib_logf(IB_LOG_LEVEL_ERROR,
 +			"Linux Native AIO check on %s returned error[%d]",
 +			srv_read_only_mode ? name : "tmpdir", -err);
 +	}
 +
 +	return(FALSE);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +/******************************************************************//**
 +Creates an aio wait array with n zero-initialized slots. With Linux native
 +AIO enabled, also creates one io_context per segment; if that fails, native
 +AIO is disabled (srv_use_native_aio = FALSE) and creation continues.
 +@return	own: aio array (this function has no path that returns NULL) */
 +static
 +os_aio_array_t*
 +os_aio_array_create(
 +/*================*/
 +	ulint	n,		/*!< in: maximum number of pending aio
 +				operations allowed; n must be
 +				divisible by n_segments */
 +	ulint	n_segments)	/*!< in: number of segments in the aio array */
 +{
 +	os_aio_array_t*	array;
 +#ifdef LINUX_NATIVE_AIO
 +	struct io_event*	io_event = NULL;
 +#endif
 +	ut_a(n > 0);
 +	ut_a(n_segments > 0);
 +
 +	array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
 +	memset(array, 0x0, sizeof(*array));
 +
 +	array->mutex = os_mutex_create();
 +	array->not_full = os_event_create();
 +	array->is_empty = os_event_create();
 +
 +	os_event_set(array->is_empty);
 +
 +	array->n_slots = n;
 +	array->n_segments = n_segments;
 +
 +	array->slots = static_cast<os_aio_slot_t*>(
 +		ut_malloc(n * sizeof(*array->slots)));
 +	/* Clear all n slots; sizeof(n * ...) would only cover sizeof(size_t). */
 +	memset(array->slots, 0x0, n * sizeof(*array->slots));
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	array->aio_ctx = NULL;
 +	array->aio_events = NULL;
 +
 +	/* If we are not using native aio interface then skip this
 +	part of initialization. */
 +	if (!srv_use_native_aio) {
 +		goto skip_native_aio;
 +	}
 +
 +	/* Initialize the io_context array. One io_context
 +	per segment in the array. */
 +
 +	array->aio_ctx = static_cast<io_context**>(
 +		ut_malloc(n_segments * sizeof(*array->aio_ctx)));
 +
 +	for (ulint i = 0; i < n_segments; ++i) {
 +		if (!os_aio_linux_create_io_ctx(n/n_segments,
 +						&array->aio_ctx[i])) {
 +			/* If something bad happened during aio setup
 +			we disable linux native aio.
 +                        The disadvantage will be a small memory leak
 +                        at shutdown but that's ok compared to a crash
 +                        or a not working server.
 +                        This frequently happens when running the test suite
 +                        with many threads on a system with low fs.aio-max-nr!
 +                        */
 +
 +                        fprintf(stderr,
 +                                "  InnoDB: Warning: Linux Native AIO disabled "
 +                                "because os_aio_linux_create_io_ctx() "
 +                                "failed. To get rid of this warning you can "
 +                                "try increasing system "
 +                                "fs.aio-max-nr to 1048576 or larger or "
 +                                "setting innodb_use_native_aio = 0 in my.cnf\n");
 +                        srv_use_native_aio = FALSE;
 +			goto skip_native_aio;
 +		}
 +	}
 +
 +	/* Initialize the event array. One event per slot. */
 +	io_event = static_cast<struct io_event*>(
 +		ut_malloc(n * sizeof(*io_event)));
 +
 +	memset(io_event, 0x0, sizeof(*io_event) * n);
 +	array->aio_events = io_event;
 +
 +	array->pending = static_cast<struct iocb**>(
 +		ut_malloc(n * sizeof(struct iocb*)));
 +	memset(array->pending, 0x0, sizeof(struct iocb*) * n);
 +	array->count = static_cast<ulint*>(
 +		ut_malloc(n_segments * sizeof(ulint)));
 +	memset(array->count, 0x0, sizeof(ulint) * n_segments);
 +
 +skip_native_aio:
 +#endif /* LINUX_NATIVE_AIO */
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i);
 +		slot->pos = i;
 +		slot->reserved = FALSE;
 +#ifdef LINUX_NATIVE_AIO
 +		memset(&slot->control, 0x0, sizeof(slot->control));
 +		slot->n_bytes = 0;
 +		slot->ret = 0;
 +#endif /* LINUX_NATIVE_AIO */
 +	}
 +
 +	return(array);
 +}
 +
 +/************************************************************************//**
 +Frees an aio wait array and resets the caller's pointer to 0. */
 +static
 +void
 +os_aio_array_free(
 +/*==============*/
 +	os_aio_array_t*& array)	/*!< in/out, own: array to free; set to 0 */
 +{
 +	os_mutex_free(array->mutex);
 +	os_event_free(array->not_full);
 +	os_event_free(array->is_empty);
 +	/* The native AIO members are freed only if srv_use_native_aio is still set. */
 +#if defined(LINUX_NATIVE_AIO)
 +	if (srv_use_native_aio) {
 +		ut_free(array->aio_events);
 +		ut_free(array->aio_ctx);
 +		/* Under UNIV_DEBUG, assert that no buffered request is left. */
 +#ifdef UNIV_DEBUG
 +		for (size_t idx = 0; idx < array->n_slots; ++idx)
 +			ut_ad(array->pending[idx] == NULL);
 +		for (size_t idx = 0; idx < array->n_segments; ++idx)
 +			ut_ad(array->count[idx] == 0);
 +#endif
 +
 +		ut_free(array->pending);
 +		ut_free(array->count);
 +	}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +	ut_free(array->slots);
 +	ut_free(array);
 +
 +	array = 0;
 +}
 +
 +/***********************************************************************
 +Initializes the asynchronous io system. Creates the read array and the sync
 +array, and unless srv_read_only_mode is set also the log, ibuf and write
 +arrays. The read and write arrays are divided into n_read_segs and
 +n_write_segs segments; the caller must create an i/o handler thread for each
 +segment, but none for the sync array. When native aio is not used, one
 +wait event per segment is created as well. @return TRUE, or FALSE on failure */
 +UNIV_INTERN
 +ibool
 +os_aio_init(
 +/*========*/
 +	ulint	n_per_seg,	/*!< in: maximum number of pending aio
 +				operations allowed per segment */
 +	ulint	n_read_segs,	/*!< in: number of reader threads */
 +	ulint	n_write_segs,	/*!< in: number of writer threads */
 +	ulint	n_slots_sync)	/*!< in: number of slots in the sync aio
 +				array */
 +{
 +	os_io_init_simple();
 +
 +#if defined(LINUX_NATIVE_AIO)
 +	/* Check if native aio is supported on this system and tmpfs */
 +	if (srv_use_native_aio && !os_aio_native_aio_supported()) {
 +
 +		ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
 +
 +		srv_use_native_aio = FALSE;
 +	}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +	srv_reset_io_thread_op_info();
 +
 +	os_aio_read_array = os_aio_array_create(
 +		n_read_segs * n_per_seg, n_read_segs);
 +
 +	if (os_aio_read_array == NULL) {
 +		return(FALSE);
 +	}
 +
 +	ulint	start = (srv_read_only_mode) ? 0 : 2;
 +	ulint	n_segs = n_read_segs + start;
 +
 +	/* Unless in read-only mode, 0 is the ibuf segment and 1 is the log segment. */
 +	for (ulint i = start; i < n_segs; ++i) {
 +		ut_a(i < SRV_MAX_N_IO_THREADS);
 +		srv_io_thread_function[i] = "read thread";
 +	}
 +
 +	ulint	n_segments = n_read_segs;
 +
 +	if (!srv_read_only_mode) {
 +
 +		os_aio_log_array = os_aio_array_create(n_per_seg, 1);
 +
 +		if (os_aio_log_array == NULL) {
 +			return(FALSE);
 +		}
 +
 +		++n_segments;
 +
 +		srv_io_thread_function[1] = "log thread";
 +
 +		os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
 +
 +		if (os_aio_ibuf_array == NULL) {
 +			return(FALSE);
 +		}
 +
 +		++n_segments;
 +
 +		srv_io_thread_function[0] = "insert buffer thread";
 +
 +		os_aio_write_array = os_aio_array_create(
 +			n_write_segs * n_per_seg, n_write_segs);
 +
 +		if (os_aio_write_array == NULL) {
 +			return(FALSE);
 +		}
 +
 +		n_segments += n_write_segs;
 +
 +		for (ulint i = start + n_read_segs; i < n_segments; ++i) {
 +			ut_a(i < SRV_MAX_N_IO_THREADS);
 +			srv_io_thread_function[i] = "write thread";
 +		}
 +
 +		ut_ad(n_segments >= 4);
 +	} else {
 +		ut_ad(n_segments > 0);
 +	}
 +
 +	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
 +
 +	if (os_aio_sync_array == NULL) {
 +		return(FALSE);
 +	}
 +
 +	os_aio_n_segments = n_segments;
 +
 +	os_aio_validate();
 +
 +	os_last_printout = ut_time();
 +
 +#ifdef _WIN32
 +	ut_a(completion_port == 0 && read_completion_port == 0);
 +	completion_port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
 +	read_completion_port = srv_read_only_mode? completion_port : CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
 +	ut_a(completion_port && read_completion_port);
 +#endif
 +
 +	if (srv_use_native_aio) {
 +		return(TRUE);
 +	}
 +	/* Simulated aio: one wait event per segment for the handler threads. */
 +	os_aio_segment_wait_events = static_cast<os_event_t*>(
 +		ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
 +
 +	for (ulint i = 0; i < n_segments; ++i) {
 +		os_aio_segment_wait_events[i] = os_event_create();
 +	}
 +
 +	return(TRUE);
 +}
 +
 +/***********************************************************************
 +Frees the asynchronous io system: aio arrays, segment wait events, seek mutexes. */
 +UNIV_INTERN
 +void
 +os_aio_free(void)
 +/*=============*/
 +{
 +	if (os_aio_ibuf_array != 0) {
 +		os_aio_array_free(os_aio_ibuf_array);
 +	}
 +
 +	if (os_aio_log_array != 0) {
 +		os_aio_array_free(os_aio_log_array);
 +	}
 +
 +	if (os_aio_write_array != 0) {
 +		os_aio_array_free(os_aio_write_array);
 +	}
 +
 +	if (os_aio_sync_array != 0) {
 +		os_aio_array_free(os_aio_sync_array);
 +	}
 +	/* NOTE(review): unlike the arrays above, no NULL check for the read array. */
 +	os_aio_array_free(os_aio_read_array);
 +
 +	if (!srv_use_native_aio) {
 +		for (ulint i = 0; i < os_aio_n_segments; i++) {
 +			os_event_free(os_aio_segment_wait_events[i]);
 +		}
 +	}
 +
 +	for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
 +		os_mutex_free(os_file_seek_mutexes[i]);
 +	}
 +
 +	ut_free(os_aio_segment_wait_events);
 +	os_aio_segment_wait_events = 0;
 +	os_aio_n_segments = 0;
 +#ifdef _WIN32
 +	completion_port = 0;
 +	read_completion_port = 0;
 +#endif
 +}
 +
 +#ifdef WIN_ASYNC_IO
 +/************************************************************************//**
 +Wakes up async i/o threads in Windows async i/o at shutdown: if completion_port
 +is set, posts IOCP_SHUTDOWN_KEY to completion_port and read_completion_port. */
 +static
 +void
 +os_aio_array_wake_win_aio_at_shutdown(
 +/*==================================*/
 +	os_aio_array_t*	array)	/*!< in: aio array (not used by the body) */
 +{
 +	if(completion_port)
 +	{
 +		PostQueuedCompletionStatus(completion_port, 0, IOCP_SHUTDOWN_KEY, NULL);
 +		PostQueuedCompletionStatus(read_completion_port, 0, IOCP_SHUTDOWN_KEY, NULL);
 +	}
 +}
 +#endif
 +
 +/************************************************************************//**
 +Wakes up all async i/o threads so that they know to exit themselves in
 +shutdown. With simulated aio this sets the wait event of every segment. */
 +UNIV_INTERN
 +void
 +os_aio_wake_all_threads_at_shutdown(void)
 +/*=====================================*/
 +{
 +#ifdef WIN_ASYNC_IO
 +	/* This code wakes up all aio threads in Windows native aio */
 +	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
 +	if (os_aio_write_array != 0) {
 +		os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
 +	}
 +
 +	if (os_aio_ibuf_array != 0) {
 +		os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
 +	}
 +
 +	if (os_aio_log_array != 0) {
 +		os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
 +	}
 +#elif defined(LINUX_NATIVE_AIO)
 +	/* When using native AIO interface the io helper threads
 +	wait on io_getevents with a timeout value of 500ms. At
 +	each wake up these threads check the server status.
 +	No need to do anything to wake them up. */
 +#endif /* WIN_ASYNC_IO / LINUX_NATIVE_AIO */
 +
 +	if (srv_use_native_aio) {
 +		return;
 +	}
 +
 +	/* This loop wakes up all simulated aio threads */
 +
 +	for (ulint i = 0; i < os_aio_n_segments; i++) {
 +
 +		os_event_set(os_aio_segment_wait_events[i]);
 +	}
 +}
 +
 +/************************************************************************//**
 +Waits on the is_empty event of os_aio_write_array, i.e. until it has no pending
 +writes. There can be other, synchronous, pending writes. Not for read-only mode. */
 +UNIV_INTERN
 +void
 +os_aio_wait_until_no_pending_writes(void)
 +/*=====================================*/
 +{
 +	ut_ad(!srv_read_only_mode);
 +	os_event_wait(os_aio_write_array->is_empty);
 +}
 +
 +/**********************************************************************//**
 +Calculates the global segment number for a slot of the given array: ibuf and
 +log arrays have fixed numbers; read segments come next, then write segments.
 +@return segment number (the number used by, e.g., i/o-handler threads) */
 +static
 +ulint
 +os_aio_get_segment_no_from_slot(
 +/*============================*/
 +	os_aio_array_t*	array,	/*!< in: aio wait array */
 +	os_aio_slot_t*	slot)	/*!< in: slot in this array */
 +{
 +	ulint	segment;
 +	ulint	seg_len;	/* number of slots per segment */
 +
 +	if (array == os_aio_ibuf_array) {
 +		ut_ad(!srv_read_only_mode);
 +
 +		segment = IO_IBUF_SEGMENT;
 +
 +	} else if (array == os_aio_log_array) {
 +		ut_ad(!srv_read_only_mode);
 +
 +		segment = IO_LOG_SEGMENT;
 +
 +	} else if (array == os_aio_read_array) {
 +		seg_len = os_aio_read_array->n_slots
 +			/ os_aio_read_array->n_segments;
 +		/* Read segments start at 0 in read-only mode, otherwise at 2. */
 +		segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
 +	} else {
 +		ut_ad(!srv_read_only_mode);
 +		ut_a(array == os_aio_write_array);
 +
 +		seg_len = os_aio_write_array->n_slots
 +			/ os_aio_write_array->n_segments;
 +		/* Write segments follow the two fixed and all read segments. */
 +		segment = os_aio_read_array->n_segments + 2
 +			+ slot->pos / seg_len;
 +	}
 +
 +	return(segment);
 +}
 +
 +/**********************************************************************//**
 +Maps a global segment number (< os_aio_n_segments) to its aio array and the segment within it.
 +@return	local segment number within the aio array */
 +static
 +ulint
 +os_aio_get_array_and_local_segment(
 +/*===============================*/
 +	os_aio_array_t** array,		/*!< out: aio wait array */
 +	ulint		 global_segment)/*!< in: global segment number */
 +{
 +	ulint		segment;
 +
 +	ut_a(global_segment < os_aio_n_segments);
 +
 +	if (srv_read_only_mode) {
 +		*array = os_aio_read_array;
 +		/* In read-only mode global and local numbers coincide. */
 +		return(global_segment);
 +	} else if (global_segment == IO_IBUF_SEGMENT) {
 +		*array = os_aio_ibuf_array;
 +		segment = 0;
 +
 +	} else if (global_segment == IO_LOG_SEGMENT) {
 +		*array = os_aio_log_array;
 +		segment = 0;
 +
 +	} else if (global_segment < os_aio_read_array->n_segments + 2) {
 +		*array = os_aio_read_array;
 +
 +		segment = global_segment - 2;
 +	} else {
 +		*array = os_aio_write_array;
 +
 +		segment = global_segment - (os_aio_read_array->n_segments + 2);
 +	}
 +
 +	return(segment);
 +}
 +
 +/*******************************************************************//**
 +Reserves a free slot in the aio array and fills it with the request data. If
 +every slot is reserved, waits for the not_full event and then tries again.
 +@return	pointer to slot */
 +static
 +os_aio_slot_t*
 +os_aio_array_reserve_slot(
 +/*======================*/
 +	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
 +	os_aio_array_t*	array,	/*!< in: aio array */
 +	fil_node_t*	message1,/*!< in: message to be passed along with
 +				the aio operation */
 +	void*		message2,/*!< in: message to be passed along with
 +				the aio operation */
 +	pfs_os_file_t	file,	/*!< in: file handle */
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	void*		buf,	/*!< in: buffer where to read or from which
 +				to write */
 +	os_offset_t	offset,	/*!< in: file offset */
 +	ulint		len,	/*!< in: length of the block to read or write */
 +	ulint		space_id)	/*!< in: value stored in slot->space_id */
 +{
 +	os_aio_slot_t*	slot = NULL;
 +#ifdef WIN_ASYNC_IO
 +	OVERLAPPED*	control;
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +
 +	struct iocb*	iocb;
 +	off_t		aio_offset;
 +
 +#endif /* WIN_ASYNC_IO */
 +	ulint		i;
 +	ulint		counter;
 +	ulint		slots_per_seg;
 +	ulint		local_seg;
 +
 +#ifdef WIN_ASYNC_IO
 +	ut_a((len & 0xFFFFFFFFUL) == len);
 +#endif /* WIN_ASYNC_IO */
 +
 +	/* No need of a mutex. Only reading constant fields */
 +	slots_per_seg = array->n_slots / array->n_segments;
 +
 +	/* We attempt to keep adjacent blocks in the same local
 +	segment. This can help in merging IO requests when we are
 +	doing simulated AIO */
 +	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
 +		% array->n_segments;
 +
 +loop:
 +	os_mutex_enter(array->mutex);
 +
 +	if (array->n_reserved == array->n_slots) {
 +		os_mutex_exit(array->mutex);
 +
 +		if (!srv_use_native_aio) {
 +			/* If the handler threads are suspended, wake them
 +			so that we get more slots */
 +
 +			os_aio_simulated_wake_handler_threads();
 +		}
 +
 +		os_event_wait(array->not_full);
 +
 +		goto loop;
 +	}
 +
 +	/* We start our search for an available slot from our preferred
 +	local segment and do a full scan of the array. We are
 +	guaranteed to find a slot in full scan. */
 +	for (i = local_seg * slots_per_seg, counter = 0;
 +	     counter < array->n_slots;
 +	     i++, counter++) {
 +
 +		i %= array->n_slots;
 +
 +		slot = os_aio_array_get_nth_slot(array, i);
 +
 +		if (slot->reserved == FALSE) {
 +			goto found;
 +		}
 +	}
 +
 +	/* We MUST always be able to find an unreserved slot here. */
 +	ut_error;
 +
 +found:
 +	ut_a(slot->reserved == FALSE);
 +	array->n_reserved++;
 +
 +	if (array->n_reserved == 1) {
 +		os_event_reset(array->is_empty);
 +	}
 +
 +	if (array->n_reserved == array->n_slots) {
 +		os_event_reset(array->not_full);
 +	}
 +
 +	slot->reserved = TRUE;
 +	slot->reservation_time = ut_time();
 +	slot->message1 = message1;
 +	slot->message2 = message2;
 +	slot->file     = file;
 +	slot->name     = name;
 +	slot->len      = len;
 +	slot->type     = type;
 +	slot->buf      = static_cast<byte*>(buf);
 +	slot->offset   = offset;
 +	slot->io_already_done = FALSE;
 +	slot->space_id = space_id;
 +
 +#ifdef WIN_ASYNC_IO
 +	control = &slot->control;
 +	control->Offset = (DWORD) offset & 0xFFFFFFFF;
 +	control->OffsetHigh = (DWORD) (offset >> 32);
 +	control->hEvent = 0;
 +	slot->arr = array;
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +
 +	/* If we are not using native AIO skip this part. */
 +	if (!srv_use_native_aio) {
 +		goto skip_native_aio;
 +	}
 +
 +	/* Check if we are dealing with 64 bit arch.
 +	If not then make sure that offset fits in 32 bits. */
 +	aio_offset = (off_t) offset;
 +
 +	ut_a(sizeof(aio_offset) >= sizeof(offset)
 +	     || ((os_offset_t) aio_offset) == offset);
 +
 +	iocb = &slot->control;
 +
 +	if (type == OS_FILE_READ) {
 +		io_prep_pread(iocb, file.m_file, buf, len, aio_offset);
 +	} else {
 +		ut_a(type == OS_FILE_WRITE);
 +		io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset);
 +	}
 +	/* Lets the completion side map the iocb back to its slot. */
 +	iocb->data = (void*) slot;
 +	slot->n_bytes = 0;
 +	slot->ret = 0;
 +
 +skip_native_aio:
 +#endif /* LINUX_NATIVE_AIO */
 +	os_mutex_exit(array->mutex);
 +
 +	return(slot);
 +}
 +
 +/*******************************************************************//**
 +Frees a slot in the aio array and signals not_full / is_empty when they apply. */
 +static
 +void
 +os_aio_array_free_slot(
 +/*===================*/
 +	os_aio_array_t*	array,	/*!< in: aio array */
 +	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
 +{
 +	os_mutex_enter(array->mutex);
 +
 +	ut_ad(slot->reserved);
 +
 +	slot->reserved = FALSE;
 +
 +	array->n_reserved--;
 +	/* The array was full until now: let waiting reservers continue. */
 +	if (array->n_reserved == array->n_slots - 1) {
 +		os_event_set(array->not_full);
 +	}
 +
 +	if (array->n_reserved == 0) {
 +		os_event_set(array->is_empty);
 +	}
 +
 +#ifdef LINUX_NATIVE_AIO
 +
 +	if (srv_use_native_aio) {
 +		memset(&slot->control, 0x0, sizeof(slot->control));
 +		slot->n_bytes = 0;
 +		slot->ret = 0;
 +		/*fprintf(stderr, "Freed up Linux native slot.\n");*/
 +	} else {
 +		/* These fields should not be used if we are not
 +		using native AIO. */
 +		ut_ad(slot->n_bytes == 0);
 +		ut_ad(slot->ret == 0);
 +	}
 +
 +#endif
 +	os_mutex_exit(array->mutex);
 +}
 +
 +/**********************************************************************//**
 +Wakes up a simulated aio i/o-handler thread if any slot of its segment is reserved. */
 +static
 +void
 +os_aio_simulated_wake_handler_thread(
 +/*=================================*/
 +	ulint	global_segment)	/*!< in: the number of the segment in the aio
 +				arrays */
 +{
 +	os_aio_array_t*	array;
 +	ulint		segment;
 +
 +	ut_ad(!srv_use_native_aio);
 +
 +	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 +
 +	ulint	n = array->n_slots / array->n_segments;
 +	/* From here on, segment is the index of the segment's first slot. */
 +	segment *= n;
 +
 +	/* Look through n slots after the segment * n'th slot */
 +
 +	os_mutex_enter(array->mutex);
 +
 +	for (ulint i = 0; i < n; ++i) {
 +		const os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, segment + i);
 +
 +		if (slot->reserved) {
 +
 +			/* Found an i/o request */
 +
 +			os_mutex_exit(array->mutex);
 +
 +			os_event_t	event;
 +
 +			event = os_aio_segment_wait_events[global_segment];
 +
 +			os_event_set(event);
 +
 +			return;
 +		}
 +	}
 +
 +	os_mutex_exit(array->mutex);
 +}
 +
 +/**********************************************************************//**
 +Wakes up the simulated aio i/o-handler threads of all segments that have work. */
 +UNIV_INTERN
 +void
 +os_aio_simulated_wake_handler_threads(void)
 +/*=======================================*/
 +{
 +	if (srv_use_native_aio) {
 +		/* We do not use simulated aio: do nothing */
 +
 +		return;
 +	}
 +	/* Clear the sleep recommendation before waking the handlers. */
 +	os_aio_recommend_sleep_for_read_threads	= FALSE;
 +
 +	for (ulint i = 0; i < os_aio_n_segments; i++) {
 +		os_aio_simulated_wake_handler_thread(i);
 +	}
 +}
 +
 +#ifdef _WIN32
 +/**********************************************************************//**
 +This function can be called if one wants to post a batch of reads and
 +prefers an i/o-handler thread to handle them all at once later. It resets
 +the wait events of the read segments (simulated aio only). You must call
 +os_aio_simulated_wake_handler_threads later, or the threads stay asleep! */
 +UNIV_INTERN
 +void
 +os_aio_simulated_put_read_threads_to_sleep()
 +{
 +
 +/* The idea of putting background IO threads to sleep is only for
 +Windows when using simulated AIO. Windows XP seems to schedule
 +background threads too eagerly to allow for coalescing during
 +readahead requests. */
 +
 +	os_aio_array_t*	array;
 +
 +	if (srv_use_native_aio) {
 +		/* We do not use simulated aio: do nothing */
 +
 +		return;
 +	}
 +
 +	os_aio_recommend_sleep_for_read_threads	= TRUE;
 +
 +	for (ulint i = 0; i < os_aio_n_segments; i++) {
 +		os_aio_get_array_and_local_segment(&array, i);
 +
 +		if (array == os_aio_read_array) {
 +
 +			os_event_reset(os_aio_segment_wait_events[i]);
 +		}
 +	}
 +}
 +#endif /* _WIN32 */
 +
 +/** Submit the AIO read requests buffered on every segment of
 +os_aio_read_array to the kernel (low level function; Linux native AIO only).
 +@param acquire_mutex whether this function should take the array mutex itself
 +*/
 +static
 +void
 +os_aio_dispatch_read_array_submit_low(bool acquire_mutex MY_ATTRIBUTE((unused)))
 +{
 +	if (!srv_use_native_aio) {
 +		return;
 +	}
 +#if defined(LINUX_NATIVE_AIO)
 +	os_aio_array_t*	array = os_aio_read_array;
 +	ulint		total_submitted = 0;
 +	if (acquire_mutex)
 +		os_mutex_enter(array->mutex);
 +	/* Submit aio requests buffered on all segments. */
 +	for (ulint i = 0; i < array->n_segments; i++) {
 +		const int	count = array->count[i];
 +		int	offset = 0;
 +		while (offset != count) {
 +			struct iocb** const	iocb_array = array->pending
 +				+ i * array->n_slots / array->n_segments
 +				+ offset;
 +			const int	partial_count = count - offset;
 +			/* io_submit() returns number of successfully queued
 +			requests or (-errno).
 +			It returns 0 only if the number of iocb blocks passed
 +			is also 0. */
 +			const int	submitted = io_submit(array->aio_ctx[i],
 +						partial_count, iocb_array);
 +
 +			/* This assertion prevents infinite loop in both
 +			debug and release modes. */
 +			ut_a(submitted != 0);
 +
 +			if (submitted < 0) {
 +				/* Terminating with fatal error */
 +				const char*	errmsg =
 +					strerror(-submitted);
 +				ib_logf(IB_LOG_LEVEL_FATAL,
 +					"Trying to sumbit %d aio requests, "
 +					"io_submit() set errno to %d: %s",
 +					partial_count, -submitted,
 +					errmsg ? errmsg : "<unknown>");
 +			}
 +			ut_ad(submitted <= partial_count);
 +			if (submitted < partial_count)
 +			{
 +				ib_logf(IB_LOG_LEVEL_WARN,
 +					"Trying to sumbit %d aio requests, "
 +					"io_submit() submitted only %d",
 +					partial_count, submitted);
 +			}
 +			offset += submitted;	/* retry with the remainder */
 +		}
 +		total_submitted += count;
 +	}
 +	/* Reset the aio request buffer. */
 +	memset(array->pending, 0x0, sizeof(struct iocb*) * array->n_slots);
 +	memset(array->count, 0x0, sizeof(ulint) * array->n_segments);
 +
 +	if (acquire_mutex)
 +		os_mutex_exit(array->mutex);
 +
 +	srv_stats.n_aio_submitted.add(total_submitted);
 +#endif
 +}
 +
 +/** Submit the buffered AIO read requests of all segments to the kernel. */
 +UNIV_INTERN
 +void
 +os_aio_dispatch_read_array_submit()
 +{
 +	os_aio_dispatch_read_array_submit_low(true);
 +}
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/*******************************************************************//**
 +Dispatch an AIO request to the kernel, or buffer it when should_buffer is set.
 +@return	TRUE on success (always for buffered requests); else FALSE, errno = -ret. */
 +static
 +ibool
 +os_aio_linux_dispatch(
 +/*==================*/
 +	os_aio_array_t*	array,	/*!< in: io request array. */
 +	os_aio_slot_t*	slot,	/*!< in: an already reserved slot. */
 +	bool		should_buffer)	/*!< in: should buffer the request
 +					rather than submit. */
 +{
 +	int		ret;
 +	struct iocb*	iocb;
 +
 +	ut_ad(slot != NULL);
 +	ut_ad(array);
 +
 +	ut_a(slot->reserved);
 +
 +	/* Find out what we are going to work with.
 +	The iocb struct is directly in the slot.
 +	The io_context is one per segment. */
 +
 +	ulint	slots_per_segment = array->n_slots / array->n_segments;
 +	iocb = &slot->control;
 +	ulint	io_ctx_index = slot->pos / slots_per_segment;
 +	if (should_buffer) {
 +		ut_ad(array == os_aio_read_array);
 +
 +		os_mutex_enter(array->mutex);
 +		/* There are array->n_slots elements in array->pending,
 +		which is divided into array->n_segments area of equal size.
 +		The iocb of each segment are buffered in its corresponding area
 +		in the pending array consecutively as they come.
 +		array->count[i] records the number of buffered aio requests
 +		in the ith segment.*/
 +		ulint&	count = array->count[io_ctx_index];
 +		ut_ad(count != slots_per_segment);
 +		ulint	n = io_ctx_index * slots_per_segment + count;
 +		array->pending[n] = iocb;
 +		++count;
 +		if (count == slots_per_segment) {	/* segment area is full */
 +			os_aio_dispatch_read_array_submit_low(false);
 +		}
 +		os_mutex_exit(array->mutex);
 +		return(TRUE);
 +	}
 +	/* Submit the given request. */
 +	ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
 +
 +#if defined(UNIV_AIO_DEBUG)
 +	fprintf(stderr,
 +		"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
 +		(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
 +		array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
 +#endif
 +
 +	/* io_submit returns number of successfully
 +	queued requests or -errno. */
 +	if (UNIV_UNLIKELY(ret != 1)) {
 +		errno = -ret;
 +		return(FALSE);
 +	}
 +
 +	return(TRUE);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +
 +/*******************************************************************//**
 +NOTE! Use the corresponding macro os_aio(), not directly this function!
 +Requests an asynchronous i/o operation; mode OS_AIO_SYNC is done synchronously here.
 +@return	TRUE if request was queued successfully, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_aio_func(
 +/*========*/
 +	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
 +	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
 +				to OS_AIO_SIMULATED_WAKE_LATER: the
 +				last flag advises this function not to wake
 +				i/o-handler threads, but the caller will
 +				do the waking explicitly later, in this
 +				way the caller can post several requests in
 +				a batch; NOTE that the batch must not be
 +				so big that it exhausts the slots in aio
 +				arrays! NOTE that a simulated batch
 +				may introduce hidden chances of deadlocks,
 +				because i/os are not actually handled until
 +				all have been posted: use with great
 +				caution! */
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	pfs_os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read or from which
 +				to write */
 +	os_offset_t	offset,	/*!< in: file offset where to read or write */
 +	ulint		n,	/*!< in: number of bytes to read or write */
 +	fil_node_t*	message1,/*!< in: message for the aio handler
 +				(can be used to identify a completed
 +				aio operation); ignored if mode is
 +				OS_AIO_SYNC */
 +	void*		message2,/*!< in: message for the aio handler
 +				(can be used to identify a completed
 +				aio operation); ignored if mode is
 +				OS_AIO_SYNC */
 +	ulint		space_id,	/*!< in: stored in the reserved aio slot */
 +	trx_t*		trx,	/*!< in: if not NULL, its read counters are updated */
 +	bool		should_buffer)
 +				/*!< in: Whether to buffer an aio request.
 +				AIO read ahead uses this. If you plan to
 +				use this parameter, make sure you remember
 +				to call os_aio_dispatch_read_array_submit()
 +				when you're ready to commit all your requests.*/
 +{
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +#ifdef WIN_ASYNC_IO
 +	DWORD		len		= (DWORD) n;
 +	BOOL	ret;
 +#endif
 +	ulint		wake_later;
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +	ut_ad(n % OS_MIN_LOG_BLOCK_SIZE == 0);
 +	ut_ad(offset % OS_MIN_LOG_BLOCK_SIZE == 0);
 +	ut_ad(os_aio_validate_skip());
 +#ifdef WIN_ASYNC_IO
 +	ut_ad((n & 0xFFFFFFFFUL) == n);
 +#endif
 +	/* Split the wake-later flag off from the mode proper. */
 +	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
 +	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
 +
 +	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +		mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
 +
 +	if (mode == OS_AIO_SYNC) {
 +		ibool ret;
 +		/* This is actually an ordinary synchronous read or write:
 +		no need to use an i/o-handler thread */
 +
 +		if (type == OS_FILE_READ) {
 +			ret = os_file_read_func(file.m_file, buf, offset, n, trx);
 +		} else {
 +			ut_ad(!srv_read_only_mode);
 +			ut_a(type == OS_FILE_WRITE);
 +
 +			ret = os_file_write(name, file, buf, offset, n);
 +
 +			DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +				os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
 +
 +			if (!ret) {
 +				os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE);
 +			}
 +		}
 +		/* NOTE(review): the bare "FAIL" on stderr looks like leftover debug output. */
 +		if (!ret) {
 +			fprintf(stderr, "FAIL");
 +		}
 +
 +		return ret;
 +	}
 +
 +try_again:
 +	switch (mode) {
 +	case OS_AIO_NORMAL:
 +		if (type == OS_FILE_READ) {
 +			array = os_aio_read_array;
 +		} else {
 +			ut_ad(!srv_read_only_mode);
 +			array = os_aio_write_array;
 +		}
 +		break;
 +	case OS_AIO_IBUF:
 +		ut_ad(type == OS_FILE_READ);
 +		/* Reduce probability of deadlock bugs in connection with ibuf:
 +		do not let the ibuf i/o handler sleep */
 +
 +		wake_later = FALSE;
 +
 +		if (srv_read_only_mode) {
 +			array = os_aio_read_array;
 +		} else {
 +			array = os_aio_ibuf_array;
 +		}
 +		break;
 +	case OS_AIO_LOG:
 +		if (srv_read_only_mode) {
 +			array = os_aio_read_array;
 +		} else {
 +			array = os_aio_log_array;
 +		}
 +		break;
 +	case OS_AIO_SYNC:
 +		array = os_aio_sync_array;
 +#if defined(LINUX_NATIVE_AIO)
 +		/* In Linux native AIO we don't use sync IO array. */
 +		ut_a(!srv_use_native_aio);
 +#endif /* LINUX_NATIVE_AIO */
 +		break;
 +	default:
 +		ut_error;
 +		array = NULL; /* Eliminate compiler warning */
 +	}
 +
 +	if (trx && type == OS_FILE_READ)
 +	{
 +		trx->io_reads++;
 +		trx->io_read += n;
 +	}
 +	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
 +					 name, buf, offset, n, space_id);
 +	if (type == OS_FILE_READ) {
 +		if (srv_use_native_aio) {
 +			os_n_file_reads++;
 +			os_bytes_read_since_printout += n;
 +#ifdef WIN_ASYNC_IO
 +			ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
 +				       &(slot->control));
 +			if(!ret && GetLastError() != ERROR_IO_PENDING)
 +				goto err_exit;
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +			if (!os_aio_linux_dispatch(array, slot,
 +						   should_buffer)) {
 +				goto err_exit;
 +			}
 +#endif /* WIN_ASYNC_IO */
 +		} else {
 +			if (!wake_later) {
 +				os_aio_simulated_wake_handler_thread(
 +					os_aio_get_segment_no_from_slot(
 +						array, slot));
 +			}
 +		}
 +	} else if (type == OS_FILE_WRITE) {
 +		ut_ad(!srv_read_only_mode);
 +		if (srv_use_native_aio) {
 +			os_n_file_writes++;
 +#ifdef WIN_ASYNC_IO
 +			ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
 +					&(slot->control));
 +
 +			if(!ret && GetLastError() != ERROR_IO_PENDING)
 +				goto err_exit;
 +#elif defined(LINUX_NATIVE_AIO)
 +			if (!os_aio_linux_dispatch(array, slot, false)) {
 +				goto err_exit;
 +			}
 +#endif /* WIN_ASYNC_IO */
 +		} else {
 +			if (!wake_later) {
 +				os_aio_simulated_wake_handler_thread(
 +					os_aio_get_segment_no_from_slot(
 +						array, slot));
 +			}
 +		}
 +	} else {
 +		ut_error;
 +	}
 +
 +	/* aio was queued successfully! */
 +	return(TRUE);
 +
 +#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
 +err_exit:
 +#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
 +	os_aio_array_free_slot(array, slot);
 +	/* Retry from the array selection if the error handler asks for it. */
 +	if (os_file_handle_error(
 +		name,type == OS_FILE_READ ? "aio read" : "aio write")) {
 +
 +		goto try_again;
 +	}
 +
 +	return(FALSE);
 +}
 +
 +#ifdef WIN_ASYNC_IO
 +#define READ_SEGMENT(x) (x < srv_n_read_io_threads)
 +#define WRITE_SEGMENT(x) !READ_SEGMENT(x)
 +
 +/**********************************************************************//**
 +This function is only used in Windows asynchronous i/o.
 +Waits for an aio operation to complete. This function is used to wait for
 +the completed requests. The aio array of pending requests is divided
 +into segments. The thread specifies which segment or slot it wants to wait
 +for. NOTE: this function will also take care of freeing the aio slot,
 +therefore no other thread is allowed to do the freeing!
 +The calling thread is terminated from inside this function (via
 +os_thread_exit()) when the shutdown key is dequeued or when
 +srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS.
 +@return	TRUE if the aio operation succeeded */
 +UNIV_INTERN
 +ibool
 +os_aio_windows_handle(
 +/*==================*/
 +	ulint	segment,	/*!< in: the number of the segment in the aio
 +				arrays to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 the log i/o thread,
 +				then follow the non-ibuf read threads, and as
 +				the last are the non-ibuf write threads; if
 +				this is ULINT_UNDEFINED, then it means that
 +				sync aio is used, and this parameter is
 +				ignored */
 +	ulint	pos,		/*!< this parameter is used only in sync aio:
 +				wait for the aio slot at this position.
 +				NOTE(review): not referenced in the body
 +				below */
 +	fil_node_t**message1,	/*!< out: the messages passed with the aio
 +				request; note that also in the case where
 +				the aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation, for example */
 +	void**	message2,	/*!< out: second message stored in the slot */
 +	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
 +	ulint*	space_id)	/*!< out: space id stored in the slot */
 +{
 +	/* NOTE(review): orig_seg is assigned but not used again in this
 +	function. */
 +	ulint		orig_seg	= segment;
 +	os_aio_slot_t*	slot;
 +	ibool		ret_val;
 +	BOOL		ret;
 +	DWORD		len;
 +	BOOL		retry		= FALSE;
 +	ULONG_PTR key;
 +	/* Read segments wait on the dedicated read port, every other
 +	segment waits on the common completion port. */
 +	HANDLE port = READ_SEGMENT(segment)? read_completion_port : completion_port;
 +
 +	for(;;) {
 +		/* Block without timeout until a completion packet arrives;
 +		the OVERLAPPED pointer of the packet is stored into slot. */
 +		ret = GetQueuedCompletionStatus(port, &len, &key, 
 +			(OVERLAPPED **)&slot, INFINITE);
 +
 +		/* If shutdown key was received, repost the shutdown message and exit */
 +		if (ret && (key == IOCP_SHUTDOWN_KEY)) {
 +			PostQueuedCompletionStatus(port, 0, key, NULL);
 +			os_thread_exit(NULL);
 +		}
 +
 +		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
 +			os_thread_exit(NULL);
 +		}
 +
 +		/* NOTE(review): slot is dereferenced below without a NULL
 +		check; if GetQueuedCompletionStatus() can fail here without
 +		dequeuing a packet outside of shutdown, slot would not be
 +		valid - confirm. */
 +		if(WRITE_SEGMENT(segment)&& slot->type == OS_FILE_READ) {
 +			/*
 +			Redirect read completions to the dedicated completion port
 +			and thread. We need to split read and write threads. If we do not
 +			do that, and just allow all io threads process all IO, it is possible
 +			to get stuck in a deadlock in buffer pool code.
 +
 +			Currently, the problem is solved this way - "write io" threads
 +			always get all completion notifications, from both async reads and
 +			writes. Write completion is handled in the same thread that gets it.
 +			Read completion is forwarded (via PostQueuedCompletionStatus())
 +			to the second completion port dedicated solely to reads. One of the
 +			"read io" threads waiting on this port will finally handle the IO.
 +
 +			Forwarding IO completion this way costs a context switch, and this
 +			seems tolerable since asynchronous reads are by far less frequent.
 +			*/
 +			ut_a(PostQueuedCompletionStatus(read_completion_port, len, key,
 +				&slot->control));
 +		}
 +		else {
 +			break;
 +		}
 +	}
 +	/* Hand the request description back to the caller; this is done
 +	before the outcome is evaluated, so the output parameters are set
 +	on failure as well. */
 +	*message1 = slot->message1;
 +	*message2 = slot->message2;
 +
 +	*type = slot->type;
 +	*space_id = slot->space_id;
 +
 +	/* Success requires both a successful dequeue and a transfer of
 +	exactly the requested number of bytes. */
 +	if (ret && len == slot->len) {
 +
 +		ret_val = TRUE;
 +	} else if (os_file_handle_error(slot->name, "Windows aio")) {
 +
 +		retry = TRUE;
 +	} else {
 +
 +		ret_val = FALSE;
 +	}
 +
 +	if (retry) {
 +		/* Repeat the request with the synchronous i/o functions,
 +		rebuilding the 64-bit file offset from the two halves kept
 +		in the OVERLAPPED structure of the slot. */
 +		LARGE_INTEGER li;
 +		li.LowPart = slot->control.Offset;
 +		li.HighPart = slot->control.OffsetHigh;
 +
 +		/* The length must fit in 32 bits. */
 +		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
 +
 +		switch (slot->type) {
 +		case OS_FILE_WRITE:
 +			ret_val = os_file_write(slot->name, slot->file, slot->buf,
 +				li.QuadPart, slot->len);
 +			break;
 +		case OS_FILE_READ:
 +			ret_val = os_file_read(slot->file, slot->buf,
 +				 li.QuadPart, slot->len);
 +			break;
 +		default:
 +			ut_error;
 +		}
 +
 +	}
 +
 +	/* The slot is released here, by the handling thread. */
 +	os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot);
 +
 +	return(ret_val);
 +}
 +#endif
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/******************************************************************//**
 +Reaps completed requests for one segment when Linux native asynchronous
 +i/o is in use. An io-thread calls this when it has found no finished
 +request in its part of the slot array. The call blocks in io_getevents()
 +with a timeout (tv_nsec = OS_AIO_REAP_TIMEOUT) and is repeated until at
 +least one event has been reaped or until srv_shutdown_state is
 +SRV_SHUTDOWN_EXIT_THREADS, which is checked after every unsuccessful
 +wakeup. For each reaped event the result values are stored in the owning
 +slot and the slot is flagged as done; errors of individual requests are
 +left for the calling function to handle. */
 +static
 +void
 +os_aio_linux_collect(
 +/*=================*/
 +	os_aio_array_t* array,		/*!< in/out: slot array. */
 +	ulint		segment,	/*!< in: local segment no. */
 +	ulint		seg_size)	/*!< in: segment size. */
 +{
 +	/* sanity checks. */
 +	ut_ad(array != NULL);
 +	ut_ad(seg_size > 0);
 +	ut_ad(segment < array->n_segments);
 +
 +	/* Slot positions [start_pos, end_pos) belong to this segment. */
 +	const ulint	start_pos = segment * seg_size;
 +	const ulint	end_pos = start_pos + seg_size;
 +
 +	/* The part of the event array and the io_context that are
 +	reserved for this segment. */
 +	struct io_event* const		seg_events
 +		= &array->aio_events[segment * seg_size];
 +	struct io_context* const	ctx = array->aio_ctx[segment];
 +
 +	int	n_reaped;
 +
 +	for (;;) {
 +		struct timespec	wait_limit;
 +
 +		/* Start every attempt from a clean event buffer. The
 +		timeout value is arbitrary. */
 +		memset(seg_events, 0, sizeof(*seg_events) * seg_size);
 +		wait_limit.tv_sec = 0;
 +		wait_limit.tv_nsec = OS_AIO_REAP_TIMEOUT;
 +
 +		n_reaped = io_getevents(ctx, 1, seg_size, seg_events,
 +					&wait_limit);
 +
 +		if (n_reaped > 0) {
 +			/* Something was completed: go and record it. */
 +			break;
 +		}
 +
 +		if (UNIV_UNLIKELY(srv_shutdown_state
 +				  == SRV_SHUTDOWN_EXIT_THREADS)) {
 +			return;
 +		}
 +
 +		/* Only three outcomes lead to another attempt: nothing
 +		completed within the timeout (0), lack of resources
 +		(-EAGAIN) and an interrupted wait (-EINTR). Anything else
 +		is a failure of the collection itself and causes a trap. */
 +		if (n_reaped != 0
 +		    && n_reaped != -EAGAIN
 +		    && n_reaped != -EINTR) {
 +
 +			ut_print_timestamp(stderr);
 +			fprintf(stderr,
 +				" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
 +				n_reaped);
 +			ut_error;
 +			return;
 +		}
 +	}
 +
 +	for (int k = 0; k < n_reaped; ++k) {
 +		struct iocb*	control = (struct iocb*) seg_events[k].obj;
 +		ut_a(control != NULL);
 +
 +		os_aio_slot_t*	slot = (os_aio_slot_t*) control->data;
 +
 +		/* Some sanity checks. */
 +		ut_a(slot != NULL);
 +		ut_a(slot->reserved);
 +
 +#if defined(UNIV_AIO_DEBUG)
 +		fprintf(stderr,
 +			"io_getevents[%c]: slot[%p] ctx[%p]"
 +			" seg[%lu]\n",
 +			(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
 +			slot, ctx, segment);
 +#endif
 +
 +		/* The slot must not lie in a preceding segment ... */
 +		ut_a(slot->pos >= start_pos);
 +
 +		/* ... nor in a following one. */
 +		ut_a(slot->pos < end_pos);
 +
 +		/* Record the outcome and flag the request as completed,
 +		under the array mutex. Interpreting the outcome is the
 +		job of the calling function. */
 +		os_mutex_enter(array->mutex);
 +		slot->n_bytes = seg_events[k].res;
 +		slot->ret = seg_events[k].res2;
 +		slot->io_already_done = TRUE;
 +		os_mutex_exit(array->mutex);
 +	}
 +}
 +
 +/**********************************************************************//**
 +This function is only used in Linux native asynchronous i/o.
 +Waits for an aio operation to complete. This function is used to wait for
 +the completed requests. The aio array of pending requests is divided
 +into segments. The thread specifies which segment or slot it wants to wait
 +for. NOTE: this function will also take care of freeing the aio slot,
 +therefore no other thread is allowed to do the freeing!
 +A request that completed only partially is resubmitted for the remaining
 +bytes and the function goes back to waiting.
 +@return	TRUE if the IO was successful; TRUE is also returned, with
 +*message1 and *message2 set to NULL, when no request is pending and
 +srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS */
 +UNIV_INTERN
 +ibool
 +os_aio_linux_handle(
 +/*================*/
 +	ulint	global_seg,	/*!< in: segment number in the aio array
 +				to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 is log i/o thread,
 +				then follow the non-ibuf read threads,
 +				and the last are the non-ibuf write
 +				threads. */
 +	fil_node_t**message1,	/*!< out: the messages passed with the */
 +	void**	message2,	/*!< aio request; note that in case the
 +				aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation. */
 +	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
 +	ulint*	space_id)	/*!< out: space id stored in the slot */
 +{
 +	ulint		segment;
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +	ulint		n;
 +	ulint		i;
 +	ibool		ret = FALSE;
 +
 +	/* Should never be doing Sync IO here. */
 +	ut_a(global_seg != ULINT_UNDEFINED);
 +
 +	/* Find the array and the local segment. */
 +	segment = os_aio_get_array_and_local_segment(&array, global_seg);
 +	/* n is the number of slots in one segment. */
 +	n = array->n_slots / array->n_segments;
 +
 + wait_for_event:
 +	/* Loop until we have found a completed request. */
 +	for (;;) {
 +		ibool	any_reserved = FALSE;
 +		/* Scan the slots of this segment under the array mutex.
 +		On "goto found" the mutex is still held. */
 +		os_mutex_enter(array->mutex);
 +		for (i = 0; i < n; ++i) {
 +			slot = os_aio_array_get_nth_slot(
 +				array, i + segment * n);
 +			if (!slot->reserved) {
 +				continue;
 +			} else if (slot->io_already_done) {
 +				/* Something for us to work on. */
 +				goto found;
 +			} else {
 +				any_reserved = TRUE;
 +			}
 +		}
 +
 +		os_mutex_exit(array->mutex);
 +
 +		/* There is no completed request.
 +		If there is no pending request at all,
 +		and the system is being shut down, exit. */
 +		if (UNIV_UNLIKELY
 +		    (!any_reserved
 +		     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
 +			*message1 = NULL;
 +			*message2 = NULL;
 +			return(TRUE);
 +		}
 +
 +		/* Wait for some request. Note that we return
 +		from wait iff we have found a request. */
 +
 +		srv_set_io_thread_op_info(global_seg,
 +			"waiting for completed aio requests");
 +		os_aio_linux_collect(array, segment, n);
 +	}
 +
 +found:
 +	/* Note that it may be that there are more than one completed
 +	IO requests. We process them one at a time. We may have a case
 +	here to improve the performance slightly by dealing with all
 +	requests in one sweep.
 +	The array mutex acquired in the scan loop above is still held
 +	here. */
 +	srv_set_io_thread_op_info(global_seg,
 +				"processing completed aio requests");
 +
 +	/* Ensure that we are scribbling only our segment. */
 +	ut_a(i < n);
 +
 +	ut_ad(slot != NULL);
 +	ut_ad(slot->reserved);
 +	ut_ad(slot->io_already_done);
 +
 +	*message1 = slot->message1;
 +	*message2 = slot->message2;
 +
 +	*type = slot->type;
 +	*space_id = slot->space_id;
 +
 +	/* Full success: no error code and every requested byte was
 +	transferred. */
 +	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
 +
 +		ret = TRUE;
 +	} else if ((slot->ret == 0) && (slot->n_bytes > 0)
 +		   && (slot->n_bytes < (long) slot->len)) {
 +		/* Partial read or write scenario */
 +		int submit_ret;
 +		struct iocb*    iocb;
 +		/* Advance the buffer and the file offset past the bytes
 +		already transferred and shrink the length accordingly, so
 +		that the slot describes only the remaining part. */
 +		slot->buf = (byte*)slot->buf + slot->n_bytes;
 +		slot->offset = slot->offset + slot->n_bytes;
 +		slot->len = slot->len - slot->n_bytes;
 +		/* Resetting the bytes read/written */
 +		slot->n_bytes = 0;
 +		slot->io_already_done = FALSE;
 +		iocb = &(slot->control);
 +
 +		if (slot->type == OS_FILE_READ) {
 +			io_prep_pread(&slot->control, slot->file.m_file,
 +				      slot->buf, slot->len,
 +				      (off_t) slot->offset);
 +		} else {
 +			ut_a(slot->type == OS_FILE_WRITE);
 +			io_prep_pwrite(&slot->control, slot->file.m_file,
 +				       slot->buf, slot->len,
 +				       (off_t) slot->offset);
 +		}
 +		/* Resubmit an I/O request */
 +		submit_ret = io_submit(array->aio_ctx[segment], 1, &iocb);
 +		if (submit_ret < 0 ) {
 +			/* Aborting in case of submit failure */
 +			ib_logf(IB_LOG_LEVEL_FATAL,
 +				"Native Linux AIO interface. io_submit()"
 +				" call failed when resubmitting a partial"
 +				" I/O request on the file %s.",
 +				slot->name);
 +		} else {
 +			/* The remainder is in flight: keep the slot
 +			reserved, drop the mutex and wait again. */
 +			ret = FALSE;
 +			os_mutex_exit(array->mutex);
 +			goto wait_for_event;
 +		}
 +	} else {
 +		/* slot->ret holds the value that os_aio_linux_collect()
 +		took from the res2 field of the event; it is negated
 +		into errno for os_file_handle_error(). */
 +		errno = -slot->ret;
 +
 +		/* os_file_handle_error does tell us if we should retry
 +		this IO. As it stands now, we don't do this retry when
 +		reaping requests from a different context than
 +		the dispatcher. This non-retry logic is the same for
 +		windows and linux native AIO.
 +		We should probably look into this to transparently
 +		re-submit the IO. */
 +		os_file_handle_error(slot->name, "Linux aio");
 +
 +		ret = FALSE;
 +	}
 +
 +	os_mutex_exit(array->mutex);
 +
 +	/* The slot is released only after the array mutex has been
 +	dropped. */
 +	os_aio_array_free_slot(array, slot);
 +
 +	return(ret);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +/**********************************************************************//**
 +Does simulated aio. This function should be called by an i/o-handler
 +thread. It first returns any slot of the segment whose i/o has already
 +been done; otherwise it selects a pending request (the oldest one if some
 +request is at least 2 seconds old, else the one at the lowest offset),
 +gathers up to OS_AIO_MERGE_N_CONSECUTIVE requests that are adjacent in the
 +same file, performs them as one synchronous i/o and returns the first of
 +them. If nothing is pending it waits on the wait event of the segment and
 +starts over.
 +@return	TRUE if the aio operation succeeded; TRUE is also returned, with
 +*message1 and *message2 set to NULL, when no request is reserved and
 +srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS */
 +UNIV_INTERN
 +ibool
 +os_aio_simulated_handle(
 +/*====================*/
 +	ulint	global_segment,	/*!< in: the number of the segment in the aio
 +				arrays to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 the log i/o thread,
 +				then follow the non-ibuf read threads, and as
 +				the last are the non-ibuf write threads */
 +	fil_node_t**message1,	/*!< out: the messages passed with the aio
 +				request; note that also in the case where
 +				the aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation, for example */
 +	void**	message2,	/*!< out: second message stored in the slot */
 +	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
 +	ulint*	space_id)	/*!< out: space id stored in the slot */
 +{
 +	os_aio_array_t*	array;
 +	ulint		segment;
 +	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
 +	ulint		n_consecutive;
 +	ulint		total_len;
 +	ulint		offs;
 +	os_offset_t	lowest_offset;
 +	ulint		biggest_age;
 +	ulint		age;
 +	byte*		combined_buf;
 +	byte*		combined_buf2;
 +	ibool		ret;
 +	ibool		any_reserved;
 +	ulint		n;
 +	os_aio_slot_t*	aio_slot;
 +
 +	/* Fix compiler warning */
 +	*consecutive_ios = NULL;
 +
 +	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 +
 +restart:
 +	/* NOTE! We only access constant fields in os_aio_array. Therefore
 +	we do not have to acquire the protecting mutex yet */
 +
 +	srv_set_io_thread_op_info(global_segment,
 +				  "looking for i/o requests (a)");
 +	ut_ad(os_aio_validate_skip());
 +	ut_ad(segment < array->n_segments);
 +
 +	/* n is the number of slots in one segment. */
 +	n = array->n_slots / array->n_segments;
 +
 +	/* Look through n slots after the segment * n'th slot */
 +
 +	if (array == os_aio_read_array
 +	    && os_aio_recommend_sleep_for_read_threads) {
 +
 +		/* Give other threads chance to add several i/os to the array
 +		at once. */
 +
 +		goto recommended_sleep;
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment,
 +				  "looking for i/o requests (b)");
 +
 +	/* Check if there is a slot for which the i/o has already been
 +	done */
 +	any_reserved = FALSE;
 +
 +	/* From here on the array mutex is held until one of the
 +	os_mutex_exit() calls below. */
 +	os_mutex_enter(array->mutex);
 +
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +		if (!slot->reserved) {
 +			continue;
 +		} else if (slot->io_already_done) {
 +
 +			if (os_aio_print_debug) {
 +				fprintf(stderr,
 +					"InnoDB: i/o for slot %lu"
 +					" already done, returning\n",
 +					(ulong) i);
 +			}
 +
 +			/* NOTE(review): ret is set to TRUE here
 +			regardless of the outcome of the merged i/o that
 +			marked this slot as done, so a failure of a merged
 +			batch is reported only through its first slot -
 +			confirm this is intended. */
 +			aio_slot = slot;
 +			ret = TRUE;
 +			goto slot_io_done;
 +		} else {
 +			any_reserved = TRUE;
 +		}
 +	}
 +
 +	/* There is no completed request.
 +	If there is no pending request at all,
 +	and the system is being shut down, exit. */
 +	if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
 +		os_mutex_exit(array->mutex);
 +		*message1 = NULL;
 +		*message2 = NULL;
 +		return(TRUE);
 +	}
 +
 +	n_consecutive = 0;
 +
 +	/* If there are at least 2 seconds old requests, then pick the oldest
 +	one to prevent starvation. If several requests have the same age,
 +	then pick the one at the lowest offset. */
 +
 +	biggest_age = 0;
 +	lowest_offset = IB_UINT64_MAX;
 +
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +		if (slot->reserved) {
 +
 +			/* Age of the request in whole seconds. */
 +			age = (ulint) difftime(
 +				ut_time(), slot->reservation_time);
 +
 +			if ((age >= 2 && age > biggest_age)
 +			    || (age >= 2 && age == biggest_age
 +				&& slot->offset < lowest_offset)) {
 +
 +				/* Found an i/o request */
 +				consecutive_ios[0] = slot;
 +
 +				n_consecutive = 1;
 +
 +				biggest_age = age;
 +				lowest_offset = slot->offset;
 +			}
 +		}
 +	}
 +
 +	if (n_consecutive == 0) {
 +		/* There were no old requests. Look for an i/o request at the
 +		lowest offset in the array (we ignore the high 32 bits of the
 +		offset in these heuristics).
 +		NOTE(review): the comparison below uses slot->offset as is
 +		against an os_offset_t; the remark about ignoring the high
 +		32 bits looks stale - confirm. */
 +
 +		lowest_offset = IB_UINT64_MAX;
 +
 +		for (ulint i = 0; i < n; i++) {
 +			os_aio_slot_t*	slot;
 +
 +			slot = os_aio_array_get_nth_slot(
 +				array, i + segment * n);
 +
 +			if (slot->reserved && slot->offset < lowest_offset) {
 +
 +				/* Found an i/o request */
 +				consecutive_ios[0] = slot;
 +
 +				n_consecutive = 1;
 +
 +				lowest_offset = slot->offset;
 +			}
 +		}
 +	}
 +
 +	if (n_consecutive == 0) {
 +
 +		/* No i/o requested at the moment */
 +
 +		goto wait_for_io;
 +	}
 +
 +	/* if n_consecutive != 0, then we have assigned
 +	something valid to consecutive_ios[0] */
 +	ut_ad(n_consecutive != 0);
 +	ut_ad(consecutive_ios[0] != NULL);
 +
 +	aio_slot = consecutive_ios[0];
 +
 +	/* Check if there are several consecutive blocks to read or write */
 +
 +consecutive_loop:
 +	/* aio_slot is the last request of the chain built so far; look for
 +	a request of the same type on the same file that starts exactly
 +	where aio_slot ends. After each hit the scan restarts from the
 +	first slot, until nothing more is found or the chain is full. */
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +		if (slot->reserved
 +		    && slot != aio_slot
 +		    && slot->offset == aio_slot->offset + aio_slot->len
 +		    && slot->type == aio_slot->type
 +		    && slot->file.m_file == aio_slot->file.m_file) {
 +
 +			/* Found a consecutive i/o request */
 +
 +			consecutive_ios[n_consecutive] = slot;
 +			n_consecutive++;
 +
 +			aio_slot = slot;
 +
 +			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
 +
 +				goto consecutive_loop;
 +			} else {
 +				break;
 +			}
 +		}
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
 +
 +	/* We have now collected n_consecutive i/o requests in the array;
 +	allocate a single buffer which can hold all data, and perform the
 +	i/o */
 +
 +	total_len = 0;
 +	/* From here on aio_slot is the first request of the chain again. */
 +	aio_slot = consecutive_ios[0];
 +
 +	for (ulint i = 0; i < n_consecutive; i++) {
 +		total_len += consecutive_ios[i]->len;
 +	}
 +
 +	if (n_consecutive == 1) {
 +		/* We can use the buffer of the i/o request */
 +		combined_buf = aio_slot->buf;
 +		combined_buf2 = NULL;
 +	} else {
 +		/* combined_buf2 is the raw allocation (freed below);
 +		combined_buf is the UNIV_PAGE_SIZE aligned address inside
 +		it that is used for the i/o. */
 +		combined_buf2 = static_cast<byte*>(
 +			ut_malloc(total_len + UNIV_PAGE_SIZE));
 +
 +		ut_a(combined_buf2);
 +
 +		combined_buf = static_cast<byte*>(
 +			ut_align(combined_buf2, UNIV_PAGE_SIZE));
 +	}
 +
 +	/* We release the array mutex for the time of the i/o: NOTE that
 +	this assumes that there is just one i/o-handler thread serving
 +	a single segment of slots! */
 +
 +	os_mutex_exit(array->mutex);
 +
 +	if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
 +		/* Copy the buffers to the combined buffer */
 +		offs = 0;
 +
 +		for (ulint i = 0; i < n_consecutive; i++) {
 +
 +			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
 +				  consecutive_ios[i]->len);
 +
 +			offs += consecutive_ios[i]->len;
 +		}
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "doing file i/o");
 +
 +	/* Do the i/o with ordinary, synchronous i/o functions: */
 +	if (aio_slot->type == OS_FILE_WRITE) {
 +		ut_ad(!srv_read_only_mode);
 +		ret = os_file_write(
 +			aio_slot->name, aio_slot->file, combined_buf,
 +			aio_slot->offset, total_len);
 +
 +		/* Debug-only fault injection: pretend that the write
 +		failed with errno 28. */
 +		DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +			os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
 +
 +		if (!ret) {
 +			os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE);
 +		}
 +
 +	} else {
 +		ret = os_file_read(
 +			aio_slot->file, combined_buf,
 +			aio_slot->offset, total_len);
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "file i/o done");
 +
 +	if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
 +		/* Copy the combined buffer to individual buffers */
 +		offs = 0;
 +
 +		for (ulint i = 0; i < n_consecutive; i++) {
 +
 +			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
 +				  consecutive_ios[i]->len);
 +			offs += consecutive_ios[i]->len;
 +		}
 +	}
 +
 +	if (combined_buf2) {
 +		ut_free(combined_buf2);
 +	}
 +
 +	os_mutex_enter(array->mutex);
 +
 +	/* Mark the i/os done in slots */
 +
 +	for (ulint i = 0; i < n_consecutive; i++) {
 +		consecutive_ios[i]->io_already_done = TRUE;
 +	}
 +
 +	/* We return the messages for the first slot now, and if there were
 +	several slots, the messages will be returned with subsequent calls
 +	of this function */
 +
 +slot_io_done:
 +	/* The array mutex is held on both paths that lead here. */
 +
 +	ut_a(aio_slot->reserved);
 +
 +	*message1 = aio_slot->message1;
 +	*message2 = aio_slot->message2;
 +
 +	*type = aio_slot->type;
 +	*space_id = aio_slot->space_id;
 +
 +	os_mutex_exit(array->mutex);
 +
 +	/* The slot is released only after the array mutex has been
 +	dropped. */
 +	os_aio_array_free_slot(array, aio_slot);
 +
 +	return(ret);
 +
 +wait_for_io:
 +	srv_set_io_thread_op_info(global_segment, "resetting wait event");
 +
 +	/* We wait here until there again can be i/os in the segment
 +	of this thread. The event is reset while the array mutex is still
 +	held. */
 +
 +	os_event_reset(os_aio_segment_wait_events[global_segment]);
 +
 +	os_mutex_exit(array->mutex);
 +
 +recommended_sleep:
 +	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
 +
 +	os_event_wait(os_aio_segment_wait_events[global_segment]);
 +
 +	goto restart;
 +}
 +
 +/**********************************************************************//**
 +Checks the internal consistency of one aio array while holding its mutex:
 +the array must have at least one slot and one segment, every reserved
 +slot must have a positive length, and the number of reserved slots found
 +must equal the n_reserved counter of the array. A violation trips an
 +assertion.
 +@return	true if ok */
 +static
 +bool
 +os_aio_array_validate(
 +/*==================*/
 +	os_aio_array_t*	array)	/*!< in: aio wait array */
 +{
 +	ulint	n_reserved = 0;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	ut_a(array->n_slots > 0);
 +	ut_a(array->n_segments > 0);
 +
 +	for (ulint pos = 0; pos < array->n_slots; ++pos) {
 +		const os_aio_slot_t*	slot
 +			= os_aio_array_get_nth_slot(array, pos);
 +
 +		if (!slot->reserved) {
 +			/* Free slots carry nothing to check. */
 +			continue;
 +		}
 +
 +		++n_reserved;
 +		ut_a(slot->len > 0);
 +	}
 +
 +	/* The counter kept in the array must agree with the scan. */
 +	ut_a(array->n_reserved == n_reserved);
 +
 +	os_mutex_exit(array->mutex);
 +
 +	return(true);
 +}
 +
 +/**********************************************************************//**
 +Validates the consistency of the aio system by validating each aio array
 +in turn. The read array is always validated; the write, ibuf, log and sync
 +arrays are validated only if they have been allocated.
 +@return	TRUE if ok */
 +UNIV_INTERN
 +ibool
 +os_aio_validate(void)
 +/*=================*/
 +{
 +	/* Arrays that may be absent, in the order they are checked. */
 +	os_aio_array_t** const	optional_arrays[] = {
 +		&os_aio_write_array,
 +		&os_aio_ibuf_array,
 +		&os_aio_log_array,
 +		&os_aio_sync_array
 +	};
 +	const ulint	n_optional
 +		= sizeof(optional_arrays) / sizeof(optional_arrays[0]);
 +
 +	os_aio_array_validate(os_aio_read_array);
 +
 +	for (ulint k = 0; k < n_optional; ++k) {
 +		os_aio_array_t*	candidate = *optional_arrays[k];
 +
 +		if (candidate != 0) {
 +			os_aio_array_validate(candidate);
 +		}
 +	}
 +
 +	return(TRUE);
 +}
 +
 +/**********************************************************************//**
 +Prints pending IO requests per segment of an aio array, as a bracketed,
 +comma separated list: " [a, b, ...] ". Nothing is printed for an array
 +that has only one segment.
 +We probably don't need per segment statistics but they can help us
 +during development phase to see if the IO requests are being
 +distributed as expected. */
 +static
 +void
 +os_aio_print_segment_info(
 +/*======================*/
 +	FILE*		file,	/*!< in: file where to print */
 +	ulint*		n_seg,	/*!< in: pending IO array; must hold at
 +				least array->n_segments elements */
 +	os_aio_array_t*	array)	/*!< in: array to process */
 +{
 +	ut_ad(array);
 +	ut_ad(n_seg);
 +	ut_ad(array->n_segments > 0);
 +
 +	if (array->n_segments == 1) {
 +		/* A per-segment breakdown of a single segment would only
 +		repeat the total. */
 +		return;
 +	}
 +
 +	fprintf(file, " [");
 +	for (ulint i = 0; i < array->n_segments; i++) {
 +		if (i != 0) {
 +			fprintf(file, ", ");
 +		}
 +
 +		/* n_seg[] holds ulint values: print them with the ulint
 +		format ULINTPF rather than "%lu", which does not match
 +		the argument where ulint is wider than unsigned long. */
 +		fprintf(file, ULINTPF, n_seg[i]);
 +	}
 +	fprintf(file, "] ");
 +}
 +
 +/**********************************************************************//**
 +Prints the number of reserved slots of an aio array, followed by the
 +per-segment breakdown produced by os_aio_print_segment_info(). The whole
 +operation runs under the array mutex and also asserts that the counted
 +number of reserved slots agrees with array->n_reserved. */
 +UNIV_INTERN
 +void
 +os_aio_print_array(
 +/*==============*/
 +	FILE*		file,	/*!< in: file where to print */
 +	os_aio_array_t*	array)	/*!< in: aio array to print */
 +{
 +	/* Reserved slots: per segment and in total. */
 +	ulint	per_segment[SRV_MAX_N_IO_THREADS] = {0};
 +	ulint	n_reserved = 0;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	ut_a(array->n_slots > 0);
 +	ut_a(array->n_segments > 0);
 +
 +	for (ulint pos = 0; pos < array->n_slots; pos++) {
 +		const os_aio_slot_t*	slot
 +			= os_aio_array_get_nth_slot(array, pos);
 +
 +		if (!slot->reserved) {
 +			continue;
 +		}
 +
 +		/* Map the slot position to the segment it belongs to. */
 +		const ulint	owner
 +			= (pos * array->n_segments) / array->n_slots;
 +
 +		n_reserved++;
 +		per_segment[owner]++;
 +
 +		ut_a(slot->len > 0);
 +	}
 +
 +	ut_a(array->n_reserved == n_reserved);
 +
 +	fprintf(file, " %lu", (ulong) n_reserved);
 +
 +	os_aio_print_segment_info(file, per_segment, array);
 +
 +	os_mutex_exit(array->mutex);
 +}
 +
 +/**********************************************************************//**
 +Prints info of the aio arrays: one status line per i/o thread, the number
 +of pending requests in every allocated aio array, the global i/o counters
 +and the per-second rates since the previous printout. The counters used
 +for the rates are then restarted. */
 +UNIV_INTERN
 +void
 +os_aio_print(
 +/*=========*/
 +	FILE*	file)	/*!< in: file where to print */
 +{
 +	/* Arrays that are reported only if they have been allocated,
 +	each with the text that introduces it. */
 +	const struct {
 +		const char*	label;
 +		os_aio_array_t*	array;
 +	} optional_arrays[] = {
 +		{", aio writes:", os_aio_write_array},
 +		{",\n ibuf aio reads:", os_aio_ibuf_array},
 +		{", log i/o's:", os_aio_log_array},
 +		{", sync i/o's:", os_aio_sync_array}
 +	};
 +	const ulint	n_optional
 +		= sizeof(optional_arrays) / sizeof(optional_arrays[0]);
 +
 +	/* One line per i/o thread. */
 +	for (ulint thr = 0; thr < srv_n_file_io_threads; thr++) {
 +		fprintf(file, "I/O thread %lu state: %s (%s)",
 +			(ulong) thr,
 +			srv_io_thread_op_info[thr],
 +			srv_io_thread_function[thr]);
 +
 +#ifndef _WIN32
 +		if (!srv_use_native_aio
 +		    && os_aio_segment_wait_events[thr]->is_set()) {
 +			fputs(" ev set", file);
 +		}
 +#endif /* _WIN32 */
 +
 +		putc('\n', file);
 +	}
 +
 +	/* Pending requests per array. */
 +	fputs("Pending normal aio reads:", file);
 +	os_aio_print_array(file, os_aio_read_array);
 +
 +	for (ulint k = 0; k < n_optional; k++) {
 +		if (optional_arrays[k].array != 0) {
 +			fputs(optional_arrays[k].label, file);
 +			os_aio_print_array(file, optional_arrays[k].array);
 +		}
 +	}
 +
 +	putc('\n', file);
 +
 +	/* 0.001 is added so that the divisions below never see a zero
 +	interval. */
 +	const time_t	now = ut_time();
 +	const double	elapsed = 0.001 + difftime(now, os_last_printout);
 +
 +	fprintf(file,
 +		"Pending flushes (fsync) log: " ULINTPF
 +		"; buffer pool: " ULINTPF "\n"
 +		ULINTPF " OS file reads, "
 +		ULINTPF " OS file writes, "
 +		ULINTPF " OS fsyncs\n",
 +		fil_n_pending_log_flushes,
 +		fil_n_pending_tablespace_flushes,
 +		os_n_file_reads,
 +		os_n_file_writes,
 +		os_n_fsyncs);
 +
 +	const ulint	n_reads
 +		= ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
 +	const ulint	n_writes
 +		= ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
 +
 +	if (n_reads != 0 || n_writes != 0) {
 +		fprintf(file,
 +			ULINTPF " pending reads, " ULINTPF " pending writes\n",
 +			n_reads, n_writes);
 +	}
 +
 +	/* Activity since the previous printout. */
 +	const ulint	new_reads = os_n_file_reads - os_n_file_reads_old;
 +	const ulint	new_writes = os_n_file_writes - os_n_file_writes_old;
 +	const ulint	new_fsyncs = os_n_fsyncs - os_n_fsyncs_old;
 +
 +	const double	avg_bytes_read = (new_reads == 0)
 +		? 0.0
 +		: (double) os_bytes_read_since_printout / new_reads;
 +
 +	fprintf(file,
 +		"%.2f reads/s, %lu avg bytes/read,"
 +		" %.2f writes/s, %.2f fsyncs/s\n",
 +		new_reads / elapsed,
 +		(ulong) avg_bytes_read,
 +		new_writes / elapsed,
 +		new_fsyncs / elapsed);
 +
 +	/* Start a new measurement interval. */
 +	os_n_file_reads_old = os_n_file_reads;
 +	os_n_file_writes_old = os_n_file_writes;
 +	os_n_fsyncs_old = os_n_fsyncs;
 +	os_bytes_read_since_printout = 0;
 +
 +	os_last_printout = now;
 +}
 +
 +/**********************************************************************//**
 +Restarts the interval over which the per-second averages are computed:
 +the "old" counters are set to the current counter values, the byte count
 +read since the last printout is cleared and the time of the last printout
 +is set to the current time. */
 +UNIV_INTERN
 +void
 +os_aio_refresh_stats(void)
 +/*======================*/
 +{
 +	/* The new interval begins now ... */
 +	os_last_printout = time(NULL);
 +
 +	/* ... with nothing read yet ... */
 +	os_bytes_read_since_printout = 0;
 +
 +	/* ... and with the current counters as the baseline. */
 +	os_n_fsyncs_old = os_n_fsyncs;
 +	os_n_file_writes_old = os_n_file_writes;
 +	os_n_file_reads_old = os_n_file_reads;
 +}
 +
 +#ifdef UNIV_DEBUG
 +/**********************************************************************//**
 +Checks that all slots in the system have been freed, that is, there are
 +no pending io operations. The n_reserved counter of every allocated aio
 +array is read under the mutex of that array and the values are summed.
 +An array that has not been allocated (null pointer) is skipped, in the
 +same way as os_aio_validate() and os_aio_print() treat absent arrays;
 +previously this function asserted that the write, ibuf and log arrays
 +were null and then dereferenced them.
 +@return	TRUE if all free */
 +UNIV_INTERN
 +ibool
 +os_aio_all_slots_free(void)
 +/*=======================*/
 +{
 +	os_aio_array_t* const	arrays[] = {
 +		os_aio_read_array,
 +		os_aio_write_array,
 +		os_aio_ibuf_array,
 +		os_aio_log_array,
 +		os_aio_sync_array
 +	};
 +	ulint			n_res	= 0;
 +
 +	for (ulint i = 0; i < sizeof(arrays) / sizeof(arrays[0]); i++) {
 +		os_aio_array_t*	array = arrays[i];
 +
 +		if (array == 0) {
 +			/* Not allocated: it cannot hold any reserved
 +			slot. */
 +			continue;
 +		}
 +
 +		os_mutex_enter(array->mutex);
 +
 +		n_res += array->n_reserved;
 +
 +		os_mutex_exit(array->mutex);
 +	}
 +
 +	if (n_res == 0) {
 +
 +		return(TRUE);
 +	}
 +
 +	return(FALSE);
 +}
 +#endif /* UNIV_DEBUG */
 +
 +#endif /* !UNIV_HOTBACKUP */
diff --cc support-files/mysql.server.sh
index 9c4d8e35ec5,28f0c2f041b..34f3ca4af34
--- a/support-files/mysql.server.sh
+++ b/support-files/mysql.server.sh
@@@ -25,7 -25,14 +25,6 @@@
  # Description: MariaDB is a very fast and reliable SQL database engine.
  ### END INIT INFO
  
- # If you install MariaDB on some other places than @prefix@, then you
 -# Prevent OpenSUSE's init scripts from calling systemd, so that
 -# both 'bootstrap' and 'start' are handled entirely within this
 -# script
 -SYSTEMD_NO_WRAP=1
 -
 -# Prevent Debian's init scripts from calling systemctl
 -_SYSTEMCTL_SKIP_REDIRECT=true
 - 
  # have to do one of the following things for this script to work:
  #
  # - Run this script from within the MariaDB installation directory
@@@ -438,7 -452,7 +437,6 @@@ case "$mode" i
    *)
        # usage
        basename=`basename "$0"`
-       echo "Usage: $basename  {start|stop|restart|reload|force-reload|status|configtest}  [ MariaDB server options ]"
 -      echo "Usage: $basename  {start|stop|restart|reload|force-reload|status|configtest|bootstrap}  [ MySQL server options ]"
        exit 1
      ;;
  esac

    

[Commits] 648cf7176cc: Merge remote-tracking branch 'origin/5.5-galera' into 10.0-galera

jan